### Eclipse Workspace Patch 1.0 #P lucene_trunk Index: src/test/org/apache/lucene/index/values/TestIndexValues.java =================================================================== --- src/test/org/apache/lucene/index/values/TestIndexValues.java (revision 0) +++ src/test/org/apache/lucene/index/values/TestIndexValues.java (revision 0) @@ -0,0 +1,658 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.List; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.ValuesField; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LogDocMergePolicy; +import org.apache.lucene.index.LogMergePolicy; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.values.Reader.SortedSource; +import org.apache.lucene.index.values.Reader.Source; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.LongsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util._TestUtil; + +public class TestIndexValues extends LuceneTestCase { + + public void testBytesStraight() throws IOException { + runTestBytes(Bytes.Mode.STRAIGHT, true); + runTestBytes(Bytes.Mode.STRAIGHT, false); + } + + public void testBytesDeref() throws IOException { + runTestBytes(Bytes.Mode.DEREF, true); + runTestBytes(Bytes.Mode.DEREF, false); + } + + public void testBytesSorted() throws IOException { + runTestBytes(Bytes.Mode.SORTED, true); + runTestBytes(Bytes.Mode.SORTED, false); + } + + // nocommit -- for sorted test, do our own Sort of the + // values and verify it's identical + public void runTestBytes(final Bytes.Mode mode, + final boolean fixedSize) throws IOException { + + final BytesRef bytesRef = new BytesRef(); + + final Comparator comp = mode == Bytes.Mode.SORTED ? 
BytesRef + .getUTF8SortedAsUnicodeComparator() + : null; + + Directory dir = newDirectory(); + Writer w = Bytes + .getWriter(dir, "test", mode, comp, fixedSize); + int maxDoc = 220; + final String[] values = new String[maxDoc]; + final int lenMin, lenMax; + if (fixedSize) { + lenMin = lenMax = 3 + random.nextInt(7); + } else { + lenMin = 1; + lenMax = 15 + random.nextInt(6); + } + for (int i = 0; i < 100; i++) { + final String s; + if (i > 0 && random.nextInt(5) <= 2) { + // use prior value + s = values[2 * random.nextInt(i)]; + } else { + s = _TestUtil.randomUnicodeString(random, lenMin, lenMax); + } + values[2 * i] = s; + + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), bytesRef); + w.add(2 * i, bytesRef); + } + w.finish(maxDoc); + + Reader r = Bytes.getReader(dir, "test", mode, fixedSize, maxDoc); + for (int iter = 0; iter < 2; iter++) { + ValuesEnum bytesEnum = r.getEnum(); + assertNotNull("enum is null", bytesEnum); + ValuesAttribute attr = bytesEnum.addAttribute(ValuesAttribute.class); + assertNotNull("attribute is null", attr); + BytesRef ref = attr.bytes(); + assertNotNull("BytesRef is null - enum not initialized to use bytes", ref); + + for (int i = 0; i < 2; i++) { + final int idx = 2 * i; + assertEquals("doc: " + idx, idx, bytesEnum.advance(idx)); + String utf8String = ref.utf8ToString(); + assertEquals("doc: " + idx + " lenLeft: " + values[idx].length() + " lenRight: " + utf8String.length(), values[idx], utf8String); + } + assertEquals(ValuesEnum.NO_MORE_DOCS, bytesEnum.advance(maxDoc)); + assertEquals(ValuesEnum.NO_MORE_DOCS, bytesEnum.advance(maxDoc+1)); + + bytesEnum.close(); + } + + + // Verify we can load source twice: + for (int iter = 0; iter < 2; iter++) { + Source s; + Reader.SortedSource ss; + if (mode == Bytes.Mode.SORTED) { + s = ss = r.loadSorted(comp); + } else { + s = r.load(); + ss = null; + } + + for (int i = 0; i < 100; i++) { + final int idx = 2 * i; + assertNotNull("doc " + idx + "; value=" + values[idx], s.bytes(idx)); + assertEquals("doc " + idx, values[idx], s.bytes(idx).utf8ToString()); + if (ss != null) { + assertEquals("doc " + idx, values[idx], ss.getByOrd(ss.ord(idx)) + .utf8ToString()); + Reader.SortedSource.LookupResult result = ss.getByValue(new BytesRef( + values[idx])); + assertTrue(result.found); + assertEquals(ss.ord(idx), result.ord); + } + } + + // Lookup random strings: + if (mode == Bytes.Mode.SORTED) { + final int numValues = ss.getValueCount(); + for (int i = 0; i < 1000; i++) { + BytesRef bytesValue = new BytesRef(_TestUtil.randomUnicodeString( + random, lenMin, lenMax)); + SortedSource.LookupResult result = ss.getByValue(bytesValue); + if (result.found) { + assert result.ord > 0; + assertTrue(bytesValue.bytesEquals(ss.getByOrd(result.ord))); + int count = 0; + for (int k = 0; k < 100; k++) { + if (bytesValue.utf8ToString().equals(values[2 * k])) { + assertEquals(ss.ord(2 * k), result.ord); + count++; + } + } + assertTrue(count > 0); + } else { + assert result.ord >= 0; + if (result.ord == 0) { + final BytesRef firstRef = ss.getByOrd(1); + // random string was before our first + assertTrue(firstRef.compareTo(bytesValue) > 0); + } else if (result.ord == numValues) { + final BytesRef lastRef = ss.getByOrd(numValues); + // random string was after our last + assertTrue(lastRef.compareTo(bytesValue) < 0); + } else { + // random string fell between two of our values + final BytesRef before = (BytesRef) ss.getByOrd(result.ord) + .clone(); + final BytesRef after = ss.getByOrd(result.ord + 1); + assertTrue(before.compareTo(bytesValue) < 0); + 
assertTrue(bytesValue.compareTo(after) < 0); + + } + } + } + } + } + + r.close(); + dir.close(); + } + + public void testInts() throws IOException { + long maxV = 1; + final int NUM_VALUES = 1000; + final long[] values = new long[NUM_VALUES]; + for (int rx = 1; rx < 63; rx++, maxV *= 2) { + for (int b = 0; b < 2; b++) { + Directory dir = newDirectory(); + boolean useFixedArrays = b == 0; + Writer w = Ints.getWriter(dir, "test", useFixedArrays); + for (int i = 0; i < NUM_VALUES; i++) { + final long v = random.nextLong() % (1 + maxV); + values[i] = v; + w.add(i, v); + } + final int additionalDocs = 1 + random.nextInt(9); + w.finish(NUM_VALUES + additionalDocs); + + Reader r = Ints.getReader(dir, "test", useFixedArrays); + for (int iter = 0; iter < 2; iter++) { + Source s = r.load(); + for (int i = 0; i < NUM_VALUES; i++) { + final long v = s.ints(i); + assertEquals("index " + i + " b: " + b, values[i], v); + } + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum iEnum = r.getEnum(); + ValuesAttribute attr = iEnum.addAttribute(ValuesAttribute.class); + LongsRef ints = attr.ints(); + for (int i = 0; i < NUM_VALUES; i++) { + assertEquals(i, iEnum.nextDoc()); + assertEquals(values[i], ints.get()); + } + for (int i = NUM_VALUES; i < NUM_VALUES + additionalDocs; i++) { + assertEquals(i, iEnum.nextDoc()); + assertEquals("" + i, 0, ints.get()); + } + + iEnum.close(); + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum iEnum = r.getEnum(); + ValuesAttribute attr = iEnum.addAttribute(ValuesAttribute.class); + LongsRef ints = attr.ints(); + for (int i = 0; i < NUM_VALUES; i += 1 + random.nextInt(25)) { + assertEquals(i, iEnum.advance(i)); + assertEquals(values[i], ints.get()); + } + for (int i = NUM_VALUES; i < NUM_VALUES + additionalDocs; i++) { + assertEquals(i, iEnum.advance(i)); + assertEquals("" + i, 0, ints.get()); + } + + iEnum.close(); + } + r.close(); + dir.close(); + } + } + } + + public void testFloats4() throws IOException { + runTestFloats(4, 0.00001); + } + + private void runTestFloats(int precision, double delta) + throws IOException { + Directory dir = newDirectory(); + Writer w = Floats.getWriter(dir, "test", precision); + final int NUM_VALUES = 1000; + final double[] values = new double[NUM_VALUES]; + for (int i = 0; i < NUM_VALUES; i++) { + final double v = precision == 4 ? 
random.nextFloat() : random.nextDouble(); + values[i] = v; + w.add(i, v); + } + final int additionalValues = 1 + random.nextInt(10); + w.finish(NUM_VALUES + additionalValues); + + Reader r = Floats.getReader(dir, "test", NUM_VALUES + + additionalValues); + for (int iter = 0; iter < 2; iter++) { + Source s = r.load(); + for (int i = 0; i < NUM_VALUES; i++) { + assertEquals(values[i], s.floats(i), 0.0f); + } + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum fEnum = r.getEnum(); + ValuesAttribute attr = fEnum.addAttribute(ValuesAttribute.class); + FloatsRef floats = attr.floats(); + for (int i = 0; i < NUM_VALUES; i++) { + assertEquals(i, fEnum.nextDoc()); + assertEquals(values[i], floats.get(), delta); + } + for(int i = NUM_VALUES; i < NUM_VALUES + additionalValues; i++) { + assertEquals(i, fEnum.nextDoc()); + assertEquals(0.0, floats.get(), delta); + } + fEnum.close(); + } + for (int iter = 0; iter < 2; iter++) { + ValuesEnum fEnum = r.getEnum(); + ValuesAttribute attr = fEnum.addAttribute(ValuesAttribute.class); + FloatsRef floats = attr.floats(); + for (int i = 0; i < NUM_VALUES; i += 1 + random.nextInt(25)) { + assertEquals(i, fEnum.advance(i)); + assertEquals(values[i], floats.get(), delta); + } + for(int i = NUM_VALUES; i < NUM_VALUES + additionalValues; i++) { + assertEquals(i, fEnum.advance(i)); + assertEquals(0.0, floats.get(), delta); + } + fEnum.close(); + } + + r.close(); + dir.close(); + } + + public void testFloats8() throws IOException { + runTestFloats(8, 0.0); + } + + /** + * Tests complete indexing of {@link Values}, including deletions, merging and + * sparse value fields, with compound files enabled + */ + public void testCFSIndex() throws IOException { + // without deletions + IndexWriterConfig cfg = writerConfig(true); + // primitives - no deletes + runTestNumerics(cfg, false); + + cfg = writerConfig(true); + // bytes - no deletes + runTestIndexBytes(cfg, false); + + // with deletions + cfg = writerConfig(true); + // primitives + runTestNumerics(cfg, true); + + cfg = writerConfig(true); + // bytes + runTestIndexBytes(cfg, true); + } + + /** + * Tests complete indexing of {@link Values}, including deletions, merging and + * sparse value fields, with compound files disabled + */ + public void testIndex() throws IOException { + // without deletions + IndexWriterConfig cfg = writerConfig(false); + // primitives - no deletes + runTestNumerics(cfg, false); + + cfg = writerConfig(false); + // bytes - no deletes + runTestIndexBytes(cfg, false); + + // with deletions + cfg = writerConfig(false); + // primitives + runTestNumerics(cfg, true); + + cfg = writerConfig(false); + // bytes + runTestIndexBytes(cfg, true); + } + + private IndexWriterConfig writerConfig(boolean useCompoundFile) { + final IndexWriterConfig cfg = newIndexWriterConfig( + TEST_VERSION_CURRENT, new MockAnalyzer()); + MergePolicy mergePolicy = cfg.getMergePolicy(); + if(mergePolicy instanceof LogMergePolicy) { + ((LogMergePolicy)mergePolicy).setUseCompoundFile(useCompoundFile); + } else if(useCompoundFile) { + LogMergePolicy policy = new LogDocMergePolicy(); + policy.setUseCompoundFile(useCompoundFile); + cfg.setMergePolicy(policy); + } + return cfg; + } + + public void runTestNumerics(IndexWriterConfig cfg, + boolean withDeletions) throws IOException { + Directory d = newDirectory(); + IndexWriter w = new IndexWriter(d, cfg); + final int numValues = 350; + final List<Values> numVariantList = new ArrayList<Values>(NUMERICS); + + // run in random order to test if fill works correctly during merges + Collections.shuffle(numVariantList, 
random); + for (Values val : numVariantList) { + OpenBitSet deleted = indexValues(w, numValues, val, numVariantList, + withDeletions, 7); + List closeables = new ArrayList(); + IndexReader r = w.getReader(); + final int numRemainingValues = (int) (numValues - deleted.cardinality()); + final int base = r.numDocs() - numRemainingValues; + switch (val) { + case PACKED_INTS: + case PACKED_INTS_FIXED: { + Reader intsReader = r.getIndexValues(val.name()); + Source ints = intsReader.load(); + ValuesEnum intsEnum = intsReader.getEnum(); + assertNotNull(intsEnum); + LongsRef enumRef = intsEnum.addAttribute(ValuesAttribute.class).ints(); + for (int i = 0; i < base; i++) { + assertEquals(0, ints.ints(i)); + assertEquals(val.name() + " base: " + base + " index: " + i, i, random.nextBoolean()?intsEnum.advance(i): intsEnum.nextDoc()); + assertEquals(0, enumRef.get()); + } + int expected = 0; + for (int i = base; i < r.numDocs(); i++, expected++) { + while (deleted.get(expected)) { + expected++; + } + assertEquals("advance failed at index: " + i + " of " + r.numDocs() + " docs", i, intsEnum.advance(i)); + assertEquals(expected, ints.ints(i)); + assertEquals(expected, enumRef.get()); + + } + } + break; + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: { + Reader floatReader = r.getIndexValues(val.name()); + Source floats = floatReader.load(); + ValuesEnum floatEnum = floatReader.getEnum(); + assertNotNull(floatEnum); + FloatsRef enumRef = floatEnum.addAttribute(ValuesAttribute.class).floats(); + + for (int i = 0; i < base; i++) { + assertEquals(0.0d, floats.floats(i), 0.0d); + assertEquals(i, random.nextBoolean()?floatEnum.advance(i): floatEnum.nextDoc()); + assertEquals("index " + i, 0.0 ,enumRef.get(), 0.0); + } + int expected = 0; + for (int i = base; i < r.numDocs(); i++, expected++) { + while (deleted.get(expected)) { + expected++; + } + assertEquals("advance failed at index: " + i + " of " + r.numDocs() + " docs base:" + base, i, floatEnum.advance(i)); + assertEquals("index " + i, 2.0 * expected ,enumRef.get() , 0.00001); + assertEquals("index " + i, 2.0 * expected, floats.floats(i), 0.00001); + } + } + break; + default: + fail("unexpected value " + val); + } + + closeables.add(r); + for (Closeable toClose : closeables) { + toClose.close(); + } + } + w.close(); + d.close(); + } + + private static EnumSet BYTES = EnumSet.of( + Values.BYTES_FIXED_DEREF, + Values.BYTES_FIXED_SORTED, + Values.BYTES_FIXED_STRAIGHT, + Values.BYTES_VAR_DEREF , + Values.BYTES_VAR_SORTED, + Values.BYTES_VAR_STRAIGHT + ); + + private static EnumSet STRAIGHT_BYTES = EnumSet.of( + Values.BYTES_FIXED_STRAIGHT, + Values.BYTES_VAR_STRAIGHT + ); + + private static EnumSet NUMERICS = EnumSet.of(Values.PACKED_INTS, Values.PACKED_INTS_FIXED, Values.SIMPLE_FLOAT_4BYTE, Values.SIMPLE_FLOAT_8BYTE); + + private static Index[] IDX_VALUES = new Index[] { Index.ANALYZED, Index.ANALYZED_NO_NORMS, Index.NOT_ANALYZED, Index.NOT_ANALYZED_NO_NORMS}; + private OpenBitSet indexValues(IndexWriter w, int numValues, + Values value, List valueVarList, boolean withDeletions, + int multOfSeven) throws CorruptIndexException, IOException { + final boolean isNumeric = NUMERICS.contains(value); + OpenBitSet deleted = new OpenBitSet(numValues); + Document doc = new Document(); + Fieldable field = random.nextBoolean()? 
new ValuesField(value.name()):newField(value.name(), _TestUtil.randomRealisticUnicodeString(random, 10), IDX_VALUES[random.nextInt(IDX_VALUES.length)]); + doc.add(field); + + ValuesAttribute valuesAttribute = ValuesField.values(field); + valuesAttribute.setType(value); + final LongsRef intsRef = valuesAttribute.ints(); + final FloatsRef floatsRef = valuesAttribute.floats(); + final BytesRef bytesRef = valuesAttribute.bytes(); + + final String idBase = value.name() + "_"; + final byte[] b = new byte[multOfSeven]; + if (bytesRef != null) { + bytesRef.bytes = b; + bytesRef.length = b.length; + bytesRef.offset = 0; + } + + byte upto = 0; + for (int i = 0; i < numValues; i++) { + if (isNumeric) { + switch (value) { + case PACKED_INTS: + case PACKED_INTS_FIXED: + intsRef.set(i); + break; + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: + floatsRef.set(2.0f * i); + break; + default: + fail("unexpected value " + value); + } + } else { + for (int j = 0; j < b.length; j++) { + b[j] = upto++; + } + } + doc.removeFields("id"); + doc.add(new Field("id", idBase + i, Store.YES, + Index.NOT_ANALYZED_NO_NORMS)); + w.addDocument(doc); + + if (i % 7 == 0) { + if (withDeletions && random.nextBoolean()) { + Values val = valueVarList.get(random.nextInt(1 + valueVarList + .indexOf(value))); + final int randInt = val == value ? random.nextInt(1 + i) : random + .nextInt(numValues); + w.deleteDocuments(new Term("id", val.name() + "_" + randInt)); + if (val == value) { + deleted.set(randInt); + } + } + w.commit(); + + } + } + w.commit(); + + // nocommit test unoptimized with deletions + if(withDeletions || random.nextBoolean()) + w.optimize(); + return deleted; + } + + public void runTestIndexBytes(IndexWriterConfig cfg, + boolean withDeletions) throws CorruptIndexException, + LockObtainFailedException, IOException { + Directory d = newDirectory(); + IndexWriter w = new IndexWriter(d, cfg); + final List<Values> byteVariantList = new ArrayList<Values>(BYTES); + + // run in random order to test if fill works correctly during merges + Collections.shuffle(byteVariantList, random); + final int numValues = 350; + for (Values byteIndexValue : byteVariantList) { + List<Closeable> closeables = new ArrayList<Closeable>(); + + int bytesSize = 7 + random.nextInt(128); + OpenBitSet deleted = indexValues(w, numValues, byteIndexValue, + byteVariantList, withDeletions, bytesSize); + final IndexReader r = w.getReader(); + assertEquals(0, r.numDeletedDocs()); + final int numRemainingValues = (int) (numValues - deleted.cardinality()); + final int base = r.numDocs() - numRemainingValues; + + Reader bytesReader = r.getIndexValues(byteIndexValue.name()); +// closeables.add(bytesReader); + assertNotNull("field " + byteIndexValue.name() + + " returned null reader - maybe merge failed", bytesReader); + Source bytes = bytesReader.load(); + ValuesEnum bytesEnum = bytesReader.getEnum(); + assertNotNull(bytesEnum); + final ValuesAttribute attr = bytesEnum.addAttribute(ValuesAttribute.class); + byte upto = 0; + // test the filled up slots for correctness + for (int i = 0; i < base; i++) { + final BytesRef br = bytes.bytes(i); + String msg = " field: " + byteIndexValue.name() + " at index: " + i + + " base: " + base + " numDocs:" + r.numDocs(); + switch (byteIndexValue) { + case BYTES_VAR_STRAIGHT: + case BYTES_FIXED_STRAIGHT: + assertEquals(i, bytesEnum.advance(i)); + // fixed straight returns bytesref with zero bytes all of fixed + // length + assertNotNull("expected non-null - " + msg, br); + if(br.length != 0) { + assertEquals("expected zero bytes of length " + 
bytesSize + " - " + + msg, bytesSize, br.length); + for (int j = 0; j < br.length; j++) { + assertEquals("Byte at index " + j + " doesn't match - " + msg, 0, + br.bytes[br.offset + j]); + } + } + break; + case BYTES_VAR_SORTED: + case BYTES_FIXED_SORTED: + case BYTES_VAR_DEREF: + case BYTES_FIXED_DEREF: + default: + assertNotNull("expected none null - " + msg, br); + if(br.length != 0){ + bytes.bytes(i); + } + assertEquals("expected empty bytes - " + br.utf8ToString() + msg, 0, br.length); + } + } + final BytesRef enumRef = attr.bytes(); + + + // test the actual doc values added in this iteration + assertEquals(base + numRemainingValues, r.numDocs()); + int v = 0; + for (int i = base; i < r.numDocs(); i++) { + + String msg = " field: " + byteIndexValue.name() + " at index: " + i + + " base: " + base + " numDocs:" + r.numDocs() + " bytesSize: " + bytesSize; + while (withDeletions && deleted.get(v++)) { + upto += bytesSize; + } + + BytesRef br = bytes.bytes(i); + if(bytesEnum.docID() != i) + assertEquals("seek failed for index " + i + " " + msg, i, bytesEnum.advance(i)); + for (int j = 0; j < br.length; j++, upto++) { + assertEquals("EnumRef Byte at index " + j + " doesn't match - " + msg, + upto, enumRef.bytes[enumRef.offset + j]); + assertEquals("SourceRef Byte at index " + j + " doesn't match - " + msg, + upto, br.bytes[br.offset + j]); + } + } + + // clean up + closeables.add(r); + for (Closeable toClose : closeables) { + toClose.close(); + } + } + + w.close(); + d.close(); + } + +} Property changes on: src/test/org/apache/lucene/index/values/TestIndexValues.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/Values.java =================================================================== --- src/java/org/apache/lucene/index/values/Values.java (revision 0) +++ src/java/org/apache/lucene/index/values/Values.java (revision 0) @@ -0,0 +1,48 @@ +package org.apache.lucene.index.values; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Controls whether per-field values are stored into + * index. This storage is non-sparse, so it's best to + * use this when all docs have the field, and loads all + * values into RAM, exposing a random access API, when + * loaded. + * + *

NOTE: This feature is experimental and the + * API is free to change in non-backwards-compatible ways. */ +public enum Values { + + /** Integral value is stored as packed ints. The bit + * precision is fixed across the segment, and + * determined by the min/max values in the field. */ + PACKED_INTS, + PACKED_INTS_FIXED, + SIMPLE_FLOAT_4BYTE, + SIMPLE_FLOAT_8BYTE, + + // nocommit -- shouldn't lucene decide/detect straight vs + // deref, as well fixed vs var? + BYTES_FIXED_STRAIGHT, + BYTES_FIXED_DEREF, + BYTES_FIXED_SORTED, + + BYTES_VAR_STRAIGHT, + BYTES_VAR_DEREF, + BYTES_VAR_SORTED + + // nocommit -- need STRING variants as well +} Property changes on: src/java/org/apache/lucene/index/values/Values.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/search/ReqExclScorer.java =================================================================== --- src/java/org/apache/lucene/search/ReqExclScorer.java (revision 1006266) +++ src/java/org/apache/lucene/search/ReqExclScorer.java (working copy) @@ -23,7 +23,7 @@ /** A Scorer for queries with a required subscorer * and an excluding (prohibited) sub DocIdSetIterator. *
- * This Scorer implements {@link Scorer#skipTo(int)}, - * and it uses the skipTo() on the given scorers. + * This Scorer implements {@link Scorer#advance(int)}, + * and it uses advance() on the given scorers. */ class ReqExclScorer extends Scorer { Index: src/java/org/apache/lucene/index/values/Ints.java =================================================================== --- src/java/org/apache/lucene/index/values/Ints.java (revision 0) +++ src/java/org/apache/lucene/index/values/Ints.java (revision 0) @@ -0,0 +1,32 @@ +package org.apache.lucene.index.values; + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.values.PackedIntsImpl.IntsReader; +import org.apache.lucene.index.values.PackedIntsImpl.IntsWriter; +import org.apache.lucene.store.Directory; +//nocommit - add mmap version +//nocommit - add bulk copy where possible +public class Ints { + + private Ints() { + } + + public static void files(String id, Collection<String> files) + throws IOException { + files.add(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + } + + public static Writer getWriter(Directory dir, String id, boolean useFixedArray) + throws IOException { + //nocommit - implement fixed?! + return new IntsWriter(dir, id); + } + + public static Reader getReader(Directory dir, String id, boolean useFixedArray) throws IOException { + return new IntsReader(dir, id); + } +} Property changes on: src/java/org/apache/lucene/index/values/Ints.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (revision 1006266) +++ src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (working copy) @@ -20,10 +20,16 @@ import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Set; +import java.util.Map.Entry; import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.values.ValuesAttribute; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.RamUsageEstimator; /** @@ -243,10 +249,24 @@ // enabled; we could save [small amount of] CPU // here. 
quickSort(fields, 0, fieldCount-1); + - for(int i=0;i 0 + private int lastDocID = -1; + private int[] docToAddress; + + public Writer(Directory dir, String id, AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false, null, bytesUsed); + docToAddress = new int[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + } + + public Writer(Directory dir, String id) throws IOException { + this(dir, id, new AtomicLong()); + } + + // Fills up to but not including this docID + private void fill(final int docID) { + if (docID >= docToAddress.length) { + int oldSize = docToAddress.length; + docToAddress = ArrayUtil.grow(docToAddress, 1 + docID); + bytesUsed.addAndGet((docToAddress.length-oldSize)*RamUsageEstimator.NUM_BYTES_INT); + } + for (int i = lastDocID + 1; i < docID; i++) { + docToAddress[i] = address; + } + lastDocID = docID; + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if(bytes.length == 0) + return; // default + if (datOut == null) + initDataOut(); + fill(docID); + docToAddress[docID] = address; + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + address += bytes.length; + } + + @Override + synchronized public void finish(int docCount) throws IOException { + if (datOut == null) + return; + initIndexOut(); + // write the address index + fill(docCount); + idxOut.writeVInt(address); + // nocommit -- allow not -1 + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(address)); + for (int i = 0; i < docCount; i++) { + w.add(docToAddress[i]); + } + w.finish(); + bytesUsed.addAndGet(-(docToAddress.length)*RamUsageEstimator.NUM_BYTES_INT); + docToAddress = null; + super.finish(docCount); + } + + public long ramBytesUsed() { + return bytesUsed.get(); + } + } + + public static class Reader extends BytesReaderBase { + private final int maxDoc; + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + this.maxDoc = maxDoc; + } + + @Override + public Source load() throws IOException { + return new Source(cloneData(), cloneIndex()); + } + + private class Source extends BytesBaseSource { + private final int totBytes; + // TODO: paged data + private final byte[] data; + private final BytesRef bytesRef = new BytesRef(); + private final PackedInts.Reader addresses; + + public Source(IndexInput datIn, IndexInput idxIn) throws IOException { + super(datIn, idxIn); + totBytes = idxIn.readVInt(); + data = new byte[totBytes]; + datIn.readBytes(data, 0, totBytes); + addresses = PackedInts.getReader(idxIn); + bytesRef.bytes = data; + } + + @Override + public BytesRef bytes(int docID) { + final int address = (int) addresses.get(docID); + bytesRef.offset = address; + if (docID == maxDoc - 1) { + bytesRef.length = totBytes - bytesRef.offset; + } else { + bytesRef.length = (int) addresses.get(1 + docID) - bytesRef.offset; + } + return bytesRef; + } + + @Override + public int getValueCount() { + throw new UnsupportedOperationException(); + } + + public long ramBytesUsed() { + // TODO(simonw): move address ram usage to PackedInts? 
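+ // data[] plus array headers, plus the packed address index; note that + // addresses.getBitsPerValue() * addresses.size() counts bits, not bytes, + // so this is only a rough estimate (see TODO above)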
+ return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + data.length + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + addresses + .getBitsPerValue() + * addresses.size()); + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new VarStraightBytesEnum(source, cloneData(), cloneIndex()); + } + + private class VarStraightBytesEnum extends ValuesEnum { + private final PackedInts.Reader addresses; + private final IndexInput datIn; + private final IndexInput idxIn; + private final long fp; + private final int totBytes; + private final BytesRef ref; + private int pos = -1; + + protected VarStraightBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn) throws IOException { + super(source, Values.BYTES_VAR_STRAIGHT); + totBytes = idxIn.readVInt(); + fp = datIn.getFilePointer(); + addresses = PackedInts.getReader(idxIn); + this.datIn = datIn; + this.idxIn = idxIn; + ref = attr.bytes(); + + } + + @Override + public void close() throws IOException { + datIn.close(); + idxIn.close(); + } + + @Override + public int advance(final int target) throws IOException { + if (target >= maxDoc) { + ref.length = 0; + ref.offset = 0; + return pos = NO_MORE_DOCS; + } + final long addr = addresses.get(target); + if (addr == totBytes) { + // nocommit is that a valid default value + ref.length = 0; + ref.offset = 0; + return pos = target; + } + datIn.seek(fp + addr); + final int size = (int) (target == maxDoc - 1 ? totBytes - addr + : addresses.get(target + 1) - addr); + if (ref.bytes.length < size) + ref.grow(size); + ref.length = size; + datIn.readBytes(ref.bytes, 0, size); + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } + } +} Property changes on: src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- src/java/org/apache/lucene/index/IndexReader.java (revision 1006266) +++ src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -21,6 +21,8 @@ import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.Similarity; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.values.Cache; +import org.apache.lucene.index.values.Reader; import org.apache.lucene.store.*; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -1288,6 +1290,17 @@ throw new UnsupportedOperationException("This reader does not support this method."); } + // nocommit -- should this expose the iterator API via Fields and access Source only via getIndexValuesCache? + public Reader getIndexValues(String field) { + throw new UnsupportedOperationException(); + } + + private final Cache indexValuesCache = new Cache(this); + + // nocommit -- don't expose readers if we have this? 
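+ // exposes the per-reader cache; loaded Sources are shared across all + // consumers of this reader and can be dropped via the Cache purge* methods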
+ public Cache getIndexValuesCache() { + return indexValuesCache; + } private Fields fields; Index: src/java/org/apache/lucene/index/values/ValuesAttributeImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/ValuesAttributeImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/ValuesAttributeImpl.java (revision 0) @@ -0,0 +1,151 @@ +package org.apache.lucene.index.values; + +import java.util.Comparator; + +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.LongsRef; + +public class ValuesAttributeImpl extends AttributeImpl implements ValuesAttribute { + private Values type; + private BytesRef bytes = null; + private FloatsRef floats = null; + private LongsRef ints = null; + private Comparator<BytesRef> bytesComp; + + public BytesRef bytes() { + return bytes; + } + + public FloatsRef floats() { + return floats; + } + + public LongsRef ints() { + return ints; + } + + public Values type() { + return type; + } + + public void setType(Values type) { + this.type = type; + switch (type) { + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_FIXED_STRAIGHT: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_VAR_STRAIGHT: + bytes = new BytesRef(); + ints = null; + floats = null; + break; + case PACKED_INTS: + case PACKED_INTS_FIXED: + ints = new LongsRef(new long[1], 0, 1); + bytes = null; + floats = null; + break; + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: + floats = new FloatsRef(new double[1], 0, 1); + ints = null; + bytes = null; + break; + + } + } + + @Override + public void clear() { + // TODO + } + + @Override + public void copyTo(AttributeImpl target) { + ValuesAttributeImpl other = (ValuesAttributeImpl)target; + other.setType(type); + + switch (type) { + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_FIXED_STRAIGHT: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_VAR_STRAIGHT: + other.bytes = (BytesRef) bytes.clone(); + break; + case PACKED_INTS: + case PACKED_INTS_FIXED: + other.ints = (LongsRef) ints.clone(); + break; + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: + other.floats = (FloatsRef) floats.clone(); + break; + + } + } + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + result = prime * result + ((bytes == null) ? 0 : bytes.hashCode()); + result = prime * result + ((floats == null) ? 0 : floats.hashCode()); + result = prime * result + ((ints == null) ? 0 : ints.hashCode()); + result = prime * result + ((type == null) ? 
0 : type.hashCode()); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null || getClass() != obj.getClass()) + return false; + ValuesAttributeImpl other = (ValuesAttributeImpl) obj; + if (bytes == null) { + if (other.bytes != null) + return false; + } else if (!bytes.equals(other.bytes)) + return false; + if (floats == null) { + if (other.floats != null) + return false; + } else if (!floats.equals(other.floats)) + return false; + if (ints == null) { + if (other.ints != null) + return false; + } else if (!ints.equals(other.ints)) + return false; + if (type == null) { + if (other.type != null) + return false; + } else if (!type.equals(other.type)) + return false; + return true; + } + + public Comparator<BytesRef> bytesComparator() { + return bytesComp; + } + + public void setBytesComparator(Comparator<BytesRef> comp) { + bytesComp = comp; + } + + + +} Property changes on: src/java/org/apache/lucene/index/values/ValuesAttributeImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java (revision 0) @@ -0,0 +1,258 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.Bytes.BytesBaseSortedSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.index.values.FixedDerefBytesImpl.Reader.DerefBytesEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectAllocator; +import org.apache.lucene.util.packed.PackedInts; + +// Stores fixed-length byte[] values in sorted order; each +// doc stores an ord pointing into the sorted value table + +class FixedSortedBytesImpl { + + static final String CODEC_NAME = "FixedSortedBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + private int[] docToEntry; + private final Comparator<BytesRef> comp; + + + private final BytesRefHash hash = new BytesRefHash(pool); + + public Writer(Directory dir, String id, Comparator<BytesRef> comp) throws IOException { + this(dir, id, comp, new DirectAllocator(ByteBlockPool.BYTE_BLOCK_SIZE), + new AtomicLong()); + } + + public Writer(Directory dir, String id, Comparator<BytesRef> comp, Allocator allocator, AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false, new ByteBlockPool(allocator), bytesUsed); + docToEntry = new int[1]; +// docToEntry[0] = -1; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + this.comp = comp; + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if(bytes.length == 0) + return; // default - skip it + if (size == -1) { + size = bytes.length; + initDataOut(); + datOut.writeInt(size); + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + " but got " + bytes.length); + } + if (docID >= docToEntry.length) { + int[] newArray = new int[ArrayUtil.oversize(1 + docID, + RamUsageEstimator.NUM_BYTES_INT)]; + System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length); +// Arrays.fill(newArray, docToEntry.length, newArray.length, -1); + + bytesUsed.addAndGet((newArray.length - docToEntry.length) * RamUsageEstimator.NUM_BYTES_INT); + docToEntry = newArray; + } + int e = hash.add(bytes); + docToEntry[docID] = 1+(e < 0? 
(-e)-1: e); + } + + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + synchronized public void finish(int docCount) throws IOException { + if(datOut == null)// no data added + return; + initIndexOut(); + final int[] sortedEntries = hash.sort(comp); + final int count = hash.size(); + int[] address= new int[count]; + // first dump bytes data, recording address as we go + for(int i=0;i docToEntry.length) { + limit = docToEntry.length; + } else { + limit = docCount; + } + for(int i=0;i<limit;i++) { + final int e = docToEntry[i]; + if (e == 0) { + w.add(0); // default - no entry for this doc + } else { + assert e > 0 && e <= count: "index must be > 0 and <= " + count + " was: " + e; + w.add(address[e-1]); + } + } + + for(int i=limit;i comp) throws IOException { + return new Source(cloneData(), cloneIndex(), size, comp); + } + + private static class Source extends BytesBaseSortedSource { + + // TODO: paged data + private final byte[] data; + private final BytesRef bytesRef = new BytesRef(); + private final PackedInts.Reader index; + private final LookupResult lookupResult = new LookupResult(); + private final int numValue; + private final Comparator<BytesRef> comp; + private final int size; + + public Source(IndexInput datIn, IndexInput idxIn, int size, Comparator<BytesRef> comp) throws IOException { + super(datIn, idxIn); + this.size = size; + datIn.seek(CodecUtil.headerLength(CODEC_NAME) + 4); + idxIn.seek(CodecUtil.headerLength(CODEC_NAME)); + + numValue = idxIn.readInt(); + data = new byte[size*numValue]; + datIn.readBytes(data, 0, size*numValue); + datIn.close(); + + index = PackedInts.getReader(idxIn); + idxIn.close(); // do we need to close that here? + + bytesRef.bytes = data; + bytesRef.length = size; + // default byte sort order + this.comp = comp==null?BytesRef.getUTF8SortedAsUnicodeComparator():comp; + } + + @Override + public int ord(int docID) { + return (int) index.get(docID); + } + + @Override + public BytesRef getByOrd(int ord) { + if (ord == 0) { + return defaultValue; + } else { + bytesRef.offset = ((ord-1) * size); + return bytesRef; + } + } + + @Override + public LookupResult getByValue(BytesRef bytes) { + return binarySearch(bytes, 0, numValue-1); + } + + public long ramBytesUsed() { + // TODO(simonw): move ram calculation to PackedInts? 
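+ // sorted data blob plus the packed ord index; index.getBitsPerValue() * + // index.size() counts bits, not bytes, so this is only a rough estimate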
+ return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + data.length + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + index.getBitsPerValue() * index.size()); + } + + @Override + public int getValueCount() { + return numValue; + } + + private LookupResult binarySearch(BytesRef b, int low, int high) { + + while (low <= high) { + int mid = (low + high) >>> 1; + bytesRef.offset = mid * size; + int cmp = comp.compare(bytesRef, b); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + lookupResult.ord = mid+1; + lookupResult.found = true; + return lookupResult; + } + } + lookupResult.ord = low; + lookupResult.found = false; + return lookupResult; + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + // do unsorted + return new DerefBytesEnum(source, cloneData(), cloneIndex(), CODEC_NAME, size); + } + } +} Property changes on: src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/Cache.java =================================================================== --- src/java/org/apache/lucene/index/values/Cache.java (revision 0) +++ src/java/org/apache/lucene/index/values/Cache.java (revision 0) @@ -0,0 +1,116 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.values.Reader.SortedSource; +import org.apache.lucene.index.values.Reader.Source; +import org.apache.lucene.util.BytesRef; + +public class Cache { + final IndexReader r; + // TODO(simonw): use WeakHashMaps instead here? 
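+ // one map per value type, keyed by field id; entries are created lazily + // by the synchronized getters below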
+ final Map<String, Source> ints = new HashMap<String, Source>(); + final Map<String, Source> floats = new HashMap<String, Source>(); + final Map<String, Source> bytes = new HashMap<String, Source>(); + final Map<String, SortedSource> sortedBytes = new HashMap<String, SortedSource>(); + + public Cache(IndexReader r) { + this.r = r; + } + + synchronized public Source getInts(String id) throws IOException { + Source s = ints.get(id); + if (s == null) { + final Reader indexValues = r.getIndexValues(id); + if (indexValues == null) { + return null; + } + s = indexValues.load(); + ints.put(id, s); + } + + return s; + } + + synchronized public Source getFloats(String id) throws IOException { + Source s = floats.get(id); + if (s == null) { + final Reader indexValues = r.getIndexValues(id); + if (indexValues == null) { + return null; + } + s = indexValues.load(); + floats.put(id, s); + } + + return s; + } + + synchronized public SortedSource getSortedBytes(String id, + Comparator<BytesRef> comp) throws IOException { + SortedSource s = sortedBytes.get(id); + if (s == null) { + final Reader indexValues = r.getIndexValues(id); + if (indexValues == null) { + return null; + } + s = indexValues.loadSorted(comp); + sortedBytes.put(id, s); + } else { + // TODO(simonw): verify comp is the same! + } + + return s; + } + + synchronized public Source getBytes(String id) throws IOException { + Source s = bytes.get(id); + if (s == null) { + final Reader indexValues = r.getIndexValues(id); + if (indexValues == null) { + return null; + } + s = indexValues.load(); + bytes.put(id, s); + } + + return s; + } + + public void purgeInts(String id) { + ints.remove(id); + } + + public void purgeFloats(String id) { + floats.remove(id); + } + + public void purgeBytes(String id) { + bytes.remove(id); + } + + public void purgeSortedBytes(String id) { + sortedBytes.remove(id); + } +} Property changes on: src/java/org/apache/lucene/index/values/Cache.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/util/BytesRefHash.java =================================================================== --- src/java/org/apache/lucene/util/BytesRefHash.java (revision 1006266) +++ src/java/org/apache/lucene/util/BytesRefHash.java (working copy) @@ -353,6 +353,7 @@ // 1 byte to store length buffer[bufferUpto] = (byte) length; pool.byteUpto += length + 1; + assert length >= 0: "Length must be non-negative: " + length; System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1, length); } else { @@ -569,4 +570,64 @@ } } + + public static class ParallelBytesStartArray<T extends ParallelArrayBase<T>> extends BytesStartArray { + private final T prototype; + public T array; + + public ParallelBytesStartArray(T template) { + this.prototype = template; + } + @Override + public int[] init() { + if(array == null) { + array = prototype.newInstance(2); + } + return array.textStart; + } + + @Override + public int[] grow() { + array = array.grow(); + return array.textStart; + } + + @Override + public int[] clear() { + if(array != null) { + array.deref(); + array = null; + } + return null; + } + + @Override + public AtomicLong bytesUsed() { + return array.bytesUsed(); + } + + } + + public abstract static class ParallelArrayBase<T extends ParallelArrayBase<T>> extends ParallelArray<T> { + final int[] textStart; + + protected ParallelArrayBase(int size, AtomicLong bytesUsed) { + super(size, bytesUsed); + textStart = new int[size]; + } + + @Override + protected int bytesPerEntry() { + return RamUsageEstimator.NUM_BYTES_INT; + } + + @Override + protected void copyTo(T toArray, int numToCopy) { + System.arraycopy(textStart, 0, 
toArray.textStart, 0, numToCopy); + } + + @Override + public abstract T newInstance(int size); + + } } Index: src/java/org/apache/lucene/index/CompoundFileWriter.java =================================================================== --- src/java/org/apache/lucene/index/CompoundFileWriter.java (revision 1006266) +++ src/java/org/apache/lucene/index/CompoundFileWriter.java (working copy) @@ -49,9 +49,13 @@ */ final class CompoundFileWriter { - private static final class FileEntry { + static final class FileEntry { + + FileEntry(String file) { + this.file = file; + } /** source file */ - String file; + final String file; /** temporary holder for the start of directory entry for this file */ long directoryOffset; @@ -128,10 +132,7 @@ if (! ids.add(file)) throw new IllegalArgumentException( "File " + file + " already added"); - - FileEntry entry = new FileEntry(); - entry.file = file; - entries.add(entry); + entries.add(new FileEntry(file)); } /** Merge files with the extensions added up to now. Index: src/java/org/apache/lucene/index/values/Floats.java =================================================================== --- src/java/org/apache/lucene/index/values/Floats.java (revision 0) +++ src/java/org/apache/lucene/index/values/Floats.java (revision 0) @@ -0,0 +1,389 @@ +package org.apache.lucene.index.values; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.DoubleBuffer; +import java.nio.FloatBuffer; +import java.util.Collection; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Exposes writer/reader for floating point values. You can specify 4 (java + * float) or 8 (java double) byte precision. + */ +//nocommit - add mmap version +//nocommit - add bulk copy where possible +public class Floats { + private static final String CODEC_NAME = "SimpleFloats"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + private static final int INT_ZERO = Float.floatToRawIntBits(0.0f); + private static final long LONG_ZERO = Double.doubleToRawLongBits(0.0); + + public static void files(String id, Collection<String> files) { + files.add(id + "." 
+ IndexFileNames.CSF_DATA_EXTENSION); + } + + public static Writer getWriter(Directory dir, String id, int precisionBytes) + throws IOException { + if (precisionBytes != 4 && precisionBytes != 8) { + throw new IllegalArgumentException("precisionBytes must be 4 or 8; got " + + precisionBytes); + } + if (precisionBytes == 4) { + return new Float4Writer(dir, id); + } else { + return new Float8Writer(dir, id); + } + } + + public static Reader getReader(Directory dir, String id, int maxDoc) + throws IOException { + return new FloatsReader(dir, id, maxDoc); + } + + abstract static class FloatsWriter extends Writer { + private final Directory dir; + private final String id; + private FloatsRef floatsRef; + protected int lastDocId = -1; + protected IndexOutput datOut; + private final byte precision; + + protected FloatsWriter(Directory dir, String id, int precision) + throws IOException { + this.dir = dir; + this.id = id; + this.precision = (byte) precision; + } + + protected void initDatOut() throws IOException { + datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT); + assert datOut.getFilePointer() == CodecUtil.headerLength(CODEC_NAME); + datOut.writeByte(precision); + } + + public long ramBytesUsed() { + return 0; + } + + @Override + protected void add(int docID) throws IOException { + add(docID, floatsRef.get()); + } + + @Override + protected void setNextAttribute(ValuesAttribute attr) { + floatsRef = attr.floats(); + } + + protected abstract int fillDefault(int num) throws IOException; + + @Override + protected void merge(MergeState state) throws IOException { + if (state.bits == null && state.reader instanceof FloatsReader) { + // no deletes - bulk copy + // nocommit - should be do bulks with deletes too? + final FloatsReader reader = (FloatsReader) state.reader; + assert reader.precisionBytes == (int) precision; + if (reader.maxDoc == 0) + return; + if (datOut == null) + initDatOut(); + final int docBase = state.docBase; + if (docBase - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docBase - lastDocId - 1); + } + lastDocId += reader.transferTo(datOut); + } else + super.merge(state); + } + + } + + // Writes 4 bytes (float) per value + static class Float4Writer extends FloatsWriter { + + protected Float4Writer(Directory dir, String id) throws IOException { + super(dir, id, 4); + } + + @Override + synchronized public void add(final int docID, final double v) + throws IOException { + assert docID > lastDocId : "docID: " + docID + + " must be greater than the last added doc id: " + lastDocId; + if (datOut == null) { + initDatOut(); + } + if (docID - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docID - lastDocId - 1); + } + assert datOut != null; + datOut.writeInt(Float.floatToRawIntBits((float) v)); + ++lastDocId; + } + + @Override + synchronized public void finish(int docCount) throws IOException { + if (datOut == null) + return; // no data added - don't create file! 
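+ // pad the tail so every doc up to docCount gets the 4-byte default (0.0f)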
+ if (docCount > lastDocId + 1) + for (int i = lastDocId + 1; i < docCount; i++) { + datOut.writeInt(INT_ZERO); // default value + } + datOut.close(); + } + + @Override + protected int fillDefault(int numValues) throws IOException { + for (int i = 0; i < numValues; i++) { + datOut.writeInt(INT_ZERO); + } + return numValues; + } + } + + // Writes 8 bytes (double) per value + static class Float8Writer extends FloatsWriter { + + protected Float8Writer(Directory dir, String id) throws IOException { + super(dir, id, 8); + } + + @Override + synchronized public void add(int docID, double v) throws IOException { + assert docID > lastDocId : "docID: " + docID + + " must be greater than the last added doc id: " + lastDocId; + if (datOut == null) { + initDatOut(); + } + if (docID - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docID - lastDocId - 1); + } + assert datOut != null; + datOut.writeLong(Double.doubleToRawLongBits(v)); + ++lastDocId; + } + + @Override + synchronized public void finish(int docCount) throws IOException { + if (datOut == null) + return; // no data added - don't create file! + if (docCount > lastDocId + 1) + for (int i = lastDocId + 1; i < docCount; i++) { + datOut.writeLong(LONG_ZERO); // default value + } + datOut.close(); + } + + @Override + protected int fillDefault(int numValues) throws IOException { + for (int i = 0; i < numValues; i++) { + datOut.writeLong(LONG_ZERO); + } + return numValues; + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. + */ + static class FloatsReader extends Reader { + + private final IndexInput datIn; + private final int precisionBytes; + // TODO(simonw) is ByteBuffer the way to go here? + private final int maxDoc; + + protected FloatsReader(Directory dir, String id, int maxDoc) + throws IOException { + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.checkHeader(datIn, CODEC_NAME, VERSION_START, VERSION_START); + precisionBytes = datIn.readByte(); + assert precisionBytes == 4 || precisionBytes == 8; + this.maxDoc = maxDoc; + } + + int transferTo(IndexOutput out) throws IOException { + IndexInput indexInput = (IndexInput) datIn.clone(); + try { + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + out.copyBytes(indexInput, precisionBytes * maxDoc); + } finally { + indexInput.close(); + } + return maxDoc; + } + + /** + * Loads the actual values. You may call this more than once, eg if you + * already previously loaded but then discarded the Source. + */ + @Override + public Source load() throws IOException { + ByteBuffer buffer = ByteBuffer.allocate(precisionBytes * maxDoc); + IndexInput indexInput = (IndexInput) datIn.clone(); + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + assert buffer.hasArray() : "Buffer must support Array"; + final byte[] arr = buffer.array(); + indexInput.readBytes(arr, 0, arr.length); + return precisionBytes == 4 ? new Source4(buffer) : new Source8(buffer); + } + + private class Source4 extends Source { + private final FloatBuffer values; + + Source4(ByteBuffer buffer) { + values = buffer.asFloatBuffer(); + } + + @Override + public double floats(int docID) { + final float f = values.get(docID); + // nocommit should we return NaN as default instead of 0.0? + return Float.isNaN(f) ? 
+ + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + values.limit() + * RamUsageEstimator.NUM_BYTES_FLOAT; + } + } + + private class Source8 extends Source { + private final DoubleBuffer values; + + Source8(ByteBuffer buffer) { + values = buffer.asDoubleBuffer(); + } + + @Override + public double floats(int docID) { + final double d = values.get(docID); + // nocommit should we return NaN as default instead of 0.0? + return Double.isNaN(d) ? 0.0d : d; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + values.limit() + * RamUsageEstimator.NUM_BYTES_DOUBLE; + } + } + + public void close() throws IOException { + datIn.close(); + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + IndexInput indexInput = (IndexInput) datIn.clone(); + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + return precisionBytes == 4 ? new Floats4Enum(source, indexInput, maxDoc) + : new Floats8EnumImpl(source, indexInput, maxDoc); + } + } + + static final class Floats4Enum extends FloatsEnumImpl { + + Floats4Enum(AttributeSource source, IndexInput dataIn, int maxDoc) + throws IOException { + super(source, dataIn, 4, maxDoc, Values.SIMPLE_FLOAT_4BYTE); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) + return pos = NO_MORE_DOCS; + dataIn.seek(fp + (target * precision)); + ref.floats[0] = Float.intBitsToFloat(dataIn.readInt()); + ref.offset = 0; // nocommit -- can we ignore this? + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos + 1); + } + } + + private static final class Floats8EnumImpl extends FloatsEnumImpl { + + Floats8EnumImpl(AttributeSource source, IndexInput dataIn, int maxDoc) + throws IOException { + super(source, dataIn, 8, maxDoc, Values.SIMPLE_FLOAT_8BYTE); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) + return pos = NO_MORE_DOCS; + dataIn.seek(fp + (target * precision)); + ref.floats[0] = Double.longBitsToDouble(dataIn.readLong()); + ref.offset = 0; // nocommit -- can we ignore this? + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos + 1); + } + } + + static abstract class FloatsEnumImpl extends ValuesEnum { + protected final IndexInput dataIn; + protected int pos = -1; + protected final int precision; + protected final int maxDoc; + protected final long fp; + protected final FloatsRef ref; + + FloatsEnumImpl(AttributeSource source, IndexInput dataIn, int precision, + int maxDoc, Values type) throws IOException { + super(source, precision == 4 ?
Values.SIMPLE_FLOAT_4BYTE + : Values.SIMPLE_FLOAT_8BYTE); + this.dataIn = dataIn; + this.precision = precision; + this.maxDoc = maxDoc; + fp = dataIn.getFilePointer(); + this.ref = attr.floats(); + this.ref.offset = 0; + } + + @Override + public void close() throws IOException { + dataIn.close(); + } + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/values/Floats.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/util/packed/Packed64.java =================================================================== --- src/java/org/apache/lucene/util/packed/Packed64.java (revision 1006266) +++ src/java/org/apache/lucene/util/packed/Packed64.java (working copy) @@ -182,7 +182,7 @@ final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); final int base = bitPos * FAC_BITPOS; - + assert elementPos < blocks.length : "elementPos: " + elementPos + "; blocks.len: " + blocks.length; return ((blocks[elementPos] << shifts[base]) >>> shifts[base+1]) | ((blocks[elementPos+1] >>> shifts[base+2]) & readMasks[bitPos]); } Index: src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java (revision 0) @@ -0,0 +1,262 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectAllocator; +import org.apache.lucene.util.packed.PackedInts; + +// Stores fixed-length byte[] by deref, i.e. when two docs +// have the same value, they store only 1 byte[] + +class FixedDerefBytesImpl { + + static final String CODEC_NAME = "FixedDerefBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + private int[] docToID; + private final BytesRefHash hash = new BytesRefHash(pool); + + public Writer(Directory dir, String id) throws IOException { + this(dir, id, new DirectAllocator(ByteBlockPool.BYTE_BLOCK_SIZE), + new AtomicLong()); + } + + public Writer(Directory dir, String id, Allocator allocator, + AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false, + new ByteBlockPool(allocator), bytesUsed); + docToID = new int[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if (bytes.length == 0) // default value - skip it + return; + if (size == -1) { + size = bytes.length; + initDataOut(); + datOut.writeInt(size); + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + + " but got " + bytes.length); + } + int ord = hash.add(bytes); + + if (ord >= 0) { + // new added entry + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } else { + ord = (-ord) - 1; + } + + if (docID >= docToID.length) { + final int oldSize = docToID.length; + docToID = ArrayUtil.grow(docToID, 1 + docID); + bytesUsed.addAndGet((docToID.length - oldSize) * RamUsageEstimator.NUM_BYTES_INT); + } + docToID[docID] = 1 + ord; + } + + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + synchronized public void finish(int docCount) throws IOException { + if (datOut == null) // no added data + return; + initIndexOut(); + final int count = 1 + hash.size(); + idxOut.writeInt(count - 1); + // write index + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(count - 1)); + final int limit = docCount > docToID.length ? docToID.length : docCount;
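+      // Note (added for clarity, not in the original patch): the packed
+      // index stores 1+ord per document, reserving 0 for "no value"; e.g.
+      // docs with values [A, B, A] followed by a value-less doc are written
+      // as [1, 2, 1, 0], while the data file stores A and B exactly once.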
+ for (int i = 0; i < limit; i++) { + w.add(docToID[i]); + } + // fill up remaining docs with zeros + for (int i = limit; i < docCount; i++) { + w.add(0); + } + w.finish(); + hash.clear(); + + super.finish(docCount); + } + } + + public static class Reader extends BytesReaderBase { + private final int size; + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + size = datIn.readInt(); + } + + @Override + public Source load() throws IOException { + return new Source(cloneData(), cloneIndex(), size); + } + + private static class Source extends BytesBaseSource { + // TODO: paged data or mmap? + private final byte[] data; + private final BytesRef bytesRef = new BytesRef(); + private final PackedInts.Reader index; + private final int numValue; + private final int size; + + protected Source(IndexInput datIn, IndexInput idxIn, int size) + throws IOException { + super(datIn, idxIn); + this.size = size; + numValue = idxIn.readInt(); + data = new byte[size * numValue]; + datIn.readBytes(data, 0, size * numValue); + index = PackedInts.getReader(idxIn); + bytesRef.bytes = data; + bytesRef.length = size; + } + + @Override + public BytesRef bytes(int docID) { + final int id = (int) index.get(docID); + if (id == 0) { + return defaultValue; + } + bytesRef.offset = ((id - 1) * size); + return bytesRef; + } + + public long ramBytesUsed() { + // TODO(simonw): move ram calculation to PackedInts?! + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + data.length + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + index + .getBitsPerValue() + * index.size()); + } + + @Override + public int getValueCount() { + return numValue; + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new DerefBytesEnum(source, cloneData(), cloneIndex(), CODEC_NAME, + size); + } + + static class DerefBytesEnum extends ValuesEnum { + protected final IndexInput datIn; + private final PackedInts.ReaderIterator idx; + protected final long fp; + private final int size; + protected final BytesRef ref; + private final int valueCount; + private int pos = -1; + + public DerefBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn, String codecName, int size) throws IOException { + this(source, datIn, idxIn, codecName, size, Values.BYTES_FIXED_DEREF); + } + + protected DerefBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn, String codecName, int size, Values enumType) + throws IOException { + super(source, enumType); + ref = attr.bytes(); + this.datIn = datIn; + this.size = size == -1 ? 128 : size;
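+      // Reading-side sketch (illustrative only, not part of the patch): each
+      // advance(target) looks up the packed address for the doc and, unless
+      // it is the 0 default, seeks to (address - 1) * size in the data file:
+      //
+      //   ValuesEnum e = reader.getEnum(); // hypothetical reader instance
+      //   ValuesAttribute attr = e.addAttribute(ValuesAttribute.class);
+      //   BytesRef ref = attr.bytes();
+      //   while (e.nextDoc() != ValuesEnum.NO_MORE_DOCS) { /* use ref */ }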
+ idxIn.readInt(); // read valueCount + idx = PackedInts.getReaderIterator(idxIn); + fp = datIn.getFilePointer(); + ref.grow(this.size); + ref.length = this.size; + ref.offset = 0; + valueCount = idx.size(); + } + + @Override + public int advance(int target) throws IOException { + if (target < valueCount) { + final long address = idx.advance(target); + pos = idx.ord(); + if (address == 0) { + // default is empty + ref.length = 0; + ref.offset = 0; + return pos; + } + fill(address, ref); + return pos; + } + return pos = NO_MORE_DOCS; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos + 1); + } + + public void close() throws IOException { + datIn.close(); + idx.close(); + } + + protected void fill(long address, BytesRef ref) throws IOException { + datIn.seek(fp + ((address - 1) * size)); + datIn.readBytes(ref.bytes, 0, size); + ref.length = size; + ref.offset = 0; + } + + @Override + public int docID() { + return pos; + } + + } + } + +} Property changes on: src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/util/FloatsRef.java =================================================================== --- src/java/org/apache/lucene/util/FloatsRef.java (revision 0) +++ src/java/org/apache/lucene/util/FloatsRef.java (revision 0) @@ -0,0 +1,91 @@ +/** + * + */ +package org.apache.lucene.util; + + +public final class FloatsRef implements Cloneable { + public double[] floats; + public int offset; + public int length; + + public FloatsRef() { + } + + public FloatsRef(int capacity) { + floats = new double[capacity]; + } + + public void set(double value) { + floats[offset] = value; + } + + public double get() { + return floats[offset]; + } + + public FloatsRef(double[] floats, int offset, int length) { + this.floats = floats; + this.offset = offset; + this.length = length; + } + + public FloatsRef(FloatsRef other) { + copy(other); + } + + @Override + public Object clone() { + return new FloatsRef(this); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + final int end = offset + length; + for (int i = offset; i < end; i++) { + long value = Double.doubleToLongBits(floats[i]); + result = prime * result + (int) (value ^ (value >>> 32)); + } + return result; + } + + @Override + public boolean equals(Object other) { + return other instanceof FloatsRef && this.floatsEquals((FloatsRef) other); + } + + public boolean floatsEquals(FloatsRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final double[] otherFloats = other.floats; + final int end = offset + length; + for (int upto = offset; upto < end; upto++, otherUpto++) { + if (floats[upto] != otherFloats[otherUpto]) { + return false; + } + } + return true; + } else { + return false; + } + } + + public void copy(FloatsRef other) { + if (floats == null || floats.length < other.length) { + floats = new double[other.length]; + } + System.arraycopy(other.floats, other.offset, floats, 0, other.length); + offset = 0; + length = other.length; + } +} Property changes on: src/java/org/apache/lucene/util/FloatsRef.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (revision 1006266) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (working copy) + private Map<String, Values> fieldValueMap; // create a doc // use only part of the body, modify it to keep the rest (or use all if size==0). // reset the docdata properties so they are not added more than once. private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException { - + Values valueType; final DocState ds = getDocState(); final Document doc = reuseFields ? ds.doc : new Document(); doc.getFields().clear();
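A usage sketch for the doc.stored.values property consumed here and parsed by parseValueFields further below (hypothetical values; the field names assume DocMaker's stock constants such as "docname", "docdate" and "doctitle"):

doc.stored.values=docname:BYTES_VAR_DEREF;docdate:BYTES_FIXED_SORTED;doctitle:BYTES_VAR_SORTED

Each fieldname:ValuesType pair is split on ';' and then ':' and resolved via Values.valueOf, so malformed pairs and unknown type names fail fast with an IllegalArgumentException.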
@@ -178,6 +182,7 @@ name = cnt < 0 ? name : name + "_" + cnt; Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal); nameField.setValue(name); + trySetIndexValues(nameField); doc.add(nameField); // Set DATE_FIELD @@ -187,12 +192,14 @@ } Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal); dateField.setValue(date); + trySetIndexValues(dateField); doc.add(dateField); // Set TITLE_FIELD String title = docData.getTitle(); Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal); titleField.setValue(title == null ? "" : title); + trySetIndexValues(titleField); doc.add(titleField); String body = docData.getBody(); @@ -214,12 +221,15 @@ } Field bodyField = ds.getField(BODY_FIELD, bodyStoreVal, bodyIndexVal, termVecVal); bodyField.setValue(bdy); + trySetIndexValues(bodyField); doc.add(bodyField); if (storeBytes) { Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO); bytesField.setValue(bdy.getBytes("UTF-8")); + trySetIndexValues(bytesField); doc.add(bytesField); + } } @@ -229,6 +239,7 @@ for (final Map.Entry<Object, Object> entry : props.entrySet()) { Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal); f.setValue((String) entry.getValue()); + trySetIndexValues(f); doc.add(f); } docData.setProps(null); @@ -238,6 +249,12 @@ //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n=========="); return doc; } + + private void trySetIndexValues(Field field) { + final Values valueType; + if ((valueType = fieldValueMap.get(field.name())) != null) + ValuesField.set(field, valueType); + } private void resetLeftovers() { leftovr.set(null); @@ -367,6 +384,22 @@ resetLeftovers(); } + private static final Map<String, Values> parseValueFields(String fields) { + if (fields == null) + return Collections.emptyMap(); + String[] split = fields.split(";"); + Map<String, Values> result = new HashMap<String, Values>(); + for (String tuple : split) { + final String[] nameValue = tuple.split(":"); + if (nameValue.length != 2) { + throw new IllegalArgumentException("illegal doc.stored.values format: " + + fields + " expected fieldname:ValuesType;...;...;"); + } + result.put(nameValue[0].trim(), Values.valueOf(nameValue[1].trim())); + } + return result; + } + /** Set the configuration parameters of this doc maker. */ public void setConfig(Config config) { this.config = config; @@ -386,6 +419,7 @@ boolean norms = config.get("doc.tokenized.norms", false); boolean bodyNorms = config.get("doc.body.tokenized.norms", true); boolean termVec = config.get("doc.term.vector", false); + fieldValueMap = parseValueFields(config.get("doc.stored.values", null)); storeVal = (stored ? Field.Store.YES : Field.Store.NO); bodyStoreVal = (bodyStored ?
Field.Store.YES : Field.Store.NO); if (tokenized) { @@ -423,7 +457,6 @@ docState = new ThreadLocal(); indexProperties = config.get("doc.index.props", false); - updateDocIDLimit = config.get("doc.random.id.limit", -1); if (updateDocIDLimit != -1) { r = new Random(179); Index: src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMerger.java (revision 1006266) +++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -31,6 +31,12 @@ import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.MergeState; import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.values.Bytes; +import org.apache.lucene.index.values.Ints; +import org.apache.lucene.index.values.Reader; +import org.apache.lucene.index.values.Floats; +import org.apache.lucene.index.values.Values; +import org.apache.lucene.index.values.Writer; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -157,6 +163,8 @@ if (mergeDocStores && fieldInfos.hasVectors()) mergeVectors(); + mergeIndexValues(); + return mergedDocs; } @@ -170,6 +178,12 @@ reader.close(); } } + + private void addIfExists(Set files, String file, Directory dir) throws IOException{ + if(dir.fileExists(file)){ + files.add(file); + } + } final List createCompoundFile(String fileName, final SegmentInfo info) throws IOException { @@ -183,13 +197,20 @@ !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) fileSet.add(IndexFileNames.segmentFileName(segment, "", ext)); } - codec.files(directory, info, fileSet); // Fieldable norm files - int numFIs = fieldInfos.size(); + final int numFIs = fieldInfos.size(); for (int i = 0; i < numFIs; i++) { - FieldInfo fi = fieldInfos.fieldInfo(i); + final FieldInfo fi = fieldInfos.fieldInfo(i); + // Index Values aka. CSF + if (fi.indexValues != null) { + addIfExists(fileSet, IndexFileNames.segmentFileName(segment, Integer + .toString(fi.number), IndexFileNames.CSF_DATA_EXTENSION), directory); + addIfExists(fileSet, IndexFileNames.segmentFileName(segment, Integer + .toString(fi.number), IndexFileNames.CSF_INDEX_EXTENSION), + directory); + } if (fi.isIndexed && !fi.omitNorms) { fileSet.add(IndexFileNames.segmentFileName(segment, "", IndexFileNames.NORMS_EXTENSION)); break; @@ -288,10 +309,18 @@ int numReaderFieldInfos = readerFieldInfos.size(); for (int j = 0; j < numReaderFieldInfos; j++) { FieldInfo fi = readerFieldInfos.fieldInfo(j); - fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, - fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, - !reader.hasNorms(fi.name), fi.storePayloads, - fi.omitTermFreqAndPositions); + FieldInfo merged = fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, + fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, + !reader.hasNorms(fi.name), fi.storePayloads, + fi.omitTermFreqAndPositions); + final Values fiIndexValues = fi.indexValues; + final Values mergedIndexValues = merged.indexValues; + if (mergedIndexValues == null) { + merged.setIndexValues(fiIndexValues); + } else if (mergedIndexValues != fiIndexValues) { + // nocommit -- what to do? 
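+            // Illustrative note (hypothetical example, not from the original
+            // patch): this guards the invariant that a field keeps a single
+            // Values type across segments. E.g., indexing the same field once
+            // as SIMPLE_FLOAT_4BYTE and later as SIMPLE_FLOAT_8BYTE would
+            // fail here once the two segments are merged:
+            //
+            //   Field f1 = new Field("price", "1.5", Store.YES, Index.NO);
+            //   doc1.add(ValuesField.set(f1, Values.SIMPLE_FLOAT_4BYTE));
+            //   ...
+            //   Field f2 = new Field("price", "2.5", Store.YES, Index.NO);
+            //   doc2.add(ValuesField.set(f2, Values.SIMPLE_FLOAT_8BYTE));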
+ throw new IllegalStateException("cannot merge field " + fi.name + " indexValues changed from " + mergedIndexValues + " to " + fiIndexValues); + } } } else { addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false); @@ -302,6 +331,8 @@ addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false); fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false); + + // nocommit -- how should we handle index values here? } } fieldInfos.write(directory, segment + ".fnm"); @@ -362,6 +393,77 @@ return docCount; } + private void mergeIndexValues() throws IOException { + final int numFields = fieldInfos.size(); + for (int i = 0; i < numFields; i++) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(i); + final Values v = fieldInfo.indexValues; + // nocommit we need some kind of compatibility notation for values such + // that two slightly different segments can be merged e.g. fixed vs. + // variable byte len or float32 vs. float64 + + if (v != null) { + int docBase = 0; + final List<Writer.MergeState> mergeStates = new ArrayList<Writer.MergeState>(); + for (IndexReader reader : readers) { + Reader r = reader.getIndexValues(fieldInfo.name); + if (r != null) { + mergeStates.add(new Writer.MergeState(r, docBase, reader + .maxDoc(), reader.getDeletedDocs())); + } + docBase += reader.numDocs(); + } + if (mergeStates.isEmpty()) { + continue; + } + final String id = segment + "_" + fieldInfo.number; + final Writer writer; + switch (v) { + case PACKED_INTS: + case PACKED_INTS_FIXED: + writer = Ints.getWriter(directory, id, true); + break; + case SIMPLE_FLOAT_4BYTE: + writer = Floats.getWriter(directory, id, 4); + break; + case SIMPLE_FLOAT_8BYTE: + writer = Floats.getWriter(directory, id, 8); + break; + case BYTES_FIXED_STRAIGHT: + writer = Bytes.getWriter(directory, id, + Bytes.Mode.STRAIGHT, null, true); + break; + case BYTES_FIXED_DEREF: + writer = Bytes.getWriter(directory, id, + Bytes.Mode.DEREF, null, true); + break; + case BYTES_FIXED_SORTED: + // nocommit -- enable setting Comparator + writer = Bytes.getWriter(directory, id, + Bytes.Mode.SORTED, null, true); + break; + case BYTES_VAR_STRAIGHT: + writer = Bytes.getWriter(directory, id, + Bytes.Mode.STRAIGHT, null, false); + break; + case BYTES_VAR_DEREF: + writer = Bytes.getWriter(directory, id, + Bytes.Mode.DEREF, null, false); + break; + case BYTES_VAR_SORTED: + // nocommit -- enable setting Comparator + writer = Bytes.getWriter(directory, id, + Bytes.Mode.SORTED, null, false); + break; + default: + continue; + } + writer.add(mergeStates); + writer.finish(mergedDocs); + } + } + } + private int copyFieldsWithDeletions(final FieldsWriter fieldsWriter, final IndexReader reader, final FieldsReader matchingFieldsReader) throws IOException, MergeAbortedException, CorruptIndexException { Index: src/test/org/apache/lucene/util/_TestUtil.java =================================================================== --- src/test/org/apache/lucene/util/_TestUtil.java (revision 1006266) +++ src/test/org/apache/lucene/util/_TestUtil.java (working copy) @@ -117,6 +117,37 @@ } return new String(buffer, 0, end); } + + public static String randomUnicodeString(Random r, int minLength, int maxLength) { + if (minLength > maxLength) + throw new IllegalArgumentException("minLength must be <= maxLength"); + final boolean lenEqual = minLength == maxLength; + final int end =
lenEqual?minLength:minLength + r.nextInt(maxLength-minLength+1); + if (end == 0) { + // allow 0 length + return ""; + } + + // TODO(simonw): check this + final int fixedPlane = 5;//minLength % 5; + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + int t = lenEqual? fixedPlane: r.nextInt(5); + //buffer[i] = (char) (97 + r.nextInt(26)); + if (0 == t && i < end - 1 && !lenEqual) { + // Make a surrogate pair + // High surrogate + buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff); + // Low surrogate + buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff); + } + else if (t <= 1) buffer[i] = (char) r.nextInt(0x80); + else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800); + else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff); + else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff); + } + return new String(buffer, 0, end); + } private static final int[] blockStarts = { 0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400, Index: src/java/org/apache/lucene/index/FilterIndexReader.java =================================================================== --- src/java/org/apache/lucene/index/FilterIndexReader.java (revision 1006266) +++ src/java/org/apache/lucene/index/FilterIndexReader.java (working copy) @@ -19,6 +19,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.index.values.ValuesEnum; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close Index: src/java/org/apache/lucene/document/Fieldable.java =================================================================== --- src/java/org/apache/lucene/document/Fieldable.java (revision 1006266) +++ src/java/org/apache/lucene/document/Fieldable.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.index.FieldInvertState; // for javadocs import org.apache.lucene.search.PhraseQuery; // for javadocs import org.apache.lucene.search.spans.SpanQuery; // for javadocs +import org.apache.lucene.util.AttributeSource; import java.io.Reader; import java.io.Serializable; @@ -209,4 +210,7 @@ * silently fail to find results. 
*/ void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions); + + boolean hasFieldAttribute(); + AttributeSource getFieldAttributes(); } Index: src/java/org/apache/lucene/index/IndexFileNames.java =================================================================== --- src/java/org/apache/lucene/index/IndexFileNames.java (revision 1006266) +++ src/java/org/apache/lucene/index/IndexFileNames.java (working copy) @@ -78,6 +78,12 @@ /** Extension of separate norms */ public static final String SEPARATE_NORMS_EXTENSION = "s"; + + /** Extension of Column-Stride Field data files */ + public static final String CSF_DATA_EXTENSION = "dat"; + + /** Extension of Column-Stride Field index files */ + public static final String CSF_INDEX_EXTENSION = "idx"; /** * This array contains all filename extensions used by @@ -98,6 +104,8 @@ GEN_EXTENSION, NORMS_EXTENSION, COMPOUND_FILE_STORE_EXTENSION, + CSF_DATA_EXTENSION, + CSF_INDEX_EXTENSION }; public static final String[] STORE_INDEX_EXTENSIONS = new String[] { Index: src/java/org/apache/lucene/util/ArrayUtil.java =================================================================== --- src/java/org/apache/lucene/util/ArrayUtil.java (revision 1006266) +++ src/java/org/apache/lucene/util/ArrayUtil.java (working copy) @@ -247,6 +247,19 @@ public static short[] grow(short[] array) { return grow(array, 1 + array.length); } + + public static double[] grow(double[] array, int minSize) { + if (array.length < minSize) { + double[] newArray = new double[oversize(minSize, RamUsageEstimator.NUM_BYTES_DOUBLE)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static double[] grow(double[] array) { + return grow(array, 1 + array.length); + } public static short[] shrink(short[] array, int targetSize) { final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT); Index: src/java/org/apache/lucene/search/SortField.java =================================================================== --- src/java/org/apache/lucene/search/SortField.java (revision 1006266) +++ src/java/org/apache/lucene/search/SortField.java (working copy) @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.Serializable; +import java.util.Comparator; import java.util.Locale; import org.apache.lucene.search.cache.ByteValuesCreator; @@ -29,6 +30,11 @@ import org.apache.lucene.search.cache.LongValuesCreator; import org.apache.lucene.search.cache.ShortValuesCreator; import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.BytesRef; + +// nocommit -- for cleaner transition, maybe we should make +// a new SortField that subclasses this one and always uses +// index values? /** * Stores information about how to sort documents by terms in an individual @@ -90,6 +96,9 @@ * uses ordinals to do the sorting. */ public static final int STRING_VAL = 11; + /** Sort using byte[] index values. */ + public static final int BYTES = 12; + /** Represents sorting by document score (relevancy).
*/ public static final SortField FIELD_SCORE = new SortField (null, SCORE); @@ -440,6 +449,26 @@ field = StringHelper.intern(field); } + private boolean useIndexValues; + + public void setUseIndexValues(boolean b) { + useIndexValues = b; + } + + public boolean getUseIndexValues() { + return useIndexValues; + } + + private Comparator bytesComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); + + public void setBytesComparator(Comparator b) { + bytesComparator = b; + } + + public Comparator getBytesComparator() { + return bytesComparator; + } + /** Returns the {@link FieldComparator} to use for * sorting. * @@ -469,10 +498,18 @@ return new FieldComparator.DocComparator(numHits); case SortField.INT: - return new FieldComparator.IntComparator(numHits, (IntValuesCreator)creator, (Integer)missingValue ); + if (useIndexValues) { + return new FieldComparator.IntIndexValuesComparator(numHits, field); + } else { + return new FieldComparator.IntComparator(numHits, (IntValuesCreator)creator, (Integer) missingValue); + } case SortField.FLOAT: - return new FieldComparator.FloatComparator(numHits, (FloatValuesCreator)creator, (Float)missingValue ); + if (useIndexValues) { + return new FieldComparator.FloatIndexValuesComparator(numHits, field); + } else { + return new FieldComparator.FloatComparator(numHits, (FloatValuesCreator) creator, (Float) missingValue); + } case SortField.LONG: return new FieldComparator.LongComparator(numHits, (LongValuesCreator)creator, (Long)missingValue ); Index: src/java/org/apache/lucene/util/ParallelArray.java =================================================================== --- src/java/org/apache/lucene/util/ParallelArray.java (revision 0) +++ src/java/org/apache/lucene/util/ParallelArray.java (revision 0) @@ -0,0 +1,57 @@ +package org.apache.lucene.util; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.util.concurrent.atomic.AtomicLong; + +/** + * + * @lucene.internal + */ +public abstract class ParallelArray<T extends ParallelArray<T>> { + + public final int size; + protected final AtomicLong bytesUsed; + + protected ParallelArray(final int size, AtomicLong bytesUsed) { + this.size = size; + this.bytesUsed = bytesUsed; + bytesUsed.addAndGet((size) * bytesPerEntry()); + + } + + protected abstract int bytesPerEntry(); + + public AtomicLong bytesUsed() { + return bytesUsed; + } + + public void deref() { + bytesUsed.addAndGet((-size) * bytesPerEntry()); + } + + public abstract T newInstance(int size); + + public final T grow() { + int newSize = ArrayUtil.oversize(size + 1, bytesPerEntry()); + T newArray = newInstance(newSize); + copyTo(newArray, size); + bytesUsed.addAndGet((newSize - size) * bytesPerEntry()); + return newArray; + } + + protected abstract void copyTo(T toArray, int numToCopy); +} Property changes on: src/java/org/apache/lucene/util/ParallelArray.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/util/IOUtils.java =================================================================== --- src/java/org/apache/lucene/util/IOUtils.java (revision 1006266) +++ src/java/org/apache/lucene/util/IOUtils.java (working copy) @@ -20,6 +20,8 @@ import java.io.Closeable; import java.io.IOException; +import org.apache.lucene.store.DataOutput; + /** @lucene.internal */ public class IOUtils { /** @@ -61,4 +63,32 @@ else if (firstIOE != null) throw firstIOE; } + + /** + * Writes the length of the {@link BytesRef} as either one or two bytes to + * the {@link DataOutput} and returns the number of bytes used. + * + * @param datOut + * the output to write to + * @param bytes + * the {@link BytesRef} whose length to write + * @return the number of bytes used to encode the length + * @throws IOException + * if datOut throws an {@link IOException} + */ + public static int writeLength(DataOutput datOut, BytesRef bytes) + throws IOException { + final int length = bytes.length; + if (length < 128) { + // 1 byte to store length + datOut.writeByte((byte) length); + return 1; + } else { + // 2 byte to store length + datOut.writeByte((byte) (0x80 | (length & 0x7f))); + datOut.writeByte((byte) ((length >> 7) & 0xff)); + return 2; + } + } } Index: src/java/org/apache/lucene/index/CompoundFileReader.java =================================================================== --- src/java/org/apache/lucene/index/CompoundFileReader.java (revision 1006266) +++ src/java/org/apache/lucene/index/CompoundFileReader.java (working copy) @@ -157,7 +157,7 @@ throw new IOException("Stream closed"); id = IndexFileNames.stripSegmentName(id); - FileEntry entry = entries.get(id); + final FileEntry entry = entries.get(id); if (entry == null) throw new IOException("No sub-file with id " + id + " found"); Index: src/java/org/apache/lucene/document/ValuesField.java =================================================================== --- src/java/org/apache/lucene/document/ValuesField.java (revision 0) +++ src/java/org/apache/lucene/document/ValuesField.java (revision 0) @@ -0,0 +1,136 @@ +package org.apache.lucene.document; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.io.Reader; +import java.util.Comparator; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.values.Values; +import org.apache.lucene.index.values.ValuesAttribute; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; + +/** + * + */ +@SuppressWarnings("serial") +public class ValuesField extends AbstractField { + private final ValuesAttribute attr; + private final AttributeSource fieldAttributes; + + + public ValuesField(String name) { + super(name, Store.NO, Index.NO, TermVector.NO); + fieldAttributes = getFieldAttributes(); + attr = fieldAttributes.addAttribute(ValuesAttribute.class); + } + + ValuesField() { + this(""); + } + + public Reader readerValue() { + return null; + } + + public String stringValue() { + return null; + } + + public TokenStream tokenStreamValue() { + return tokenStream; + } + + public void setInt(long value) { + attr.setType(Values.PACKED_INTS); + attr.ints().set(value); + } + + public void setFloat(float value) { + attr.setType(Values.SIMPLE_FLOAT_4BYTE); + attr.floats().set(value); + } + + public void setFloat(double value) { + attr.setType(Values.SIMPLE_FLOAT_8BYTE); + attr.floats().set(value); + } + + public void setBytes(BytesRef value, Values type) { + setBytes(value, type, null); + + } + + public void setBytes(BytesRef value, Values type, Comparator comp) { + attr.setType(type); + attr.bytes().copy(value); + attr.setBytesComparator(comp); + } + + public ValuesAttribute values() { + return attr; + } + + public T set(T field) { + AttributeSource src = field.getFieldAttributes(); + src.addAttribute(ValuesAttribute.class); + fieldAttributes.copyTo(field.getFieldAttributes()); + return field; + } + + public static ValuesAttribute values(Fieldable fieldable) { + return fieldable.getFieldAttributes().addAttribute(ValuesAttribute.class); + } + + public static T set(T field, Values type) { + if(field instanceof ValuesField) + return field; + final ValuesField valField = new ValuesField(); + switch (type) { + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_FIXED_STRAIGHT: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_VAR_STRAIGHT: + BytesRef ref = field.isBinary() ? 
new BytesRef(field.getBinaryValue(), + field.getBinaryOffset(), field.getBinaryLength()) : new BytesRef(field + .stringValue()); + valField.setBytes(ref, type); + break; + case PACKED_INTS: + case PACKED_INTS_FIXED: + valField.setInt(Long.parseLong(field.stringValue())); + break; + case SIMPLE_FLOAT_4BYTE: + valField.setFloat(Float.parseFloat(field.stringValue())); + break; + case SIMPLE_FLOAT_8BYTE: + valField.setFloat(Double.parseDouble(field.stringValue())); + break; + default: + throw new IllegalArgumentException("unknown type: " + type); + } + + return valField.set(field); + } +} Property changes on: src/java/org/apache/lucene/document/ValuesField.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/SegmentReader.java (revision 1006266) +++ src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -30,6 +30,7 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.BufferedIndexInput; @@ -41,6 +42,11 @@ import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.values.Bytes; +import org.apache.lucene.index.values.Ints; +import org.apache.lucene.index.values.Reader; +import org.apache.lucene.index.values.Floats; +import org.apache.lucene.index.values.Values; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close import org.apache.lucene.util.BytesRef; @@ -135,7 +141,7 @@ // Ask codec for its Fields fields = si.getCodec().fieldsProducer(new SegmentReadState(cfsDir, si, fieldInfos, readBufferSize, termsIndexDivisor)); assert fields != null; - + openIndexValuesReaders(cfsDir, si); success = true; } finally { if (!success) { @@ -150,6 +156,57 @@ this.origInstance = origInstance; } + final Map indexValues = new HashMap(); + + // Only opens files... 
doesn't actually load any values + private void openIndexValuesReaders(Directory dir, SegmentInfo si) throws IOException { + final int numFields = fieldInfos.size(); + for(int i=0;i { + final int[] address; + + AddressParallelArray(int size, AtomicLong bytesUsed) { + super(size, bytesUsed); + address = new int[size]; + } + @Override + protected int bytesPerEntry() { + return RamUsageEstimator.NUM_BYTES_INT + super.bytesPerEntry(); + } + + @Override + protected void copyTo(AddressParallelArray toArray, int numToCopy) { + super.copyTo(toArray, numToCopy); + System.arraycopy(address, 0, toArray.address, 0, size); + + } + + @Override + public AddressParallelArray newInstance(int size) { + return new AddressParallelArray(size, bytesUsed); + } + + } + + + static class Writer extends BytesWriterBase { + private int[] docToAddress; + private int address = 1; + + private final ParallelBytesStartArray array = new ParallelBytesStartArray(new AddressParallelArray(0, bytesUsed)); + private final BytesRefHash hash = new BytesRefHash(pool, 16, array) ; + + public Writer(Directory dir, String id) throws IOException { + this(dir, id, new DirectAllocator(ByteBlockPool.BYTE_BLOCK_SIZE), + new AtomicLong()); + } + public Writer(Directory dir, String id, Allocator allocator, AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false, new ByteBlockPool(allocator), bytesUsed); + docToAddress = new int[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if(bytes.length == 0) + return; // default + if(datOut == null) + initDataOut(); + final int e = hash.add(bytes); + + if (docID >= docToAddress.length) { + final int oldSize = docToAddress.length; + docToAddress = ArrayUtil.grow(docToAddress, 1+docID); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (docToAddress.length - oldSize)); + } + final int docAddress; + if (e >= 0) { + docAddress = array.array.address[e] = address; + address += IOUtils.writeLength(datOut, bytes); + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + address += bytes.length; + } else { + docAddress = array.array.address[(-e)-1]; + } + docToAddress[docID] = docAddress; + } + + public long ramBytesUsed() { + return bytesUsed.get(); + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + synchronized public void finish(int docCount) throws IOException { + if(datOut == null) + return; + initIndexOut(); + idxOut.writeInt(address-1); + + // write index + // nocommit -- allow forcing fixed array (not -1) + // TODO(simonw): check the address calculation / make it more intuitive + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, PackedInts.bitsRequired(address-1)); + final int limit; + if (docCount > docToAddress.length) { + limit = docToAddress.length; + } else { + limit = docCount; + } + for(int i=0;i utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator(); - + public static Comparator getUTF8SortedAsUnicodeComparator() { return utf8SortedAsUnicodeSortOrder; } - private static class UTF8SortedAsUnicodeComparator implements Comparator { + @SuppressWarnings("serial") // serializable to work with contrib/remote + private static final class UTF8SortedAsUnicodeComparator implements Serializable, Comparator { // Only singleton private UTF8SortedAsUnicodeComparator() {}; Index: src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java 
=================================================================== --- src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java (revision 0) @@ -0,0 +1,344 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.Bytes.BytesBaseSortedSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectAllocator; +import org.apache.lucene.util.packed.PackedInts; + +// Stores variable-length byte[] in sorted order, i.e. values are +// deduplicated and stored sorted; docs reference their shared +// value via its sort ord + +class VarSortedBytesImpl { + + static final String CODEC_NAME = "VarSortedBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int[] docToEntry; + private final Comparator<BytesRef> comp; + + private final BytesRefHash hash = new BytesRefHash(pool); + + public Writer(Directory dir, String id, Comparator<BytesRef> comp) + throws IOException { + this(dir, id, comp, new DirectAllocator(ByteBlockPool.BYTE_BLOCK_SIZE), + new AtomicLong()); + } + + public Writer(Directory dir, String id, Comparator<BytesRef> comp, + Allocator allocator, AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false, + new ByteBlockPool(allocator), bytesUsed); + this.comp = comp; + docToEntry = new int[1]; + docToEntry[0] = -1; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if (bytes.length == 0) + return; // default + if (docID >= docToEntry.length) { + int[] newArray = new int[ArrayUtil.oversize(1 + docID, + RamUsageEstimator.NUM_BYTES_INT)]; + System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length); + Arrays.fill(newArray, docToEntry.length, newArray.length, -1); + bytesUsed.addAndGet((newArray.length - docToEntry.length) + * RamUsageEstimator.NUM_BYTES_INT); + docToEntry = newArray; + }
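+      // Note (added for clarity, not in the original patch): BytesRefHash.add
+      // returns the new ord for a first-seen value and -(ord + 1) when the
+      // value is already present, so both branches below map the doc to the
+      // same shared entry.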
+ final int e = hash.add(bytes); + docToEntry[docID] = e < 0 ? (-e) - 1 : e; + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + synchronized public void finish(int docCount) throws IOException { + final int count = hash.size(); + if (count == 0) + return; + initIndexOut(); + initDataOut(); + int[] sortedEntries = hash.sort(comp); + + // first dump bytes data, recording index & offset as + // we go + long offset = 0; + long lastOffset = 0; + final int[] index = new int[count]; + final long[] offsets = new long[count]; + for (int i = 0; i < count; i++) { + final int e = sortedEntries[i]; + offsets[i] = offset; + index[e] = 1 + i; + + final BytesRef bytes = hash.get(e); + // TODO: we could prefix code... + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + lastOffset = offset; + offset += bytes.length; + } + + // total bytes of data + idxOut.writeLong(offset); + + // write index -- first doc -> 1+ord + // nocommit -- allow not -1: + final PackedInts.Writer indexWriter = PackedInts.getWriter(idxOut, + docCount, PackedInts.bitsRequired(count)); + final int limit = docCount > docToEntry.length ? docToEntry.length + : docCount; + for (int i = 0; i < limit; i++) { + final int e = docToEntry[i]; + indexWriter.add(e == -1 ? 0 : index[e]); + } + for (int i = limit; i < docCount; i++) { + indexWriter.add(0); + } + indexWriter.finish(); + + // next ord (0-based) -> offset + // nocommit -- allow not -1: + PackedInts.Writer offsetWriter = PackedInts.getWriter(idxOut, count, + PackedInts.bitsRequired(lastOffset)); + for (int i = 0; i < count; i++) { + offsetWriter.add(offsets[i]); + } + offsetWriter.finish(); + + super.finish(docCount); + bytesUsed.addAndGet((-docToEntry.length) + * RamUsageEstimator.NUM_BYTES_INT); + + } + } + + public static class Reader extends BytesReaderBase { + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + } + + @Override + public org.apache.lucene.index.values.Reader.Source load() + throws IOException { + return loadSorted(null); + } + + @Override + public SortedSource loadSorted(Comparator<BytesRef> comp) + throws IOException { + return new Source(cloneData(), cloneIndex(), comp); + } + + private static class Source extends BytesBaseSortedSource { + // TODO: paged data + private final byte[] data; + private final BytesRef bytesRef = new BytesRef(); + private final PackedInts.Reader docToOrdIndex; + private final PackedInts.Reader ordToOffsetIndex; // 0-based + private final long totBytes; + private final int valueCount; + private final LookupResult lookupResult = new LookupResult(); + private final Comparator<BytesRef> comp; + + public Source(IndexInput datIn, IndexInput idxIn, + Comparator<BytesRef> comp) throws IOException { + super(datIn, idxIn); + totBytes = idxIn.readLong(); + data = new byte[(int) totBytes]; + datIn.readBytes(data, 0, (int) totBytes); + docToOrdIndex = PackedInts.getReader(idxIn); + ordToOffsetIndex = PackedInts.getReader(idxIn); + valueCount = ordToOffsetIndex.size(); + bytesRef.bytes = data; + // default byte sort order + this.comp = comp == null ? BytesRef.getUTF8SortedAsUnicodeComparator() + : comp; + + } + + @Override + public BytesRef getByOrd(int ord) { + return ord == 0 ? defaultValue : deref(--ord); + }
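+        // Note (added for clarity, not in the original patch): stored ords
+        // are 1-based so that 0 can mean "no value"; getByOrd(0) returns the
+        // default, and getByOrd(n) derefs the (n-1)th entry in sorted order.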
+ + @Override + public int ord(int docID) { + return (int) docToOrdIndex.get(docID); + } + + @Override + public LookupResult getByValue(BytesRef bytes) { + return binarySearch(bytes, 0, valueCount - 1); + } + + public long ramBytesUsed() { + // TODO(simonw): move ram usage to PackedInts? + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + data.length + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + docToOrdIndex + .getBitsPerValue() + * docToOrdIndex.size()) + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + ordToOffsetIndex + .getBitsPerValue() + * ordToOffsetIndex.size()); + } + + @Override + public int getValueCount() { + return valueCount; + } + + // ord is 0-based + private BytesRef deref(int ord) { + bytesRef.offset = (int) ordToOffsetIndex.get(ord); + final long nextOffset; + if (ord == valueCount - 1) { + nextOffset = totBytes; + } else { + nextOffset = ordToOffsetIndex.get(1 + ord); + } + bytesRef.length = (int) (nextOffset - bytesRef.offset); + return bytesRef; + } + + // TODO: share w/ FixedSortedBytesValues? + private LookupResult binarySearch(BytesRef b, int low, int high) { + + while (low <= high) { + int mid = (low + high) >>> 1; + deref(mid); + final int cmp = comp.compare(bytesRef, b); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + lookupResult.ord = mid + 1; + lookupResult.found = true; + return lookupResult; + } + } + assert comp.compare(bytesRef, b) != 0; + lookupResult.ord = low; + lookupResult.found = false; + return lookupResult; + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new VarSortedBytesEnum(source, cloneData(), cloneIndex()); + } + + private static class VarSortedBytesEnum extends ValuesEnum { + + private PackedInts.Reader docToOrdIndex; + private PackedInts.Reader ordToOffsetIndex; + private IndexInput idxIn; + private IndexInput datIn; + private final BytesRef bytesRef; + private int valueCount; + private long totBytes; + private int docCount; + private int pos = -1; + private final long fp; + + protected VarSortedBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn) throws IOException { + super(source, Values.BYTES_VAR_SORTED); + bytesRef = attr.bytes(); + totBytes = idxIn.readLong(); + // keep that in memory to prevent lots of disk seeks + docToOrdIndex = PackedInts.getReader(idxIn); + ordToOffsetIndex = PackedInts.getReader(idxIn); + valueCount = ordToOffsetIndex.size(); + docCount = docToOrdIndex.size(); + fp = datIn.getFilePointer(); + this.idxIn = idxIn; + this.datIn = datIn; + } + + @Override + public void close() throws IOException { + idxIn.close(); + datIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= docCount) + return pos = NO_MORE_DOCS; + final int ord = (int) docToOrdIndex.get(target) - 1; + if (ord == -1) { + bytesRef.length = 0; + bytesRef.offset = 0; + return pos = target; + } + final long offset = ordToOffsetIndex.get(ord); + final long nextOffset; + if (ord == valueCount - 1) { + nextOffset = totBytes; + } else { + nextOffset = ordToOffsetIndex.get(1 + ord); + } + final int length = (int) (nextOffset - offset); + datIn.seek(fp + offset); + if (bytesRef.bytes.length < length) + bytesRef.grow(length); + datIn.readBytes(bytesRef.bytes, 0, length); + bytesRef.length = length; + bytesRef.offset = 0; + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws
IOException { + return advance(pos + 1); + } + } + } +} Property changes on: src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfo.java (revision 1006266) +++ src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -23,6 +23,7 @@ import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter; + import java.io.IOException; import java.util.Arrays; import java.util.List; @@ -31,6 +32,7 @@ import java.util.HashSet; import java.util.HashMap; import java.util.ArrayList; +import java.util.regex.Pattern; /** * Information about a segment such as it's name, directory, and files related @@ -476,7 +478,12 @@ if (delFileName != null && (delGen >= YES || dir.fileExists(delFileName))) { fileSet.add(delFileName); } - + //nocommit - is there a better way to get all the dat / idx files? + for(String file : dir.listAll()) { + if(file.startsWith(name) && (file.endsWith("dat") || file.endsWith("idx"))){ + fileSet.add(file); + } + } if (normGen != null) { for (int i = 0; i < normGen.length; i++) { long gen = normGen[i]; Index: src/java/org/apache/lucene/util/ByteBlockPool.java =================================================================== --- src/java/org/apache/lucene/util/ByteBlockPool.java (revision 1006266) +++ src/java/org/apache/lucene/util/ByteBlockPool.java (working copy) @@ -62,6 +62,18 @@ return new byte[blockSize]; } } + + public static class DirectAllocator extends Allocator { + + public DirectAllocator(int blockSize) { + super(blockSize); + } + + @Override + public void recycleByteBlocks(byte[][] blocks, int start, int end) { + } + + } public byte[][] buffers = new byte[10][]; Index: src/java/org/apache/lucene/index/values/Writer.java =================================================================== --- src/java/org/apache/lucene/index/values/Writer.java (revision 0) +++ src/java/org/apache/lucene/index/values/Writer.java (revision 0) @@ -0,0 +1,92 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +public abstract class Writer { + + /** Records the specified value for the docID */ + public void add(int docID, long value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** Records the specified value for the docID */ + public void add(int docID, double value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** Records the specified value for the docID */ + public void add(int docID, BytesRef value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** Records the specified value for the docID */ + protected abstract void add(int docID) throws IOException; + + protected abstract void setNextAttribute(ValuesAttribute attr); + + /** Finish writing, close any files */ + public abstract void finish(int docCount) throws IOException; + + public static class MergeState { + public final Reader reader; + public final int docBase; + public final int docCount; + public final Bits bits; + + public MergeState(Reader reader, int docBase, int docCount, Bits bits) { + assert reader != null; + this.reader = reader; + this.docBase = docBase; + this.docCount = docCount; + this.bits = bits; + } + } + + public void add(List<MergeState> states) throws IOException { + for (MergeState state : states) { + merge(state); + } + } + + // enables bulk copies in subclasses per MergeState + protected void merge(MergeState state) throws IOException { + final ValuesEnum valEnum = state.reader.getEnum(); + assert valEnum != null; + try { + final ValuesAttribute attr = valEnum.addAttribute(ValuesAttribute.class); + setNextAttribute(attr); + int docID = state.docBase; + final Bits bits = state.bits; + final int docCount = state.docCount; + for (int i = 0; i < docCount; i++) { + if (bits == null || !bits.get(i)) { + if (valEnum.advance(i) == ValuesEnum.NO_MORE_DOCS) + break; + add(docID++); + } + } + } finally { + valEnum.close(); + } + } +} Property changes on: src/java/org/apache/lucene/index/values/Writer.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/ValuesAttribute.java =================================================================== --- src/java/org/apache/lucene/index/values/ValuesAttribute.java (revision 0) +++ src/java/org/apache/lucene/index/values/ValuesAttribute.java (revision 0) @@ -0,0 +1,34 @@ +package org.apache.lucene.index.values; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +import java.util.Comparator; + +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.LongsRef; + +public interface ValuesAttribute extends Attribute { + public Values type(); + public BytesRef bytes(); + public FloatsRef floats(); + public LongsRef ints(); + public void setType(Values type); + public Comparator<BytesRef> bytesComparator(); + public void setBytesComparator(Comparator<BytesRef> comp); + +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/values/ValuesAttribute.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/DocFieldProcessor.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldProcessor.java (revision 1006266) +++ src/java/org/apache/lucene/index/DocFieldProcessor.java (working copy) @@ -17,8 +17,19 @@ * limitations under the License. */ +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.values.Ints; +import org.apache.lucene.index.values.Floats; +import org.apache.lucene.index.values.Bytes; +import org.apache.lucene.index.values.ValuesAttribute; +import org.apache.lucene.index.values.Writer; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.LongsRef; + import java.io.IOException; import java.util.Collection; +import java.util.Comparator; import java.util.Map; import java.util.HashMap; @@ -37,6 +48,153 @@ final FieldInfos fieldInfos = new FieldInfos(); final DocFieldConsumer consumer; final StoredFieldsWriter fieldsWriter; + final private Map<String, IndexValuesProcessor> indexValues = new HashMap<String, IndexValuesProcessor>(); + + synchronized IndexValuesProcessor getProcessor(Directory dir, String segment, String name, ValuesAttribute attr, FieldInfo fieldInfo) + throws IOException { + if(attr == null) + return null; + IndexValuesProcessor p = indexValues.get(name); + if (p == null) { + org.apache.lucene.index.values.Values v = attr.type(); + final String id = segment + "_" + fieldInfo.number; + switch(v) { + case PACKED_INTS: + p = new IntValuesProcessor(dir, id, false); + break; + case PACKED_INTS_FIXED: + p = new IntValuesProcessor(dir, id, true); + break; + case SIMPLE_FLOAT_4BYTE: + p = new FloatValuesProcessor(dir, id, 4); + break; + case SIMPLE_FLOAT_8BYTE: + p = new FloatValuesProcessor(dir, id, 8); + break; + case BYTES_FIXED_STRAIGHT: + p = new BytesValuesProcessor(dir, id, true, null, Bytes.Mode.STRAIGHT); + break; + case BYTES_FIXED_DEREF: + p = new BytesValuesProcessor(dir, id, true, null, Bytes.Mode.DEREF); + break; + case BYTES_FIXED_SORTED: + p = new BytesValuesProcessor(dir, id, true, attr.bytesComparator(), Bytes.Mode.SORTED); + break; + case BYTES_VAR_STRAIGHT: + p = new BytesValuesProcessor(dir, id, false, null, Bytes.Mode.STRAIGHT); + break; + case BYTES_VAR_DEREF: + p = new BytesValuesProcessor(dir, id, false, null, Bytes.Mode.DEREF); + break; + case BYTES_VAR_SORTED: + p = new BytesValuesProcessor(dir, id, false, attr.bytesComparator(), Bytes.Mode.SORTED); + break; + } + fieldInfo.setIndexValues(v); + indexValues.put(name, p); + } + + return p; + } + + static abstract class IndexValuesProcessor { + public abstract void add(int docID, String name, ValuesAttribute attr) throws IOException; + public abstract void finish(int docCount) throws IOException; + public abstract void files(Collection<String>
files) throws IOException; + } + + static class FloatValuesProcessor extends IndexValuesProcessor { + private final Writer writer; + private final String id; + + public FloatValuesProcessor(Directory dir, String id, int precision) throws IOException { + this.id = id; + writer = Floats.getWriter(dir, id, precision); + } + + @Override + public void add(int docID, String name, ValuesAttribute attr) throws IOException { + final FloatsRef floats = attr.floats(); + if(floats != null) { + writer.add(docID, floats.get()); + return; + } + throw new IllegalArgumentException("could not extract float/double from field " + name); + } + + @Override + public void finish(int docCount) throws IOException { + writer.finish(docCount); + } + + @Override + public void files(Collection<String> files) { + Floats.files(id, files); + } + } + + static class IntValuesProcessor extends IndexValuesProcessor { + private final Writer writer; + private final String id; + + public IntValuesProcessor(Directory dir, String id, boolean fixedArray) throws IOException { + this.id = id; + writer = Ints.getWriter(dir, id, fixedArray); + } + + @Override + public void add(int docID, String name, ValuesAttribute attr) throws IOException { + final LongsRef ints = attr.ints(); + if(ints != null) { + writer.add(docID, ints.get()); + return; + } + throw new IllegalArgumentException("could not extract int/long from field " + name); + } + + @Override + public void finish(int docCount) throws IOException { + writer.finish(docCount); + } + + @Override + public void files(Collection<String> files) throws IOException { + Ints.files(id, files); + } + } + + static class BytesValuesProcessor extends IndexValuesProcessor { + private final Writer writer; + private final String id; + private final Directory dir; + + public BytesValuesProcessor(Directory dir, String id, boolean fixedSize, Comparator<BytesRef> comp, Bytes.Mode mode) throws IOException { + this.id = id; + writer = Bytes.getWriter(dir, id, mode, comp, fixedSize); + this.dir = dir; + } + + // nocommit -- make this thread private and not sync'd + @Override + public synchronized void add(int docID, String name, ValuesAttribute attr) throws IOException { + final BytesRef bytes = attr.bytes(); + if(bytes != null) { + writer.add(docID, bytes); + return; + } + throw new IllegalArgumentException("could not extract byte[] from field " + name); + } + + @Override + public void finish(int docCount) throws IOException { + writer.finish(docCount); + } + + @Override + public void files(Collection<String> files) throws IOException { + Bytes.files(dir, id, files); + } + } public DocFieldProcessor(DocumentsWriter docWriter, DocFieldConsumer consumer) { this.docWriter = docWriter; @@ -63,6 +221,14 @@ fieldsWriter.flush(state); consumer.flush(childThreadsAndFields, state); + for(IndexValuesProcessor p : indexValues.values()) { + if (p != null) { + p.finish(state.numDocs); + p.files(state.flushedFiles); + } + } + indexValues.clear(); + // Important to save after asking consumer to flush so // consumer can alter the FieldInfo* if necessary. EG, // FreqProxTermsWriter does this with Index: src/java/org/apache/lucene/index/values/Reader.java =================================================================== --- src/java/org/apache/lucene/index/values/Reader.java (revision 0) +++ src/java/org/apache/lucene/index/values/Reader.java (revision 0) @@ -0,0 +1,109 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.Closeable; +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; + +public abstract class Reader implements Closeable { + + + public ValuesEnum getEnum() throws IOException { + return getEnum(null); + } + + public abstract ValuesEnum getEnum(AttributeSource attrSource) throws IOException; + + public abstract Source load() throws IOException; + + public SortedSource loadSorted(Comparator<BytesRef> comparator) throws IOException { + throw new UnsupportedOperationException(); + } + + + /** + * Source of integer values (returned as a Java long), per document. The underlying + * implementation may use different numbers of bits per value; long is only + * used since it can handle all precisions. + */ + public static abstract class Source { + + public long ints(int docID) { + throw new UnsupportedOperationException("ints are not supported"); + } + + public double floats(int docID) { + throw new UnsupportedOperationException("floats are not supported"); + } + + public BytesRef bytes(int docID) { + throw new UnsupportedOperationException("bytes are not supported"); + } + + /** Returns number of unique values. Some impls may + * throw UnsupportedOperationException. */ + public int getValueCount() { + throw new UnsupportedOperationException(); + } + + public ValuesEnum getEnum() throws IOException { + return getEnum(null); + } + + // nocommit - enable obtaining enum from source since this is already in memory + public /*abstract*/ ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + throw new UnsupportedOperationException(); + } + + public abstract long ramBytesUsed(); + } + + public static abstract class SortedSource extends Source { + + @Override + public BytesRef bytes(int docID) { + return getByOrd(ord(docID)); + } + + /** + * Returns the ord for the specified docID. Ord 0 is reserved for docIDs that + * had no value added to the Writer; ords for real values are dense, starting + * at 1 for the smallest value (as defined by the {@link Comparator}) and + * incrementing by 1 for the next. + */ + public abstract int ord(int docID); + + /** Returns value for specified ord. */ + public abstract BytesRef getByOrd(int ord); + + public static class LookupResult { + public boolean found; + public int ord; + } + + /** + * Finds the largest ord whose value is <= the requested value. If + * {@link LookupResult#found} is true, then ord is an exact match. The + * returned {@link LookupResult} may be reused across calls.
+ */ + public abstract LookupResult getByValue(BytesRef value); + } + +} Property changes on: src/java/org/apache/lucene/index/values/Reader.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java (revision 0) @@ -0,0 +1,221 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; + +// Simplest storage: stores fixed length byte[] per +// document, with no dedup and no sorting. + +class FixedStraightBytesImpl { + + static final String CODEC_NAME = "FixedStraightBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + // start at -1 if the first added value is > 0 + private int lastDocID = -1; + private byte[] oneRecord; + + protected Writer(Directory dir, String id) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false, null, null); + } + + // nocommit - impl bulk copy here! 
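+  // A minimal usage sketch for this fixed-size variant (illustrative only;
+  // "dir", "f" and "maxDoc" are assumed test fixtures, not part of the patch).
+  // Every record has the same length, so a doc's value lives at a computed
+  // offset (fp + docID * size) and no index file is needed:
+  //
+  //   Writer w = Bytes.getWriter(dir, "f", Bytes.Mode.STRAIGHT, null, true);
+  //   w.add(0, new BytesRef(new byte[] { 1, 2, 3, 4 })); // first add fixes size=4
+  //   w.finish(maxDoc);                                  // pads skipped docs with zeros
+  //   Reader r = Bytes.getReader(dir, "f", Bytes.Mode.STRAIGHT, true, maxDoc);
+  //   BytesRef v = r.load().bytes(0);                    // or iterate via r.getEnum()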
+ + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if (size == -1) { + size = bytes.length; + initDataOut(); + datOut.writeInt(size); + oneRecord = new byte[size]; + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + " but got " + bytes.length); + } + fill(docID); + assert bytes.bytes.length >= bytes.length; + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } + + /* (non-Javadoc) + * @see org.apache.lucene.index.values.Writer#merge(org.apache.lucene.index.values.Writer.MergeState) + */ + @Override + protected void merge(MergeState state) throws IOException { + if(state.bits == null && state.reader instanceof Reader){ + Reader reader = (Reader) state.reader; + final int maxDocs = reader.maxDoc; + if(maxDocs == 0) + return; + if(size == -1) { + size = reader.size; + initDataOut(); + datOut.writeInt(size); + oneRecord = new byte[size]; + } + fill(state.docBase); + // nocommit should we add a transfer to API to each reader? + datOut.copyBytes(reader.cloneData(), size * maxDocs); + lastDocID += maxDocs-1; + } else + super.merge(state); + } + + // Fills up to but not including this docID + private void fill(int docID) throws IOException { + assert size >= 0; + for(int i=lastDocID+1;i= maxDoc){ + ref.length = 0; + ref.offset = 0; + return pos = NO_MORE_DOCS; + } + if((target-1) != pos) // pos inc == 1 + datIn.seek(fp + target * size); + datIn.readBytes(ref.bytes, 0, size); + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } + } +} Property changes on: src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/search/FieldComparator.java =================================================================== --- src/java/org/apache/lucene/search/FieldComparator.java (revision 1006266) +++ src/java/org/apache/lucene/search/FieldComparator.java (working copy) @@ -22,8 +22,9 @@ import java.util.Locale; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.FieldCache.DocTermsIndex; +import org.apache.lucene.index.values.Reader.Source; import org.apache.lucene.search.FieldCache.DocTerms; +import org.apache.lucene.search.FieldCache.DocTermsIndex; import org.apache.lucene.search.cache.ByteValuesCreator; import org.apache.lucene.search.cache.CachedArray; import org.apache.lucene.search.cache.CachedArrayCreator; @@ -39,9 +40,9 @@ import org.apache.lucene.search.cache.CachedArray.LongValues; import org.apache.lucene.search.cache.CachedArray.ShortValues; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.packed.Direct8; import org.apache.lucene.util.packed.Direct16; import org.apache.lucene.util.packed.Direct32; +import org.apache.lucene.util.packed.Direct8; import org.apache.lucene.util.packed.PackedInts; /** @@ -159,7 +160,6 @@ * comparators can just return "this" to reuse the same * comparator across segments * @throws IOException - * @throws IOException */ public abstract FieldComparator setNextReader(IndexReader reader, int docBase) throws IOException; @@ -309,6 +309,65 @@ } } + /** Uses float index values to sort by ascending value */ + public static final class FloatIndexValuesComparator extends FieldComparator { + private final double[] 
values; + private Source currentReaderValues; + private final String field; + private double bottom; + + FloatIndexValuesComparator(int numHits, String field) { + values = new double[numHits]; + this.field = field; + } + + @Override + public int compare(int slot1, int slot2) { + final double v1 = values[slot1]; + final double v2 = values[slot2]; + if (v1 > v2) { + return 1; + } else if (v1 < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public int compareBottom(int doc) { + final double v2 = currentReaderValues.floats(doc); + if (bottom > v2) { + return 1; + } else if (bottom < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public void copy(int slot, int doc) { + values[slot] = currentReaderValues.floats(doc); + } + + @Override + public FieldComparator setNextReader(IndexReader reader, int docBase) throws IOException { + currentReaderValues = reader.getIndexValuesCache().getFloats(field); + return this; + } + + @Override + public void setBottom(final int bottom) { + this.bottom = values[bottom]; + } + + @Override + public Comparable value(int slot) { + return Double.valueOf(values[slot]); + } + } + /** Parses field's values as float (using {@link * FieldCache#getFloats} and sorts by ascending value */ public static final class FloatComparator extends NumericComparator { @@ -448,6 +507,69 @@ } } + /** Loads int index values and sorts by ascending value. */ + public static final class IntIndexValuesComparator extends FieldComparator { + private final long[] values; + private Source currentReaderValues; + private final String field; + private long bottom; + + IntIndexValuesComparator(int numHits, String field) { + values = new long[numHits]; + this.field = field; + } + + @Override + public int compare(int slot1, int slot2) { + // TODO: there are sneaky non-branch ways to compute + // -1/+1/0 sign + final long v1 = values[slot1]; + final long v2 = values[slot2]; + if (v1 > v2) { + return 1; + } else if (v1 < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public int compareBottom(int doc) { + // TODO: there are sneaky non-branch ways to compute + // -1/+1/0 sign + final long v2 = currentReaderValues.ints(doc); + if (bottom > v2) { + return 1; + } else if (bottom < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public void copy(int slot, int doc) { + values[slot] = currentReaderValues.ints(doc); + } + + @Override + public FieldComparator setNextReader(IndexReader reader, int docBase) throws IOException { + currentReaderValues = reader.getIndexValuesCache().getInts(field); + return this; + } + + @Override + public void setBottom(final int bottom) { + this.bottom = values[bottom]; + } + + @Override + public Comparable value(int slot) { + return Long.valueOf(values[slot]); + } + } + /** Parses field's values as long (using {@link * FieldCache#getLongs} and sorts by ascending value */ public static final class LongComparator extends NumericComparator { Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java (revision 1006266) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java (working copy) @@ -75,8 +75,7 @@ } else { throw new RuntimeException("You must specify the sort type ie page:int,subject:string"); } - int type = getType(typeString); - sortField0 = new 
SortField(fieldName, type); + sortField0 = getSortField(fieldName, typeString); } sortFields[upto++] = sortField0; } @@ -86,12 +85,26 @@ System.arraycopy(sortFields, 0, newSortFields, 0, upto); sortFields = newSortFields; } + this.sort = new Sort(sortFields); } - private int getType(String typeString) { - int type; - if (typeString.equals("float")) { + private SortField getSortField(String fieldName, String typeString) { + boolean useIndexValues = false; + int type = -1; + if (typeString.equals("intvalues")) { + useIndexValues = true; + type = SortField.INT; + } else if (typeString.equals("floatvalues")) { + useIndexValues = true; + type = SortField.FLOAT; + } else if (typeString.equals("stringvalues")) { + useIndexValues = true; + type = SortField.STRING; + } else if (typeString.equals("bytesvalues")) { + useIndexValues = true; + type = SortField.BYTES; + } else if (typeString.equals("float")) { type = SortField.FLOAT; } else if (typeString.equals("double")) { type = SortField.DOUBLE; @@ -110,7 +123,10 @@ } else { throw new RuntimeException("Unrecognized sort field type " + typeString); } - return type; + + SortField f = new SortField(fieldName, type); + f.setUseIndexValues(useIndexValues); + return f; } @Override Index: src/test/org/apache/lucene/index/TestIndexWriterConfig.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriterConfig.java (revision 1006266) +++ src/test/org/apache/lucene/index/TestIndexWriterConfig.java (working copy) @@ -47,7 +47,7 @@ // Does not implement anything - used only for type checking on IndexWriterConfig. @Override - DocConsumer getChain(DocumentsWriter documentsWriter) { + public DocConsumer getChain(DocumentsWriter documentsWriter) { return null; } Index: src/java/org/apache/lucene/index/FieldsEnum.java =================================================================== --- src/java/org/apache/lucene/index/FieldsEnum.java (revision 1006266) +++ src/java/org/apache/lucene/index/FieldsEnum.java (working copy) @@ -19,6 +19,7 @@ import java.io.IOException; +import org.apache.lucene.index.values.ValuesEnum; import org.apache.lucene.util.AttributeSource; /** Enumerates indexed fields. You must first call {@link @@ -55,7 +56,7 @@ * null this method should not be called. This method * will not return null. */ public abstract TermsEnum terms() throws IOException; - + public final static FieldsEnum[] EMPTY_ARRAY = new FieldsEnum[0]; /** Provides zero fields */ Index: src/java/org/apache/lucene/index/FieldInfo.java =================================================================== --- src/java/org/apache/lucene/index/FieldInfo.java (revision 1006266) +++ src/java/org/apache/lucene/index/FieldInfo.java (working copy) @@ -1,5 +1,7 @@ package org.apache.lucene.index; +import org.apache.lucene.index.values.Values; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -22,6 +24,8 @@ public String name; public boolean isIndexed; public int number; + Values indexValues; + // true if term vector for this field should be stored boolean storeTermVector; @@ -88,4 +92,18 @@ } } } + + void setIndexValues(Values v) { + if (indexValues != null) { + if (indexValues != v) { + throw new IllegalArgumentException("indexValues is already set to " + indexValues + "; cannot change to " + v); + } + } else { + indexValues = v; + } + } + + Values getIndexValues() { + return indexValues; + } } Index: src/java/org/apache/lucene/index/values/ValuesEnum.java =================================================================== --- src/java/org/apache/lucene/index/values/ValuesEnum.java (revision 0) +++ src/java/org/apache/lucene/index/values/ValuesEnum.java (revision 0) @@ -0,0 +1,62 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; + +public abstract class ValuesEnum extends DocIdSetIterator { + private AttributeSource source; + protected final ValuesAttribute attr; + + + protected ValuesEnum(Values enumType) { + this(null, enumType); + } + + protected ValuesEnum(AttributeSource source, Values enumType) { + this.source = source; + boolean setType = !hasAttribute(ValuesAttribute.class); + attr = addAttribute(ValuesAttribute.class); + if (setType) + attr.setType(enumType); + } + + public AttributeSource attributes() { + if (source == null) + source = new AttributeSource(); + return source; + } + + public <T extends Attribute> T addAttribute(Class<T> attr) { + return attributes().addAttribute(attr); + } + + public <T extends Attribute> T getAttribute(Class<T> attr) { + return attributes().getAttribute(attr); + } + + public boolean hasAttribute(Class<? extends Attribute> attr) { + return attributes().hasAttribute(attr); + } + + public abstract void close() throws IOException; + +} Property changes on: src/java/org/apache/lucene/index/values/ValuesEnum.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/DirectoryReader.java =================================================================== --- src/java/org/apache/lucene/index/DirectoryReader.java (revision 1006266) +++ src/java/org/apache/lucene/index/DirectoryReader.java (working copy) @@ -36,7 +36,14 @@ import org.apache.lucene.store.Lock; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.values.Reader; +import org.apache.lucene.index.values.Values; +import
org.apache.lucene.index.values.ValuesEnum; +import org.apache.lucene.index.values.Reader.Source; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.LongsRef; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.BytesRef; @@ -990,7 +997,264 @@ return commits; } + + public Reader getIndexValues(String field) { + ensureOpen(); + if (subReaders.length == 1) { + return subReaders[0].getIndexValues(field); + } + return new MultiValueReader(field); + } + + private class MultiValueReader extends Reader { + + private String id; + private Values value; + + public MultiValueReader(String id) { + this.id = id; + for (SegmentReader reader : subReaders) { + FieldInfo fieldInfo = reader.fieldInfos().fieldInfo(id); + if(fieldInfo != null){ + value = fieldInfo.getIndexValues(); + break; + } + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new MultiValuesEnum(id, value); + } + + @Override + public Source load() throws IOException { + return new MultiSource(id); + } + + public void close() throws IOException { + // + } + + } + + private class MultiValuesEnum extends ValuesEnum { + private int numDocs_ = 0; + private int pos = -1; + private int start = 0; + private final String id; + private final ValuesEnum[] enumCache; + private ValuesEnum current; + + protected MultiValuesEnum(String id, Values enumType) { + super(enumType); + enumCache = new ValuesEnum[subReaders.length]; + this.id = id; + } + + @Override + public void close() throws IOException { + for (ValuesEnum valuesEnum : enumCache) { + if(valuesEnum != null) + valuesEnum.close(); + } + } + + @Override + public int advance( int target) throws IOException { + int n = target - start; + do { + if(target >= maxDoc) + return pos = NO_MORE_DOCS; + if (n >= numDocs_) { + int idx = readerIndex(target); + if (enumCache[idx] == null) { + try { + Reader indexValues = subReaders[idx].getIndexValues(id); + if (indexValues != null) // nocommit does that work with default + // values? + enumCache[idx] = indexValues.getEnum(this.attributes()); + else + enumCache[idx] = new DummyEnum(this.attributes(), + subReaders[idx].maxDoc(), attr.type()); + } catch (IOException ex) { + // nocommit what to do here? + throw new RuntimeException(ex); + } + } + current = enumCache[idx]; + start = starts[idx]; + numDocs_ = subReaders[idx].maxDoc(); + n = target - start; + } + target = start+numDocs_; + } while ((n = current.advance(n)) == NO_MORE_DOCS); + return pos = start+current.docID(); + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } + + private class MultiSource extends Source { + private int numDocs_ = 0; + private int start = 0; + private Source current; + private final String id; + + MultiSource(String id) { + this.id = id; + } + + public long ints(int docID) { + int n = docID - start; + if(n >= numDocs_) { + int idx = readerIndex(docID); + try{ + current = subReaders[idx].getIndexValuesCache().getInts(id); + if(current == null) //nocommit does that work with default values? + current = new DummySource(); + }catch(IOException ex) { + // nocommit what to do here? 
+ throw new RuntimeException(ex); + } + start = starts[idx]; + numDocs_ = subReaders[idx].maxDoc(); + n = docID - start; + } + return current.ints(n); + } + + public double floats(int docID) { + int n = docID - start; + if(n >= numDocs_) { + int idx = readerIndex(docID); + try{ + current = subReaders[idx].getIndexValuesCache().getFloats(id); + if(current == null) //nocommit does that work with default values? + current = new DummySource(); + }catch(IOException ex) { + // nocommit what to do here? + throw new RuntimeException(ex); + } + numDocs_ = subReaders[idx].maxDoc(); + + start = starts[idx]; + n = docID - start; + } + return current.floats(n); + } + + public BytesRef bytes(int docID) { + int n = docID - start; + if(n >= numDocs_) { + int idx = readerIndex(docID); + try{ + current = subReaders[idx].getIndexValuesCache().getBytes(id); + if(current == null) //nocommit does that work with default values? + current = new DummySource(); + }catch(IOException ex) { + // nocommit what to do here? + throw new RuntimeException(ex); + } + numDocs_ = subReaders[idx].maxDoc(); + start = starts[idx]; + n = docID - start; + } + return current.bytes(n); + } + + public long ramBytesUsed() { + return current.ramBytesUsed(); + } + + } + + private static class DummySource extends Source { + private final BytesRef ref = new BytesRef(); + @Override + public BytesRef bytes(int docID) { + return ref; + } + + + @Override + public double floats(int docID) { + return 0.0d; + } + + @Override + public long ints(int docID) { + return 0; + } + + public long ramBytesUsed() { + return 0; + } + } + + private static class DummyEnum extends ValuesEnum { + private int pos = -1; + private final int maxDoc; + + public DummyEnum(AttributeSource source, int maxDoc, Values type) { + super(source, type); + this.maxDoc = maxDoc; + switch (type) { + case BYTES_VAR_STRAIGHT: + case BYTES_FIXED_STRAIGHT: + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + // nocommit - this is not correct for Fixed_straight + BytesRef bytes = attr.bytes(); + bytes.length = 0; + bytes.offset = 0; + break; + case PACKED_INTS: + case PACKED_INTS_FIXED: + LongsRef ints = attr.ints(); + ints.set(0); + break; + + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: + FloatsRef floats = attr.floats(); + floats.set(0d); + break; + default: + throw new IllegalArgumentException("unknown Values type: " + type); + } + } + @Override + public void close() throws IOException { + } + + @Override + public int advance(int target) throws IOException { + return pos = (pos < maxDoc ? 
target: NO_MORE_DOCS); + } + @Override + public int docID() { + return pos; + } + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + + } + + private static final class ReaderCommit extends IndexCommit { private String segmentsFileName; Collection files; Index: src/java/org/apache/lucene/util/LongsRef.java =================================================================== --- src/java/org/apache/lucene/util/LongsRef.java (revision 0) +++ src/java/org/apache/lucene/util/LongsRef.java (revision 0) @@ -0,0 +1,91 @@ +/** + * + */ +package org.apache.lucene.util; + + +public final class LongsRef implements Cloneable { + public long[] ints; + public int offset; + public int length; + + public LongsRef() { + } + + public LongsRef(int capacity) { + ints = new long[capacity]; + } + + public LongsRef(long[] ints, int offset, int length) { + this.ints = ints; + this.offset = offset; + this.length = length; + } + + public LongsRef(LongsRef other) { + copy(other); + } + + @Override + public Object clone() { + return new LongsRef(this); + } + + public void set(long value) { + ints[offset] = value; + } + + public long get() { + return ints[offset]; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + final int end = offset + length; + for(int i = offset; i < end; i++) { + long value = ints[i]; + result = prime * result + (int) (value ^ (value >>> 32)); + } + return result; + } + + @Override + public boolean equals(Object other) { + return this.intsEquals((LongsRef) other); + } + + public boolean intsEquals(LongsRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final long[] otherInts = other.ints; + final int end = offset + length; + for(int upto=offset;upto + * NOTE: The total amount of byte[] data stored (across a single segment) cannot + * exceed 2GB. + *
* </p> + * <p> + * NOTE: Each byte[] must be <= 32768 bytes in length + * </p>
+ */ +//nocommit - add mmap version +//nocommit - add bulk copy where possible +public final class Bytes { + + // don't instantiate! + private Bytes() { + } + + public static enum Mode { + STRAIGHT, DEREF, SORTED + }; + + public static void files(Directory dir, String id, Collection<String> files) + throws IOException { + files.add(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + final String idxFile = IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_INDEX_EXTENSION); + if (dir.fileExists(idxFile)) { + files.add(idxFile); + } + } + + // nocommit -- i shouldn't have to specify fixed? can + // track itself & do the right thing at write time? + public static Writer getWriter(Directory dir, String id, Mode mode, + Comparator<BytesRef> comp, boolean fixedSize) throws IOException { + + if (comp == null) { + comp = BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + if (fixedSize) { + if (mode == Mode.STRAIGHT) { + return new FixedStraightBytesImpl.Writer(dir, id); + } else if (mode == Mode.DEREF) { + return new FixedDerefBytesImpl.Writer(dir, id); + } else if (mode == Mode.SORTED) { + return new FixedSortedBytesImpl.Writer(dir, id, comp); + } + } else { + if (mode == Mode.STRAIGHT) { + return new VarStraightBytesImpl.Writer(dir, id); + } else if (mode == Mode.DEREF) { + return new VarDerefBytesImpl.Writer(dir, id); + } else if (mode == Mode.SORTED) { + return new VarSortedBytesImpl.Writer(dir, id, comp); + } + } + + throw new IllegalArgumentException("unknown mode: " + mode); + } + + // nocommit -- I can peek @ header to determine fixed/mode? + public static Reader getReader(Directory dir, String id, Mode mode, + boolean fixedSize, int maxDoc) throws IOException { + if (fixedSize) { + if (mode == Mode.STRAIGHT) { + return new FixedStraightBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.DEREF) { + return new FixedDerefBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.SORTED) { + return new FixedSortedBytesImpl.Reader(dir, id, maxDoc); + } + } else { + if (mode == Mode.STRAIGHT) { + return new VarStraightBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.DEREF) { + return new VarDerefBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.SORTED) { + return new VarSortedBytesImpl.Reader(dir, id, maxDoc); + } + } + + throw new IllegalArgumentException("unknown mode: " + mode); + } + + static abstract class BytesBaseSource extends Source { + protected final IndexInput datIn; + protected final IndexInput idxIn; + protected final BytesRef defaultValue = new BytesRef(); + + protected BytesBaseSource(IndexInput datIn, IndexInput idxIn) { + this.datIn = datIn; + this.idxIn = idxIn; + } + + public void close() throws IOException { + if (datIn != null) + datIn.close(); + if (idxIn != null) // if straight + idxIn.close(); + + } + } + + static abstract class BytesBaseSortedSource extends SortedSource { + protected final IndexInput datIn; + protected final IndexInput idxIn; + protected final BytesRef defaultValue = new BytesRef(); + + protected BytesBaseSortedSource(IndexInput datIn, IndexInput idxIn) { + this.datIn = datIn; + this.idxIn = idxIn; + } + + public void close() throws IOException { + if (datIn != null) + datIn.close(); + if (idxIn != null) // if straight + idxIn.close(); + + } + } + + static abstract class BytesWriterBase extends Writer { + + private final Directory dir; + private final String id; + protected IndexOutput idxOut; + protected IndexOutput datOut; + protected
BytesRef bytesRef; + private String codecName; + private int version; + protected final ByteBlockPool pool; + protected final AtomicLong bytesUsed; + + protected BytesWriterBase(Directory dir, String id, String codecName, + int version, boolean initIndex, boolean initData, ByteBlockPool pool, AtomicLong bytesUsed) throws IOException { + this.dir = dir; + this.id = id; + this.codecName = codecName; + this.version = version; + this.pool = pool; + this.bytesUsed = bytesUsed; + if (initData) + initDataOut(); + if (initIndex) + initIndexOut(); + } + + protected void initDataOut() throws IOException { + datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.writeHeader(datOut, codecName, version); + } + + protected void initIndexOut() throws IOException { + idxOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_INDEX_EXTENSION)); + CodecUtil.writeHeader(idxOut, codecName, version); + } + + public long ramBytesUsed() { + return bytesUsed.get(); + } + + /** + * Must be called only with increasing docIDs. It's OK for some docIDs to be + * skipped; they will be filled with 0 bytes. + */ + @Override + public abstract void add(int docID, BytesRef bytes) throws IOException; + + @Override + public synchronized void finish(int docCount) throws IOException { + if (datOut != null) + datOut.close(); + if (idxOut != null) + idxOut.close(); + if(pool != null) + pool.reset(); + } + + @Override + protected void add(int docID) throws IOException { + add(docID, bytesRef); + } + + @Override + protected void setNextAttribute(ValuesAttribute attr) { + bytesRef = attr.bytes(); + assert bytesRef != null; + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. + */ + static abstract class BytesReaderBase extends Reader { + protected final IndexInput idxIn; + protected final IndexInput datIn; + protected final int version; + protected final String id; + + protected BytesReaderBase(Directory dir, String id, String codecName, + int maxVersion, boolean doIndex) throws IOException { + this.id = id; + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + version = CodecUtil.checkHeader(datIn, codecName, maxVersion, maxVersion); + + if (doIndex) { + idxIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_INDEX_EXTENSION)); + final int version2 = CodecUtil.checkHeader(idxIn, codecName, + maxVersion, maxVersion); + assert version == version2; + } else { + idxIn = null; + } + } + + protected final IndexInput cloneData() { + assert !isClosed.get():printEx(); + // is never NULL + return (IndexInput) datIn.clone(); + } + + protected final IndexInput cloneIndex() { + assert !isClosed.get():printEx(); + return idxIn == null ? 
null : (IndexInput) idxIn.clone(); + } + private final AtomicBoolean isClosed = new AtomicBoolean(false); + Exception ex; + public void close() throws IOException { + assert !isClosed.getAndSet(true); + ex =new Exception(); + if (datIn != null) { + datIn.close(); + } + if (idxIn != null) { + idxIn.close(); + } + } + + private String printEx() { + ex.printStackTrace(); + return ex.getMessage(); + } + } + +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/values/Bytes.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java =================================================================== --- src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java (revision 1006266) +++ src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java (working copy) @@ -61,7 +61,7 @@ int indexInterval = 128; /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, - * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * used to accelerate {@link TermDocs#advance(int)}. Larger values result in * smaller indexes, greater acceleration, but fewer accelerable cases, while * smaller values result in bigger indexes, less acceleration and more * accelerable cases. More detailed experiments would be useful here. */ Index: src/java/org/apache/lucene/index/FieldInfos.java =================================================================== --- src/java/org/apache/lucene/index/FieldInfos.java (revision 1006266) +++ src/java/org/apache/lucene/index/FieldInfos.java (working copy) @@ -19,6 +19,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.values.Values; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -39,8 +40,11 @@ // First used in 2.9; prior to 2.9 there was no format header public static final int FORMAT_START = -2; + // Records index values for this field + public static final int FORMAT_INDEX_VALUES = -3; + // whenever you add a new format, make it 1 smaller (negative version logic)! 
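+// For example: a segment written with FORMAT_INDEX_VALUES (-3) satisfies
+// format <= FORMAT_INDEX_VALUES in read(), so one extra byte per FieldInfo
+// encodes the field's Values type (0 = none, 1 = PACKED_INTS, ...); segments
+// written with FORMAT_START (-2) carry no such byte and skip that branch.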
- static final int FORMAT_CURRENT = FORMAT_START; + static final int FORMAT_CURRENT = FORMAT_INDEX_VALUES; static final int FORMAT_MINIMUM = FORMAT_START; @@ -301,9 +305,51 @@ if (fi.omitNorms) bits |= OMIT_NORMS; if (fi.storePayloads) bits |= STORE_PAYLOADS; if (fi.omitTermFreqAndPositions) bits |= OMIT_TERM_FREQ_AND_POSITIONS; - + output.writeString(fi.name); output.writeByte(bits); + + final byte b; + + if (fi.indexValues == null) { + b = 0; + } else { + switch(fi.indexValues) { + case PACKED_INTS: + b = 1; + break; + case SIMPLE_FLOAT_4BYTE: + b = 2; + break; + case SIMPLE_FLOAT_8BYTE: + b = 3; + break; + case BYTES_FIXED_STRAIGHT: + b = 4; + break; + case BYTES_FIXED_DEREF: + b = 5; + break; + case BYTES_FIXED_SORTED: + b = 6; + break; + case BYTES_VAR_STRAIGHT: + b = 7; + break; + case BYTES_VAR_DEREF: + b = 8; + break; + case BYTES_VAR_SORTED: + b = 9; + break; + case PACKED_INTS_FIXED: + b = 10; + break; + default: + throw new IllegalStateException("unhandled indexValues type " + fi.indexValues); + } + } + output.writeByte(b); } } @@ -330,7 +376,49 @@ boolean storePayloads = (bits & STORE_PAYLOADS) != 0; boolean omitTermFreqAndPositions = (bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0; - addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + FieldInfo fi = addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + + if (format <= FORMAT_INDEX_VALUES) { + final byte b = input.readByte(); + + switch(b) { + case 0: + fi.indexValues = null; + break; + case 1: + fi.indexValues = Values.PACKED_INTS; + break; + case 2: + fi.indexValues = Values.SIMPLE_FLOAT_4BYTE; + break; + case 3: + fi.indexValues = Values.SIMPLE_FLOAT_8BYTE; + break; + case 4: + fi.indexValues = Values.BYTES_FIXED_STRAIGHT; + break; + case 5: + fi.indexValues = Values.BYTES_FIXED_DEREF; + break; + case 6: + fi.indexValues = Values.BYTES_FIXED_SORTED; + break; + case 7: + fi.indexValues = Values.BYTES_VAR_STRAIGHT; + break; + case 8: + fi.indexValues = Values.BYTES_VAR_DEREF; + break; + case 9: + fi.indexValues = Values.BYTES_VAR_SORTED; + break; + case 10: + fi.indexValues = Values.PACKED_INTS_FIXED; + break; + default: + throw new IllegalStateException("unhandled indexValues type " + b); + } + } } if (input.getFilePointer() != input.length()) { Index: contrib/benchmark/conf/sort-standard.alg =================================================================== --- contrib/benchmark/conf/sort-standard.alg (revision 1006266) +++ contrib/benchmark/conf/sort-standard.alg (working copy) @@ -26,6 +26,7 @@ directory=FSDirectory #directory=RamDirectory +doc.index.props=true doc.stored=true doc.tokenized=true doc.term.vector=false @@ -66,6 +67,4 @@ } : 4 } - -RepSumByName - +RepSumByName \ No newline at end of file Index: src/java/org/apache/lucene/document/AbstractField.java =================================================================== --- src/java/org/apache/lucene/document/AbstractField.java (revision 1006266) +++ src/java/org/apache/lucene/document/AbstractField.java (working copy) @@ -19,6 +19,7 @@ import org.apache.lucene.search.spans.SpanQuery; // for javadocs import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.StringHelper; // for javadocs @@ -292,4 +293,16 @@ result.append('>'); return 
result.toString(); } + private AttributeSource source; + + public boolean hasFieldAttribute() { + return source != null; + } + + public AttributeSource getFieldAttributes() { + if(source == null) + source = new AttributeSource(); + return source; + } + } Index: src/java/org/apache/lucene/index/values/PackedIntsImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/PackedIntsImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/PackedIntsImpl.java (revision 0) @@ -0,0 +1,240 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.LongsRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; + +/** Stores ints packed with fixed-bit precision. */ +class PackedIntsImpl { + + private static final String CODEC_NAME = "PackedInts"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class IntsWriter extends Writer { + // nocommit - can we bulkcopy this on a merge? 
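+    // Worked example of the encoding finish() writes below (illustrative):
+    // docs {0, 1, 2} with values {7, 9, 9} and docCount 4 give minValue 7 and
+    // defaultValue (++maxValue - minValue) = 3; the packed stream is then
+    // {0, 2, 2, 3} at PackedInts.bitsRequired(3) == 2 bits per value, and the
+    // reader maps 3 back to 0 and adds minValue to every other value.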
+ private LongsRef intsRef; + private long[] docToValue; + private long minValue; + private long maxValue; + private boolean started; + private final Directory dir; + private final String id; + private int maxDocID; + private int minDocID; + + protected IntsWriter(Directory dir, String id) throws IOException { + this.dir = dir; + this.id = id; + docToValue = new long[1]; + } + + @Override + synchronized public void add(int docID, long v) throws IOException { + + if (!started) { + minValue = maxValue = v; + minDocID = maxDocID = docID; + started = true; + + } else { + if (v < minValue) { + minValue = v; + } else if (v > maxValue) { + maxValue = v; + } + if (docID < minDocID) { + minDocID = docID; + } else if (docID > maxDocID) { + maxDocID = docID; + } + } + if (docID >= docToValue.length) { + docToValue = ArrayUtil.grow(docToValue, 1 + docID); + } + docToValue[docID] = v; + } + + @Override + synchronized public void finish(int docCount) throws IOException { + if(!started) + return; + final IndexOutput datOut = dir.createOutput(IndexFileNames + .segmentFileName(id, "", IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT); + + // nocommit -- long can't work right since it's signed + datOut.writeLong(minValue); + // write a default value to recognize docs without a value for that field + final long defaultValue = ++maxValue - minValue; + datOut.writeLong(defaultValue); + PackedInts.Writer w = PackedInts.getWriter(datOut, docCount, PackedInts.bitsRequired(maxValue-minValue)); + + final int limit = maxDocID + 1; + for (int i = 0; i < minDocID; i++) { + w.add(defaultValue); + } + for (int i = minDocID; i < limit; i++) { + w.add(docToValue[i] - minValue); + } + for (int i = limit; i < docCount; i++) { + w.add(defaultValue); + } + w.finish(); + + datOut.close(); + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + docToValue.length + * RamUsageEstimator.NUM_BYTES_LONG; + } + + @Override + protected void add(int docID) throws IOException { + add(docID, intsRef.get()); + } + + @Override + protected void setNextAttribute(ValuesAttribute attr) { + intsRef = attr.ints(); + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. + */ + static class IntsReader extends Reader { + private final IndexInput datIn; + + protected IntsReader(Directory dir, String id) throws IOException { + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.checkHeader(datIn, CODEC_NAME, VERSION_START, VERSION_START); + } + + /** + * Loads the actual values. You may call this more than once, eg if you + * already previously loaded but then discarded the Source. + */ + @Override + public Source load() throws IOException { + return new IntsSource((IndexInput) datIn.clone()); + } + + private static class IntsSource extends Source { + private final long minValue; + private final long defaultValue; + private final PackedInts.Reader values; + + public IntsSource(IndexInput dataIn) throws IOException { + dataIn.seek(CodecUtil.headerLength(CODEC_NAME)); + minValue = dataIn.readLong(); + defaultValue = dataIn.readLong(); + values = PackedInts.getReader(dataIn); + } + + @Override + public long ints(int docID) { + // nocommit -- can we somehow avoid 2X method calls + // on each get? 
must push minValue down, and make + // PackedInts implement Ints.Source + final long val = values.get(docID); + // docs not having a value for that field must return a default value + return val == defaultValue ? 0 : minValue + val; + } + + public long ramBytesUsed() { + // TODO(simonw): move that to PackedInts? + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + values.getBitsPerValue() * values.size(); + } + } + + public void close() throws IOException { + datIn.close(); + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new IntsEnumImpl(source, (IndexInput) datIn.clone()); + } + + } + + private static final class IntsEnumImpl extends ValuesEnum { + private final PackedInts.ReaderIterator ints; + private long minValue; + private final IndexInput dataIn; + private final long defaultValue; + private LongsRef ref; + private final int maxDoc; + private int pos = -1; + + private IntsEnumImpl(AttributeSource source, IndexInput dataIn) + throws IOException { + super(source, Values.PACKED_INTS); + this.ref = attr.ints(); + this.ref.offset = 0; + this.dataIn = dataIn; + dataIn.seek(CodecUtil.headerLength(CODEC_NAME)); + minValue = dataIn.readLong(); + defaultValue = dataIn.readLong(); + this.ints = PackedInts.getReaderIterator(dataIn); + maxDoc = ints.size(); + } + + @Override + public void close() throws IOException { + ints.close(); + dataIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) + return pos = NO_MORE_DOCS; + final long val = ints.advance(target); + ref.ints[0] = val == defaultValue? 0:minValue + val; + ref.offset = 0; // can we skip this? + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/values/PackedIntsImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/search/ReqOptSumScorer.java =================================================================== --- src/java/org/apache/lucene/search/ReqOptSumScorer.java (revision 1006266) +++ src/java/org/apache/lucene/search/ReqOptSumScorer.java (working copy) @@ -21,7 +21,7 @@ /** A Scorer for queries with a required part and an optional part. * Delays skipTo() on the optional part until a score() is needed. *
- * This Scorer implements {@link Scorer#skipTo(int)}. + * This Scorer implements {@link Scorer#advance(int)}. */ class ReqOptSumScorer extends Scorer { /** The scorers passed from the constructor.
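+// A minimal end-to-end sketch of how the pieces in this patch fit together
+// (illustrative only; "writer", "searcher" and "query" are assumed fixtures):
+//
+//   AbstractField f = new Field("page", "1", Field.Store.YES, Field.Index.NO);
+//   ValuesAttribute attr = f.getFieldAttributes().addAttribute(ValuesAttribute.class);
+//   attr.setType(Values.PACKED_INTS);  // routed to IntValuesProcessor at flush
+//   attr.ints().set(42L);              // this document's value for "page"
+//   // ... add f to a Document, writer.addDocument(doc), writer.close() ...
+//
+//   SortField sf = new SortField("page", SortField.INT);
+//   sf.setUseIndexValues(true);        // sort via IntIndexValuesComparator
+//   searcher.search(query, null, 10, new Sort(sf));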