Index: lucene/facet/build.xml =================================================================== --- lucene/facet/build.xml (revision 1444349) +++ lucene/facet/build.xml (working copy) @@ -31,7 +31,7 @@ - + Index: lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java (revision 1444349) +++ lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java (working copy) @@ -18,10 +18,11 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.facet.collections.IntToObjectMap; import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.index.Facets42Codec; +import org.apache.lucene.facet.params.CategoryListParams.OrdinalPolicy; import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; import org.apache.lucene.facet.params.FacetSearchParams; -import org.apache.lucene.facet.params.CategoryListParams.OrdinalPolicy; import org.apache.lucene.facet.search.FacetRequest; import org.apache.lucene.facet.search.FacetResult; import org.apache.lucene.facet.search.FacetResultNode; @@ -33,8 +34,8 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; @@ -180,7 +181,8 @@ /** Returns indexing params for the main index */ protected IndexWriterConfig getIndexWriterConfig(Analyzer analyzer) { - return newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + // nocommit: forces Facets42Codec onto the main index config; the line this replaces set no codec + return newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setCodec(new Facets42Codec()); } /** 
Returns a {@link FacetIndexingParams} per the given partition size. */
Index: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesProducer.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesProducer.java (revision 0)
+++ lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesProducer.java (working copy)
@@ -0,0 +1,107 @@
+package org.apache.lucene.facet.codecs.facetsdv;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.BlockPackedReader;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
+
+/**
+ * Reader for {@link FacetsDocValuesFormat}. Every field entry found in the
+ * segment's facets doc values file is read when the producer is constructed.
+ */
+class FacetsDocValuesProducer extends DocValuesProducer {
+
+  /** Values read by the constructor, keyed by field number. */
+  private final Map<Integer,FacetsBinaryDocValues> fields = new HashMap<Integer,FacetsBinaryDocValues>();
+
+  /**
+   * Opens the segment's facets doc values file, checks its header and reads
+   * one {@link FacetsBinaryDocValues} per field until the <code>-1</code> end
+   * marker.
+   *
+   * @param state names the segment, directory and IO context to read from
+   * @throws IOException if opening, validating or reading the file fails
+   */
+  FacetsDocValuesProducer(SegmentReadState state) throws IOException {
+    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FacetsDocValuesFormat.EXTENSION);
+    IndexInput in = state.directory.openInput(fileName, state.context);
+    boolean success = false;
+    try {
+      // accept every version up to the one the consumer currently writes
+      CodecUtil.checkHeader(in, FacetsDocValuesFormat.CODEC,
+                            FacetsDocValuesFormat.VERSION_START,
+                            FacetsDocValuesFormat.VERSION_CURRENT);
+      int fieldNumber = in.readVInt();
+      while (fieldNumber != -1) {
+        fields.put(fieldNumber, new FacetsBinaryDocValues(in));
+        fieldNumber = in.readVInt();
+      }
+      success = true;
+    } finally {
+      // the input is always closed here; close() below has nothing left to release
+      if (success) {
+        IOUtils.close(in);
+      } else {
+        IOUtils.closeWhileHandlingException(in);
+      }
+    }
+  }
+
+  /** Not supported: this format only stores binary doc values. */
+  @Override
+  public NumericDocValues getNumeric(FieldInfo field) throws IOException {
+    throw new UnsupportedOperationException("FacetsDocValues only implements binary");
+  }
+
+  /**
+   * Returns the values read for <code>field</code>, or <code>null</code> if
+   * the file held no entry with that field number.
+   */
+  @Override
+  public BinaryDocValues getBinary(FieldInfo field) throws IOException {
+    return fields.get(field.number);
+  }
+
+  /** Not supported: this format only stores binary doc values. */
+  @Override
+  public SortedDocValues getSorted(FieldInfo field) throws IOException {
+    throw new UnsupportedOperationException("FacetsDocValues only implements binary");
+  }
+
+  /** Does nothing: the constructor already closed the input file. */
+  @Override
+  public void close() throws IOException {
+  }
+}

Property changes on: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesProducer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsBinaryDocValues.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsBinaryDocValues.java (revision 0)
+++ lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsBinaryDocValues.java (working copy)
@@ -0,0 +1,59 @@
+package org.apache.lucene.facet.codecs.facetsdv;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.packed.PackedInts;
+
+/**
+ * {@link BinaryDocValues} held as one shared <code>byte[]</code> plus a packed
+ * table of per-document start offsets. Both are public so that callers can
+ * read the bytes directly instead of going through {@link #get(int, BytesRef)}.
+ */
+public class FacetsBinaryDocValues extends BinaryDocValues {
+  /** The values of all documents, stored back to back. */
+  public final byte[] bytes;
+  /** Start offset into {@link #bytes} per docID; entry <code>docID+1</code> marks where that document's value ends. */
+  public final PackedInts.Reader addresses;
+
+  /**
+   * Reads one field entry: a vInt byte count, that many bytes, then the
+   * packed addresses.
+   *
+   * @param in input positioned at the start of the entry
+   * @throws IOException if reading fails
+   */
+  FacetsBinaryDocValues(DataInput in) throws IOException {
+    int totBytes = in.readVInt();
+    bytes = new byte[totBytes];
+    in.readBytes(bytes, 0, totBytes);
+    addresses = PackedInts.getReader(in);
+  }
+
+  /** Points <code>ret</code> at the slice of {@link #bytes} belonging to <code>docID</code>; no bytes are copied. */
+  @Override
+  public void get(int docID, BytesRef ret) {
+    int start = (int) addresses.get(docID);
+    ret.bytes = bytes;
+    ret.offset = start;
+    ret.length = (int) (addresses.get(docID+1)-start);
+  }
+}

Property changes on: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsBinaryDocValues.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesFormat.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesFormat.java (revision 0)
+++ lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesFormat.java (working copy)
@@ -0,0 +1,65 @@
+package org.apache.lucene.facet.codecs.facetsdv;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+/**
+ * DocValues format that only handles binary doc values and
+ * is optimized for usage with facets. It uses more RAM than other
+ * formats in exchange for faster lookups.
+ *
+ * @lucene.experimental
+ */
+
+// nocommit rename to Facets42DVFormat
+
+public final class FacetsDocValuesFormat extends DocValuesFormat {
+
+  /** Creates the format with the name <code>Facets</code>. */
+  public FacetsDocValuesFormat() {
+    super("Facets");
+  }
+
+  /** Returns a new {@link FacetsDocValuesConsumer} for the segment described by <code>state</code>. */
+  @Override
+  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+    return new FacetsDocValuesConsumer(state);
+  }
+
+  /** Returns a new {@link FacetsDocValuesProducer} for the segment described by <code>state</code>. */
+  @Override
+  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
+    return new FacetsDocValuesProducer(state);
+  }
+
+  /** Codec name written to, and checked in, the file header. */
+  public static final String CODEC = "FacetsDocValues";
+  /** Extension of the per-segment file holding the values. */
+  public static final String EXTENSION = "fdv";
+  /** First version of the file format. */
+  public static final int VERSION_START = 0;
+  /** Version written by {@link FacetsDocValuesConsumer}. */
+  public static final int VERSION_CURRENT = VERSION_START;
+}

Property changes on: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesFormat.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesConsumer.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesConsumer.java (revision 0)
+++ lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesConsumer.java (working copy)
@@ -0,0 +1,150 @@
+package org.apache.lucene.facet.codecs.facetsdv;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.BlockPackedWriter;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
+/**
+ * Writer for {@link FacetsDocValuesFormat}. Each binary field is appended to a
+ * single per-segment file as: field number, total byte count, the values'
+ * bytes back to back, then the packed start address of every value.
+ * {@link #close()} terminates the file with a <code>-1</code> marker.
+ */
+public class FacetsDocValuesConsumer extends DocValuesConsumer {
+
+  final IndexOutput out;
+  final int maxDoc;
+
+  /**
+   * Creates the segment's facets doc values file and writes its header.
+   *
+   * @param state names the segment, directory and IO context to write to
+   * @throws IOException if creating the file or writing the header fails
+   */
+  public FacetsDocValuesConsumer(SegmentWriteState state) throws IOException {
+    IndexOutput output = null;
+    boolean success = false;
+    try {
+      String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FacetsDocValuesFormat.EXTENSION);
+      output = state.directory.createOutput(fileName, state.context);
+      CodecUtil.writeHeader(output, FacetsDocValuesFormat.CODEC, FacetsDocValuesFormat.VERSION_CURRENT);
+      out = output;
+      maxDoc = state.segmentInfo.getDocCount();
+      success = true;
+    } finally {
+      if (!success) {
+        // Close the raw output (a null one is skipped) instead of this consumer:
+        // close() would first append the end marker to a file being abandoned.
+        IOUtils.closeWhileHandlingException(output);
+      }
+    }
+  }
+
+  /** Not supported: this format only stores binary doc values. */
+  @Override
+  public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
+    throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
+  }
+
+  /**
+   * Appends one binary field to the file. <code>values</code> is iterated
+   * three times: to total the lengths, to copy the bytes and to record where
+   * each value starts.
+   *
+   * @param field field being written; its number tags the entry
+   * @param values the field's values; presumably one per document, since
+   *        <code>maxDoc + 1</code> addresses are reserved (confirm with callers)
+   * @throws IllegalStateException if the values total more than
+   *         {@link Integer#MAX_VALUE} bytes
+   * @throws IOException if writing fails
+   */
+  @Override
+  public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
+    long totBytes = 0;
+    for(BytesRef v : values) {
+      totBytes += v.length;
+    }
+
+    // checked before anything is written so a rejected field leaves no partial entry
+    if (totBytes > Integer.MAX_VALUE) {
+      // nocommit fixme: the reader keeps all values in one byte[], so larger fields cannot be stored yet
+      throw new IllegalStateException("field \"" + field.name + "\": values total " + totBytes + " bytes, more than the supported maximum of " + Integer.MAX_VALUE);
+    }
+
+    // write the byte[] data
+    out.writeVInt(field.number);
+    out.writeVInt((int) totBytes);
+
+    // nocommit treat int[] ords as long int[] concatenated
+    // from all docs?
+    for(BytesRef v : values) {
+      out.writeBytes(v.bytes, v.offset, v.length);
+    }
+
+    // nocommit make FASTEST controllable
+    PackedInts.Writer w = PackedInts.getWriter(out, maxDoc+1, PackedInts.bitsRequired(totBytes+1), PackedInts.FASTEST);
+
+    // one start address per value, plus a final entry equal to totBytes so the
+    // reader can compute the last value's length
+    int address = 0;
+    for(BytesRef v : values) {
+      w.add(address);
+      address += v.length;
+    }
+    w.add(address);
+    w.finish();
+  }
+
+  /** Not supported: this format only stores binary doc values. */
+  @Override
+  public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
+    throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
+  }
+
+  /**
+   * Writes the <code>-1</code> end-of-fields marker and closes the file. If
+   * writing the marker fails the file is still closed before the exception
+   * propagates.
+   */
+  @Override
+  public void close() throws IOException {
+    boolean success = false;
+    try {
+      out.writeVInt(-1); // write EOF marker
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(out);
+      } else {
+        IOUtils.closeWhileHandlingException(out);
+      }
+    }
+  }
+}

Property changes on: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/FacetsDocValuesConsumer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/package.html
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/package.html (revision 0)
+++ lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/package.html (working copy)
@@ -0,0 +1,25 @@ + + + + + + + +DocValuesFormat that's optimized for facets. 
+ + Property changes on: lucene/facet/src/java/org/apache/lucene/facet/codecs/facetsdv/package.html ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/search/FastCountingFacetsAggregator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/FastCountingFacetsAggregator.java (revision 1444349) +++ lucene/facet/src/java/org/apache/lucene/facet/search/FastCountingFacetsAggregator.java (working copy) @@ -2,6 +2,7 @@ import java.io.IOException; +import org.apache.lucene.facet.codecs.facetsdv.FacetsBinaryDocValues; import org.apache.lucene.facet.encoding.DGapVInt8IntDecoder; import org.apache.lucene.facet.encoding.DGapVInt8IntEncoder; import org.apache.lucene.facet.params.CategoryListParams; @@ -10,6 +11,7 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.packed.PackedInts; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -68,20 +70,21 @@ if (dv == null) { // this reader does not have DocValues for the requested category list return; } - + final int length = matchingDocs.bits.length(); final int[] counts = facetArrays.getIntArray(); - int doc = 0; - while (doc < length && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { - dv.get(doc, buf); - if (buf.length > 0) { - // this document has facets - final int upto = buf.offset + buf.length; + + if (dv instanceof FacetsBinaryDocValues) { + final byte[] bytes = ((FacetsBinaryDocValues) dv).bytes; + final PackedInts.Reader addresses = ((FacetsBinaryDocValues) dv).addresses; + int doc = 0; + while (doc < length && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + int offset = (int) addresses.get(doc); + final int end = (int) addresses.get(1+doc); int ord = 
0; - int offset = buf.offset; int prev = 0; - while (offset < upto) { - byte b = buf.bytes[offset++]; + while (offset < end) { + byte b = bytes[offset++]; if (b >= 0) { prev = ord = ((ord << 7) | b) + prev; ++counts[ord]; @@ -90,8 +93,32 @@ ord = (ord << 7) | (b & 0x7F); } } + ++doc; } - ++doc; + } else { + + int doc = 0; + while (doc < length && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + dv.get(doc, buf); + if (buf.length > 0) { + // this document has facets + final int upto = buf.offset + buf.length; + int ord = 0; + int offset = buf.offset; + int prev = 0; + while (offset < upto) { + byte b = buf.bytes[offset++]; + if (b >= 0) { + prev = ord = ((ord << 7) | b) + prev; + ++counts[ord]; + ord = 0; + } else { + ord = (ord << 7) | (b & 0x7F); + } + } + } + ++doc; + } } }
Index: lucene/facet/src/java/org/apache/lucene/facet/index/Facets42Codec.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/index/Facets42Codec.java (revision 0)
+++ lucene/facet/src/java/org/apache/lucene/facet/index/Facets42Codec.java (working copy)
@@ -0,0 +1,50 @@
+package org.apache.lucene.facet.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.lucene42.Lucene42Codec;
+
+/**
+ * Same as {@link Lucene42Codec} except it uses {@link
+ * org.apache.lucene.facet.codecs.facetsdv.FacetsDocValuesFormat} for facet
+ * fields (faster-but-more-RAM-consuming doc values).
+ */
+public class Facets42Codec extends Lucene42Codec {
+  // NOTE(review): forName presumably resolves names through the DocValuesFormat
+  // SPI registry; confirm that a "Facets" entry is registered for this module.
+  private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName("Facets");
+  private final DocValuesFormat lucene42DVFormat = DocValuesFormat.forName("Lucene42");
+
+  /**
+   * Picks the doc values format for <code>field</code>.
+   *
+   * @param field name of the field being looked up
+   * @return the facets format for <code>$facets</code> and <code>$all</code>,
+   *         the Lucene42 format for every other field
+   */
+  @Override
+  public DocValuesFormat getDocValuesFormatForField(String field) {
+    // nocommit what about multiple/custom CLPs?
+    if (field.equals("$facets") || field.equals("$all")) {
+      return facetsDVFormat;
+    } else {
+      return lucene42DVFormat;
+    }
+  }
+}

Property changes on: lucene/facet/src/java/org/apache/lucene/facet/index/Facets42Codec.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property