Index: contrib/queries/src/java/org/apache/lucene/search/trie/AbstractTrieRangeFilter.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/trie/AbstractTrieRangeFilter.java (revision 761725)
+++ contrib/queries/src/java/org/apache/lucene/search/trie/AbstractTrieRangeFilter.java (working copy)
@@ -18,7 +18,6 @@
*/
import java.io.IOException;
-import java.util.Arrays;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Query;
@@ -33,10 +32,10 @@
abstract class AbstractTrieRangeFilter extends Filter {
- AbstractTrieRangeFilter(final String[] fields, final int precisionStep,
+ AbstractTrieRangeFilter(final String field, final int precisionStep,
Number min, Number max, final boolean minInclusive, final boolean maxInclusive
) {
- this.fields=(String[])fields.clone();
+ this.field=field.intern();
this.precisionStep=precisionStep;
this.min=min;
this.max=max;
@@ -51,7 +50,7 @@
public String toString(final String field) {
final StringBuffer sb=new StringBuffer();
- if (!this.fields[0].equals(field)) sb.append(this.fields[0]).append(':');
+ if (!this.field.equals(field)) sb.append(this.field).append(':');
return sb.append(minInclusive ? '[' : '{')
.append((min==null) ? "*" : min.toString())
.append(" TO ")
@@ -66,7 +65,7 @@
if (this.getClass().equals(o.getClass())) {
AbstractTrieRangeFilter q=(AbstractTrieRangeFilter)o;
return (
- Arrays.equals(fields,q.fields) &&
+ field==q.field &&
(q.min == null ? min == null : q.min.equals(min)) &&
(q.max == null ? max == null : q.max.equals(max)) &&
minInclusive==q.minInclusive &&
@@ -79,7 +78,7 @@
//@Override
public final int hashCode() {
- int hash=Arrays.asList(fields).hashCode()+(precisionStep^0x64365465);
+ int hash = field.hashCode() + (precisionStep^0x64365465);
if (min!=null) hash += min.hashCode()^0x14fa55fb;
if (max!=null) hash += max.hashCode()^0x733fa5fe;
return hash+
@@ -123,12 +122,10 @@
void fillBits(
final IndexReader reader,
final OpenBitSet bits, final TermDocs termDocs,
- String field,
final String lowerTerm, final String upperTerm
) throws IOException {
final int len=lowerTerm.length();
assert upperTerm.length()==len;
- field=field.intern();
// find the docs
final TermEnum enumerator = reader.terms(new Term(field, lowerTerm));
@@ -151,7 +148,7 @@
}
// members
- final String[] fields;
+ final String field;
final int precisionStep;
final Number min,max;
final boolean minInclusive,maxInclusive;
Index: contrib/queries/src/java/org/apache/lucene/search/trie/IntTrieRangeFilter.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/trie/IntTrieRangeFilter.java (revision 761725)
+++ contrib/queries/src/java/org/apache/lucene/search/trie/IntTrieRangeFilter.java (working copy)
@@ -30,7 +30,7 @@
/**
* Implementation of a Lucene {@link Filter} that implements trie-based range filtering for ints/floats.
* This filter depends on a specific structure of terms in the index that can only be created
- * by {@link TrieUtils} methods.
+ * by indexing via {@link IntTrieTokenStream} methods.
* For more information, how the algorithm works, see the {@linkplain org.apache.lucene.search.trie package description}.
*/
public class IntTrieRangeFilter extends AbstractTrieRangeFilter {
@@ -43,52 +43,14 @@
* You can leave the bounds open, by supplying null for min and/or
* max. Inclusive/exclusive bounds can also be supplied.
* To query float values use the converter {@link TrieUtils#floatToSortableInt}.
- *
This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String[])}. - *
This is the recommended usage of TrieUtils/IntTrieRangeFilter.
*/
public IntTrieRangeFilter(final String field, final int precisionStep,
final Integer min, final Integer max, final boolean minInclusive, final boolean maxInclusive
) {
- this(
- new String[]{field, field+TrieUtils.LOWER_PRECISION_FIELD_NAME_SUFFIX},
- precisionStep,min,max,minInclusive,maxInclusive
- );
+ super(field,precisionStep,min,max,minInclusive,maxInclusive);
}
-
- /**
- * Expert: A trie filter for matching trie coded values using the given field names.
- * You can specify the main and helper field name, that was used to idex the values.
- * precisionStep must me equal or a multiple of the precisionStep
- * used for indexing the values.
- * You can leave the bounds open, by supplying null for min and/or
- * max. Inclusive/exclusive bounds can also be supplied.
- * To query float values use the converter {@link TrieUtils#floatToSortableInt}.
- *
This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String,String[])}.
- */
- public IntTrieRangeFilter(final String field, final String lowerPrecisionField, final int precisionStep,
- final Integer min, final Integer max, final boolean minInclusive, final boolean maxInclusive
- ) {
- this(new String[]{field, lowerPrecisionField},precisionStep,min,max,minInclusive,maxInclusive);
- }
/**
- * Expert: A trie filter for matching trie coded values
- * using the given field names. If the array of field names is shorter than the
- * trieCoded one, all trieCoded values with higher index get the last field name.
- * precisionStep must me equal or a multiple of the precisionStep
- * used for indexing the values.
- * You can leave the bounds open, by supplying null for min and/or
- * max. Inclusive/exclusive bounds can also be supplied.
- * To query float values use the converter {@link TrieUtils#floatToSortableInt}.
- *
This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String[],String[])}.
- */
- public IntTrieRangeFilter(final String[] fields, final int precisionStep,
- Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive
- ) {
- super(fields, precisionStep, min, max, minInclusive, maxInclusive);
- }
-
- /**
* Returns a DocIdSet that provides the documents which should be permitted or prohibited in search results.
*/
//@Override
@@ -112,11 +74,10 @@
TrieUtils.splitIntRange(new TrieUtils.IntRangeBuilder() {
//@Override
- public final void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
+ public final void addRange(String minPrefixCoded, String maxPrefixCoded) {
try {
fillBits(
reader, bits, termDocs,
- fields[Math.min(fields.length-1, level)],
minPrefixCoded, maxPrefixCoded
);
} catch (IOException ioe) {
Index: contrib/queries/src/java/org/apache/lucene/search/trie/IntTrieTokenStream.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/trie/IntTrieTokenStream.java (revision 0)
+++ contrib/queries/src/java/org/apache/lucene/search/trie/IntTrieTokenStream.java (revision 0)
@@ -0,0 +1,168 @@
+package org.apache.lucene.search.trie;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * This class provides a {@link TokenStream} for indexing int values
+ * that can be queried by {@link IntTrieRangeFilter}. This stream is not intended
+ * to be used in analyzers; it is more for iterating the different precisions during
+ * indexing a specific numeric value.
+ *
+ * <p>An int value is indexed as multiple string encoded terms, each reduced
+ * by zeroing bits from the right. Each value is also prefixed (in the first char) by the
+ * shift value (number of bits removed) used during encoding.
+ *
The number of bits removed from the right for each trie entry is called
+ * precisionStep in this API. For comparing the different step values, see the
+ * {@linkplain org.apache.lucene.search.trie package description}.
+ *
+ * <p>The usage pattern is (it is recommended to switch off norms and term frequencies
+ * for numeric fields; it does not make sense to have them):
+ * <pre>
+ * Field field = new Field(name, new IntTrieTokenStream(value, precisionStep)); + * field.setOmitNorms(true); + * field.setOmitTermFreqAndPositions(true); + * document.add(field); + *+ *
For optimal performance, re-use the TokenStream and Field instance + * for more than one document: + *
+ * // init + * TokenStream stream = new IntTrieTokenStream(precisionStep); + * Field field = new Field(name, stream); + * field.setOmitNorms(true); + * field.setOmitTermFreqAndPositions(true); + * // use this code to index many documents: + * stream.setValue(value1) + * document.add(field); + * writer.addDocument(document); + * stream.setValue(value2) + * document.add(field); + * writer.addDocument(document); + * ... + *+ *
Please note: Token streams are read, when the document is added to index. + * If you index more than one numeric field, use a separate instance for each. + *
For more information, how trie fields work, see the
+ * {@linkplain org.apache.lucene.search.trie package description}.
+ */
+public class IntTrieTokenStream extends TokenStream {
+
+ /** The full precision field gets this token type assigned. */
+ public static final String TOKEN_TYPE_FULL_PREC = "fullPrecTrieInt";
+
+ /** The lower precision fields gets this token type assigned. */
+ public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecTrieInt";
+
+ /**
+ * Creates a token stream for indexing value with the given
+ * precisionStep. As instance creation is a major cost,
+ * consider using a {@link #IntTrieTokenStream(int)} instance once for
+ * indexing a large number of documents and assign a value with
+ * {@link #setValue} for each document.
+ * To index float values use the converter {@link TrieUtils#floatToSortableInt}.
+ */
+ public IntTrieTokenStream(final int value, final int precisionStep) {
+ if (precisionStep<1 || precisionStep>32)
+ throw new IllegalArgumentException("precisionStep may only be 1..32");
+ this.value = value;
+ this.precisionStep = precisionStep;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ }
+
+ /**
+ * Creates a token stream for indexing values with the given
+ * precisionStep. This stream is initially "empty"
+ * (using a numeric value of 0), assign a value before indexing
+ * each document using {@link #setValue}.
+ */
+ public IntTrieTokenStream(final int precisionStep) {
+ this(0, precisionStep);
+ }
+
+ /**
+ * Resets the token stream to deliver prefix encoded values
+ * for value. Use this method to index the same
+ * numeric field for a large number of documents and reuse the
+ * current stream instance.
+ * To index float values use the converter {@link TrieUtils#floatToSortableInt}.
+ */
+ public void setValue(final int value) {
+ this.value = value;
+ reset();
+ }
+
+ // @Override
+ public void reset() {
+ shift = 0;
+ }
+
+ // @Override
+ public boolean incrementToken() {
+ if (shift>=32) return false;
+ final char[] buffer = termAtt.resizeTermBuffer(TrieUtils.INT_BUF_SIZE);
+ termAtt.setTermLength(TrieUtils.intToPrefixCoded(value, shift, buffer));
+ if (shift==0) {
+ typeAtt.setType(TOKEN_TYPE_FULL_PREC);
+ posIncrAtt.setPositionIncrement(1);
+ } else {
+ typeAtt.setType(TOKEN_TYPE_LOWER_PREC);
+ posIncrAtt.setPositionIncrement(0);
+ }
+ shift += precisionStep;
+ return true;
+ }
+
+ // @Override
+ /** @deprecated */
+ public Token next(final Token reusableToken) {
+ if (shift>=32) return null;
+ final char[] buffer = reusableToken.resizeTermBuffer(TrieUtils.INT_BUF_SIZE);
+ reusableToken.setTermLength(TrieUtils.intToPrefixCoded(value, shift, buffer));
+ if (shift==0) {
+ reusableToken.setType(TOKEN_TYPE_FULL_PREC);
+ reusableToken.setPositionIncrement(1);
+ } else {
+ reusableToken.setType(TOKEN_TYPE_LOWER_PREC);
+ reusableToken.setPositionIncrement(0);
+ }
+ shift += precisionStep;
+ return reusableToken;
+ }
+
+ // @Override
+ public String toString() {
+ final StringBuffer sb = new StringBuffer("(trie-int,value=").append(value);
+ sb.append(",precisionStep=").append(precisionStep).append(')');
+ return sb.toString();
+ }
+
+ // members
+ private final TermAttribute termAtt;
+ private final TypeAttribute typeAtt;
+ private final PositionIncrementAttribute posIncrAtt;
+
+ private int shift = 0;
+ private int value;
+ private final int precisionStep;
+}
Index: contrib/queries/src/java/org/apache/lucene/search/trie/LongTrieRangeFilter.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/trie/LongTrieRangeFilter.java (revision 761725)
+++ contrib/queries/src/java/org/apache/lucene/search/trie/LongTrieRangeFilter.java (working copy)
@@ -30,7 +30,7 @@
/**
* Implementation of a Lucene {@link Filter} that implements trie-based range filtering for longs/doubles.
* This filter depends on a specific structure of terms in the index that can only be created
- * by {@link TrieUtils} methods.
+ * by indexing via {@link LongTrieTokenStream} methods.
* For more information, how the algorithm works, see the {@linkplain org.apache.lucene.search.trie package description}.
*/
public class LongTrieRangeFilter extends AbstractTrieRangeFilter {
@@ -43,52 +43,14 @@
* You can leave the bounds open, by supplying null for min and/or
* max. Inclusive/exclusive bounds can also be supplied.
* To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
- *
This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String[])}. - *
This is the recommended usage of TrieUtils/LongTrieRangeFilter.
*/
public LongTrieRangeFilter(final String field, final int precisionStep,
final Long min, final Long max, final boolean minInclusive, final boolean maxInclusive
) {
- this(
- new String[]{field, field+TrieUtils.LOWER_PRECISION_FIELD_NAME_SUFFIX},
- precisionStep,min,max,minInclusive,maxInclusive
- );
+ super(field,precisionStep,min,max,minInclusive,maxInclusive);
}
-
- /**
- * Expert: A trie filter for matching trie coded values using the given field names.
- * You can specify the main and helper field name, that was used to idex the values.
- * precisionStep must me equal or a multiple of the precisionStep
- * used for indexing the values.
- * You can leave the bounds open, by supplying null for min and/or
- * max. Inclusive/exclusive bounds can also be supplied.
- * To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
- *
This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String,String,String[])}.
- */
- public LongTrieRangeFilter(final String field, final String lowerPrecisionField, final int precisionStep,
- final Long min, final Long max, final boolean minInclusive, final boolean maxInclusive
- ) {
- this(new String[]{field, lowerPrecisionField},precisionStep,min,max,minInclusive,maxInclusive);
- }
/**
- * Expert: A trie filter for matching trie coded values
- * using the given field names. If the array of field names is shorter than the
- * trieCoded one, all trieCoded values with higher index get the last field name.
- * precisionStep must me equal or a multiple of the precisionStep
- * used for indexing the values.
- * You can leave the bounds open, by supplying null for min and/or
- * max. Inclusive/exclusive bounds can also be supplied.
- * To query double values use the converter {@link TrieUtils#doubleToSortableLong}.
- *
This is the counterpart to {@link TrieUtils#addIndexedFields(Document,String[],String[])}.
- */
- public LongTrieRangeFilter(final String[] fields, final int precisionStep,
- Long min, Long max, final boolean minInclusive, final boolean maxInclusive
- ) {
- super(fields, precisionStep, min, max, minInclusive, maxInclusive);
- }
-
- /**
* Returns a DocIdSet that provides the documents which should be permitted or prohibited in search results.
*/
//@Override
@@ -112,11 +74,10 @@
TrieUtils.splitLongRange(new TrieUtils.LongRangeBuilder() {
//@Override
- public final void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
+ public final void addRange(String minPrefixCoded, String maxPrefixCoded) {
try {
fillBits(
reader, bits, termDocs,
- fields[Math.min(fields.length-1, level)],
minPrefixCoded, maxPrefixCoded
);
} catch (IOException ioe) {
Index: contrib/queries/src/java/org/apache/lucene/search/trie/LongTrieTokenStream.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/trie/LongTrieTokenStream.java (revision 0)
+++ contrib/queries/src/java/org/apache/lucene/search/trie/LongTrieTokenStream.java (revision 0)
@@ -0,0 +1,168 @@
+package org.apache.lucene.search.trie;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * This class provides a {@link TokenStream} for indexing long values
+ * that can be queried by {@link LongTrieRangeFilter}. This stream is not intended
+ * to be used in analyzers; it is more for iterating the different precisions during
+ * indexing a specific numeric value.
+ *
A long value is indexed as multiple string encoded terms, each reduced
+ * by zeroing bits from the right. Each value is also prefixed (in the first char) by the
+ * shift value (number of bits removed) used during encoding.
+ *
The number of bits removed from the right for each trie entry is called
+ * precisionStep in this API. For comparing the different step values, see the
+ * {@linkplain org.apache.lucene.search.trie package description}.
+ *
+ * <p>The usage pattern is (it is recommended to switch off norms and term frequencies
+ * for numeric fields; it does not make sense to have them):
+ * <pre>
+ * Field field = new Field(name, new LongTrieTokenStream(value, precisionStep)); + * field.setOmitNorms(true); + * field.setOmitTermFreqAndPositions(true); + * document.add(field); + *+ *
For optimal performance, re-use the TokenStream and Field instance + * for more than one document: + *
+ * // init + * TokenStream stream = new LongTrieTokenStream(precisionStep); + * Field field = new Field(name, stream); + * field.setOmitNorms(true); + * field.setOmitTermFreqAndPositions(true); + * // use this code to index many documents: + * stream.setValue(value1) + * document.add(field); + * writer.addDocument(document); + * stream.setValue(value2) + * document.add(field); + * writer.addDocument(document); + * ... + *+ *
Please note: Token streams are read, when the document is added to index. + * If you index more than one numeric field, use a separate instance for each. + *
For more information, how trie fields work, see the
+ * {@linkplain org.apache.lucene.search.trie package description}.
+ */
+public class LongTrieTokenStream extends TokenStream {
+
+ /** The full precision field gets this token type assigned. */
+ public static final String TOKEN_TYPE_FULL_PREC = "fullPrecTrieLong";
+
+ /** The lower precision fields gets this token type assigned. */
+ public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecTrieLong";
+
+ /**
+ * Creates a token stream for indexing value with the given
+ * precisionStep. As instance creation is a major cost,
+ * consider using a {@link #LongTrieTokenStream(int)} instance once for
+ * indexing a large number of documents and assign a value with
+ * {@link #setValue} for each document.
+ * To index double values use the converter {@link TrieUtils#doubleToSortableLong}.
+ */
+ public LongTrieTokenStream(final long value, final int precisionStep) {
+ if (precisionStep<1 || precisionStep>64)
+ throw new IllegalArgumentException("precisionStep may only be 1..64");
+ this.value = value;
+ this.precisionStep = precisionStep;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ }
+
+ /**
+ * Creates a token stream for indexing values with the given
+ * precisionStep. This stream is initially "empty"
+ * (using a numeric value of 0), assign a value before indexing
+ * each document using {@link #setValue}.
+ */
+ public LongTrieTokenStream(final int precisionStep) {
+ this(0L, precisionStep);
+ }
+
+ /**
+ * Resets the token stream to deliver prefix encoded values
+ * for value. Use this method to index the same
+ * numeric field for a large number of documents and reuse the
+ * current stream instance.
+ * To index double values use the converter {@link TrieUtils#doubleToSortableLong}.
+ */
+ public void setValue(final long value) {
+ this.value = value;
+ reset();
+ }
+
+ // @Override
+ public void reset() {
+ shift = 0;
+ }
+
+ // @Override
+ public boolean incrementToken() {
+ if (shift>=64) return false;
+ final char[] buffer = termAtt.resizeTermBuffer(TrieUtils.LONG_BUF_SIZE);
+ termAtt.setTermLength(TrieUtils.longToPrefixCoded(value, shift, buffer));
+ if (shift==0) {
+ typeAtt.setType(TOKEN_TYPE_FULL_PREC);
+ posIncrAtt.setPositionIncrement(1);
+ } else {
+ typeAtt.setType(TOKEN_TYPE_LOWER_PREC);
+ posIncrAtt.setPositionIncrement(0);
+ }
+ shift += precisionStep;
+ return true;
+ }
+
+ // @Override
+ /** @deprecated */
+ public Token next(final Token reusableToken) {
+ if (shift>=64) return null;
+ final char[] buffer = reusableToken.resizeTermBuffer(TrieUtils.LONG_BUF_SIZE);
+ reusableToken.setTermLength(TrieUtils.longToPrefixCoded(value, shift, buffer));
+ if (shift==0) {
+ reusableToken.setType(TOKEN_TYPE_FULL_PREC);
+ reusableToken.setPositionIncrement(1);
+ } else {
+ reusableToken.setType(TOKEN_TYPE_LOWER_PREC);
+ reusableToken.setPositionIncrement(0);
+ }
+ shift += precisionStep;
+ return reusableToken;
+ }
+
+ // @Override
+ public String toString() {
+ final StringBuffer sb = new StringBuffer("(trie-long,value=").append(value);
+ sb.append(",precisionStep=").append(precisionStep).append(')');
+ return sb.toString();
+ }
+
+ // members
+ private final TermAttribute termAtt;
+ private final TypeAttribute typeAtt;
+ private final PositionIncrementAttribute posIncrAtt;
+
+ private int shift = 0;
+ private long value;
+ private final int precisionStep;
+}
Index: contrib/queries/src/java/org/apache/lucene/search/trie/package.html
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/trie/package.html (revision 761725)
+++ contrib/queries/src/java/org/apache/lucene/search/trie/package.html (working copy)
@@ -50,10 +50,14 @@
are no longer dependent on the index size and the number of distinct values because there is
an upper limit unrelated to either of these properties.
To use the new query types the numerical values, which may be long, double, int,
float, or Date, the values must be indexed in a special prefix encoded format
-(using {@link org.apache.lucene.search.trie.TrieUtils}). This can be done like this:
// chose a step value, 8 is a general good value for large indexes:
@@ -67,15 +71,25 @@
// add some numerical fields:
long lvalue = 121345L;
- TrieUtils.addIndexedFields(doc, "exampleLong", TrieUtils.trieCodeLong(lvalue, precisionStep));
+ Field f = new Field("exampleLong", new LongTrieTokenStream(lvalue, precisionStep));
+ f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
+ doc.add(f);
double dvalue = 1.057E17;
- TrieUtils.addIndexedFields(doc, "exampleDouble", TrieUtils.trieCodeLong(TrieUtils.doubleToSortableLong(dvalue), precisionStep));
+ f = new Field("exampleDouble", new LongTrieTokenStream(TrieUtils.doubleToSortableLong(dvalue), precisionStep));
+ f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
+ doc.add(f);
int ivalue = 121345;
- TrieUtils.addIndexedFields(doc, "exampleInt", TrieUtils.trieCodeInt(ivalue, precisionStep));
+ f = new Field("exampleInt", new IntTrieTokenStream(ivalue, precisionStep));
+ f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
+ doc.add(f);
float fvalue = 1.057E17f;
- TrieUtils.addIndexedFields(doc, "exampleFloat", TrieUtils.trieCodeInt(TrieUtils.floatToSortableInt(fvalue), precisionStep));
+ f = new Field("exampleFloat", new IntTrieTokenStream(TrieUtils.floatToSortableInt(fvalue), precisionStep));
+ f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
+ doc.add(f);
Date datevalue = new Date(); // actual time
- TrieUtils.addIndexedFields(doc, "exampleDate", TrieUtils.trieCodeLong(datevalue.getTime(), precisionStep));
+ f = new Field("exampleDate", new LongTrieTokenStream(datevalue.getTime(), precisionStep));
+ f.setOmitNorms(true); f.setOmitTermFreqAndPositions(true);
+ doc.add(f);
// if you want to also store one of the values:
doc.add(new Field("exampleLong", Long.toString(lvalue), Field.Store.YES, Field.Index.NO));
@@ -86,6 +100,11 @@
// now add document to IndexWriter, as usual
+(for higher indexing performance, you can reuse the TokenStreams – +more info about this in the stream documentation)
+ +The numeric index fields you prepared in this way can be searched by {@link org.apache.lucene.search.trie.LongTrieRangeFilter} or {@link org.apache.lucene.search.trie.IntTrieRangeFilter}:
Index: contrib/queries/src/java/org/apache/lucene/search/trie/TrieUtils.java =================================================================== --- contrib/queries/src/java/org/apache/lucene/search/trie/TrieUtils.java (revision 761725) +++ contrib/queries/src/java/org/apache/lucene/search/trie/TrieUtils.java (working copy) @@ -17,17 +17,13 @@ * limitations under the License. */ -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.lucene.search.SortField; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.ExtendedFieldCache; /** - * This is a helper class to construct the trie-based index entries for numerical values. - * For more information on how the algorithm works, see the - * {@linkplain org.apache.lucene.search.trie package description}. - *To quickly execute range queries in Apache Lucene, a range is divided recursively * into multiple intervals for searching: The center of the range is searched only with * the lowest possible precision in the trie, while the boundaries are matched @@ -35,54 +31,48 @@ *
This class generates terms to achive this: First the numerical integer values need to
* be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
* and the bits are converted to ASCII chars with each 7 bit. The resulting string is
- * sortable like the original integer value.
+ * sortable like the original integer value. Each value is also prefixed
+ * (in the first char) by the shift value (number of bits removed) used
+ * during encoding.
*
To also index floating point numbers, this class supplies two methods to convert them * to integer values by changing their bit layout: {@link #doubleToSortableLong}, * {@link #floatToSortableInt}. You will have no precision loss by * converting floating point numbers to integers and back (only that the integer form * is not usable). Other data types like dates can easily converted to longs or ints (e.g. * date to long: {@link java.util.Date#getTime}). - *
To index the different precisions of the long values each encoded value is also reduced
- * by zeroing bits from the right. Each value is also prefixed (in the first char) by the
- * shift value (number of bits removed) used during encoding. This series of
- * different precision values can be indexed into a Lucene {@link Document} using
- * {@link #addIndexedFields(Document,String,String[])}. The default is to index the original
- * precision in the supplied field name and the lower precisions in an additional helper field.
- * Because of this, the full-precision field can also be sorted (using {@link #getLongSortField}
- * or {@link #getIntSortField}).
- *
The number of bits removed from the right for each trie entry is called
- * precisionStep in this API. For comparing the different step values, see the
- * {@linkplain org.apache.lucene.search.trie package description}.
+ *
Prefix encoded fields can also be sorted using the {@link SortField} factories
+ * {@link #getLongSortField} or {@link #getIntSortField}.
*/
public final class TrieUtils {
private TrieUtils() {} // no instance!
/**
- * The default "helper" field containing the lower precision terms is the original
- * fieldname with this appended. This suffix is used in
- * {@link #addIndexedFields(Document,String,String[])} and the corresponding c'tor
- * of To achieve this, use {@link #addIndexedFields(Document,String,String[])}.
- */
- public static String[] trieCodeLong(long val, int precisionStep) {
- if (precisionStep<1 || precisionStep>64)
- throw new IllegalArgumentException("precisionStep may only be 1..64");
- String[] arr = new String[63/precisionStep+1];
- int idx = 0;
- for (int shift=0; shift<64; shift+=precisionStep) {
- arr[idx++] = longToPrefixCoded(val, shift);
- }
- return arr;
- }
-
- /**
- * Returns a sequence of trie coded numbers suitable for {@link IntTrieRangeFilter}.
- * Each successive string in the list has had it's precision reduced by To achieve this, use {@link #addIndexedFields(Document,String,String[])}.
- */
- public static String[] trieCodeInt(int val, int precisionStep) {
- if (precisionStep<1 || precisionStep>32)
- throw new IllegalArgumentException("precisionStep may only be 1..32");
- String[] arr = new String[31/precisionStep+1];
- int idx = 0;
- for (int shift=0; shift<32; shift+=precisionStep) {
- arr[idx++] = intToPrefixCoded(val, shift);
- }
- return arr;
- }
-
- /**
- * Indexes the full precision value only in the main field (for sorting), and indexes all other
- * lower precision values in This is the recommended variant to add trie fields to the index.
- * By this it is possible to sort the field using a This method does not store the fields and saves no term frequency or norms
- * (which are normally not needed for trie fields). If you want to additionally store
- * the value, you can use the normal methods of {@link Document} to achive this, just specify
- * Examples:
- * This method does not store the fields and saves no term frequency or norms
- * (which are normally not needed for trie fields). If you want to additionally store
- * the value, you can use the normal methods of {@link Document} to achive this, just specify
- * Examples:
- * This method does not store the fields and saves no term frequency or norms
- * (which are normally not needed for trie fields). If you want to additionally store
- * the value, you can use the normal methods of {@link Document} to achive this, just specify
- * This method is used by {@link IntTrieRangeFilter}.
*/
@@ -419,7 +336,7 @@
final Object builder, final int valSize,
final int precisionStep, long minBound, long maxBound
) {
- for (int level=0,shift=0;; level++) {
+ for (int shift=0; ; shift += precisionStep) {
// calculate new bounds for inner precision
final long diff = 1L << (shift+precisionStep),
- mask = ((1L<<precisionStep) - 1L) << shift;
+ /** <p>Expert: DO NOT USE IN YOUR OWN CODE! It is just public
+ * to be accessible from o.a.l.search.trie.
+ */
+ public static class StopFillCacheException extends RuntimeException {
+ }
+
/** Interface to parse bytes from document fields.
* @see FieldCache#getBytes(IndexReader, String, FieldCache.ByteParser)
*/
Index: src/java/org/apache/lucene/search/FieldCacheImpl.java
===================================================================
--- src/java/org/apache/lucene/search/FieldCacheImpl.java (revision 761725)
+++ src/java/org/apache/lucene/search/FieldCacheImpl.java (working copy)
@@ -196,6 +196,7 @@
retArray[termDocs.doc()] = termval;
}
} while (termEnum.next());
+ } catch (StopFillCacheException stop) {
} finally {
termDocs.close();
termEnum.close();
@@ -235,6 +236,7 @@
retArray[termDocs.doc()] = termval;
}
} while (termEnum.next());
+ } catch (StopFillCacheException stop) {
} finally {
termDocs.close();
termEnum.close();
@@ -274,6 +276,7 @@
retArray[termDocs.doc()] = termval;
}
} while (termEnum.next());
+ } catch (StopFillCacheException stop) {
} finally {
termDocs.close();
termEnum.close();
@@ -315,6 +318,7 @@
retArray[termDocs.doc()] = termval;
}
} while (termEnum.next());
+ } catch (StopFillCacheException stop) {
} finally {
termDocs.close();
termEnum.close();
(Long|Int)TrieRangeFilter.
- */
- public static final String LOWER_PRECISION_FIELD_NAME_SUFFIX="#trie";
-
- /**
* Longs are stored at lower precision by shifting off lower bits. The shift count is
* stored as SHIFT_START_LONG+shift in the first character
*/
public static final char SHIFT_START_LONG = (char)0x20;
+ /** internal: maximum needed char[] buffer size for encoding */
+ static final int LONG_BUF_SIZE = 63/7 + 2;
+
/**
* Integers are stored at lower precision by shifting off lower bits. The shift count is
* stored as SHIFT_START_INT+shift in the first character
*/
public static final char SHIFT_START_INT = (char)0x60;
+ /** internal: maximum needed char[] buffer size for encoding */
+ static final int INT_BUF_SIZE = 31/7 + 2;
+
/**
* A parser instance for filling a {@link ExtendedFieldCache}, that parses prefix encoded fields as longs.
*/
public static final ExtendedFieldCache.LongParser FIELD_CACHE_LONG_PARSER=new ExtendedFieldCache.LongParser(){
public final long parseLong(final String val) {
+ final int shift = val.charAt(0)-SHIFT_START_LONG;
+ if (shift>0 && shift<=63)
+ throw new FieldCache.StopFillCacheException();
return prefixCodedToLong(val);
}
};
@@ -92,6 +82,9 @@
*/
public static final FieldCache.IntParser FIELD_CACHE_INT_PARSER=new FieldCache.IntParser(){
public final int parseInt(final String val) {
+ final int shift = val.charAt(0)-SHIFT_START_INT;
+ if (shift>0 && shift<=31)
+ throw new FieldCache.StopFillCacheException();
return prefixCodedToInt(val);
}
};
@@ -102,6 +95,9 @@
*/
public static final ExtendedFieldCache.DoubleParser FIELD_CACHE_DOUBLE_PARSER=new ExtendedFieldCache.DoubleParser(){
public final double parseDouble(final String val) {
+ final int shift = val.charAt(0)-SHIFT_START_LONG;
+ if (shift>0 && shift<=63)
+ throw new FieldCache.StopFillCacheException();
return sortableLongToDouble(prefixCodedToLong(val));
}
};
@@ -112,9 +108,28 @@
*/
public static final FieldCache.FloatParser FIELD_CACHE_FLOAT_PARSER=new FieldCache.FloatParser(){
public final float parseFloat(final String val) {
+ final int shift = val.charAt(0)-SHIFT_START_INT;
+ if (shift>0 && shift<=31)
+ throw new FieldCache.StopFillCacheException();
return sortableIntToFloat(prefixCodedToInt(val));
}
};
+
+ /** internal */
+ static int longToPrefixCoded(final long val, final int shift, final char[] buffer) {
+ int nChars = (63-shift)/7 + 1, len = nChars+1;
+ buffer[0] = (char)(SHIFT_START_LONG + shift);
+ long sortableBits = val ^ 0x8000000000000000L;
+ sortableBits >>>= shift;
+ while (nChars>=1) {
+ // Store 7 bits per character for good efficiency when UTF-8 encoding.
+ // The whole number is right-justified so that lucene can prefix-encode
+ // the terms more efficiently.
+ buffer[nChars--] = (char)(sortableBits & 0x7f);
+ sortableBits >>>= 7;
+ }
+ return len;
+ }
/**
* This is a convenience method, that returns prefix coded bits of a long without
@@ -125,27 +140,33 @@
public static String longToPrefixCoded(final long val) {
return longToPrefixCoded(val, 0);
}
-
+
/**
* Expert: Returns prefix coded bits after reducing the precision by shift bits.
- * This is method is used by {@link #trieCodeLong}.
+ * This method is used by {@link LongRangeBuilder}.
*/
public static String longToPrefixCoded(final long val, final int shift) {
if (shift>63 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
- int nChars = (63-shift)/7 + 1;
- final char[] arr = new char[nChars+1];
- arr[0] = (char)(SHIFT_START_LONG + shift);
- long sortableBits = val ^ 0x8000000000000000L;
+ final char[] buffer = new char[LONG_BUF_SIZE];
+ final int len = longToPrefixCoded(val, shift, buffer);
+ return new String(buffer, 0, len);
+ }
+
+ /** internal */
+ static int intToPrefixCoded(final int val, final int shift, final char[] buffer) {
+ int nChars = (31-shift)/7 + 1, len = nChars+1;
+ buffer[0] = (char)(SHIFT_START_INT + shift);
+ int sortableBits = val ^ 0x80000000;
sortableBits >>>= shift;
while (nChars>=1) {
// Store 7 bits per character for good efficiency when UTF-8 encoding.
// The whole number is right-justified so that lucene can prefix-encode
// the terms more efficiently.
- arr[nChars--] = (char)(sortableBits & 0x7f);
+ buffer[nChars--] = (char)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
- return new String(arr);
+ return len;
}
/**
@@ -160,39 +181,30 @@
/**
* Expert: Returns prefix coded bits after reducing the precision by shift bits.
- * This is method is used by {@link #trieCodeInt}.
+ * This method is used by {@link IntRangeBuilder}.
*/
public static String intToPrefixCoded(final int val, final int shift) {
if (shift>31 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..31");
- int nChars = (31-shift)/7 + 1;
- final char[] arr = new char[nChars+1];
- arr[0] = (char)(SHIFT_START_INT + shift);
- int sortableBits = val ^ 0x80000000;
- sortableBits >>>= shift;
- while (nChars>=1) {
- // Store 7 bits per character for good efficiency when UTF-8 encoding.
- // The whole number is right-justified so that lucene can prefix-encode
- // the terms more efficiently.
- arr[nChars--] = (char)(sortableBits & 0x7f);
- sortableBits >>>= 7;
- }
- return new String(arr);
+ final char[] buffer = new char[INT_BUF_SIZE];
+ final int len = intToPrefixCoded(val, shift, buffer);
+ return new String(buffer, 0, len);
}
/**
* Returns a long from prefixCoded characters.
* Rightmost bits will be zero for lower precision codes.
* This method can be used to decode e.g. a stored field.
+ * @throws NumberFormatException if the supplied char sequence is
+ * not correctly prefix encoded.
* @see #longToPrefixCoded(long)
*/
- public static long prefixCodedToLong(final String prefixCoded) {
- final int len = prefixCoded.length();
+ public static long prefixCodedToLong(final CharSequence prefixCoded) {
final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG;
if (shift>63 || shift<0)
- throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
+ throw new NumberFormatException("Invalid shift value in prefixCoded char sequence (is encoded value really a LONG?)");
long sortableBits = 0L;
- for (int i=1; i<len; i++) {
- * Each successive string in the list has had it's precision reduced by <code>precisionStep</code>.
- * For sorting, index the first full-precision value into a separate field and the
- * remaining values into another field.
- * precisionStep.
- * For sorting, index the first full-precision value into a separate field and the
- * remaining values into another field.
- * field+LOWER_PRECISION_FIELD_NAME_SUFFIX.
- * SortField instance
- * returned by {@link #getLongSortField} or {@link #getIntSortField}.
- * Field.Store.YES, Field.Index.NO and the same field name.
- *
- * addIndexedFields(doc, "mydouble", trieCodeLong(doubleToSortableLong(1.414d), 4));
- * addIndexedFields(doc, "mylong", trieCodeLong(123456L, 4));
- *
- **/
- public static void addIndexedFields(Document doc, String field, String[] trieCoded) {
- addIndexedFields(doc, new String[]{field, field+LOWER_PRECISION_FIELD_NAME_SUFFIX}, trieCoded);
- }
-
- /**
- * Expert: Indexes the full precision value only in the main field (for sorting), and indexes all other
- * lower precision values in the lowerPrecision field.
- * If you do not specify the same field name for the main and lower precision one,
- * it is possible to sort the field using a SortField instance
- * returned by {@link #getLongSortField} or {@link #getIntSortField}.
- * Field.Store.YES, Field.Index.NO and the same main field name.
- *
- * addIndexedFields(doc, "mydouble", "mydoubletrie", trieCodeLong(doubleToSortableLong(1.414d), 4));
- * addIndexedFields(doc, "mylong", "mylongtrie", trieCodeLong(123456L, 4));
- *
- * @see #addIndexedFields(Document,String,String[])
- **/
- public static void addIndexedFields(Document doc, String field, String lowerPrecisionField, String[] trieCoded) {
- addIndexedFields(doc, new String[]{field, lowerPrecisionField}, trieCoded);
- }
-
- /**
- * Expert: Indexes a series of trie coded values into a lucene {@link Document}
- * using the given field names.
- * If the array of field names is shorter than the trie coded one, all trie coded
- * values with higher index get the last field name.
- * Field.Store.YES, Field.Index.NO and the same main field name.
- **/
- public static void addIndexedFields(Document doc, String[] fields, String[] trieCoded) {
- for (int i=0; i
- * String field = fields[Math.min(fields.length-1, level)];
- *
+ * You can directly build classical (inclusive) range queries from them.
*/
- public void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
+ public void addRange(String minPrefixCoded, String maxPrefixCoded) {
throw new UnsupportedOperationException();
}
@@ -501,10 +410,8 @@
* Overwrite this method, if you like to receive the raw long range bounds.
* You can use this for e.g. debugging purposes (print out range bounds).
*/
- public void addRange(final long min, final long max, final int shift, final int level) {
- /*System.out.println(Long.toHexString((min^0x8000000000000000L) >>> shift)+".."+
- Long.toHexString((max^0x8000000000000000L) >>> shift));*/
- addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift), level);
+ public void addRange(final long min, final long max, final int shift) {
+ addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift));
}
}
@@ -519,16 +426,9 @@
/**
* Overwrite this method, if you like to receive the already prefix encoded range bounds.
- * You can directly build classical range queries from them.
- * The level gives the precision level (0 = highest precision) of the encoded values.
- * This parameter could be used as an index to an array of fieldnames like the
- * parameters to {@link #addIndexedFields(Document,String[],String[])} for specifying
- * the field names for each precision:
- *
- * String field = fields[Math.min(fields.length-1, level)];
- *
+ * You can directly build classical (inclusive) range queries from them.
*/
- public void addRange(String minPrefixCoded, String maxPrefixCoded, int level) {
+ public void addRange(String minPrefixCoded, String maxPrefixCoded) {
throw new UnsupportedOperationException();
}
@@ -536,10 +436,8 @@
* Overwrite this method, if you like to receive the raw int range bounds.
* You can use this for e.g. debugging purposes (print out range bounds).
*/
- public void addRange(final int min, final int max, final int shift, final int level) {
- /*System.out.println(Integer.toHexString((min^0x80000000) >>> shift)+".."+
- Integer.toHexString((max^0x80000000) >>> shift));*/
- addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift), level);
+ public void addRange(final int min, final int max, final int shift) {
+ addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift));
}
}
Index: contrib/queries/src/test/org/apache/lucene/search/trie/TestIntTrieRangeFilter.java
===================================================================
--- contrib/queries/src/test/org/apache/lucene/search/trie/TestIntTrieRangeFilter.java (revision 761725)
+++ contrib/queries/src/test/org/apache/lucene/search/trie/TestIntTrieRangeFilter.java (working copy)
@@ -42,6 +42,15 @@
// number of docs to generate for testing
private static final int noDocs = 10000;
+ private static Field newField(String name, int precisionStep) {
+ IntTrieTokenStream stream = new IntTrieTokenStream(precisionStep);
+ stream.setUseNewAPI(true);
+ Field f=new Field(name, stream);
+ f.setOmitTermFreqAndPositions(true);
+ f.setOmitNorms(true);
+ return f;
+ }
+
private static final RAMDirectory directory;
private static final IndexSearcher searcher;
static {
@@ -50,21 +59,34 @@
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
true, MaxFieldLength.UNLIMITED);
+ Field
+ field8 = newField("field8", 8),
+ field4 = newField("field4", 4),
+ field2 = newField("field2", 2),
+ ascfield8 = newField("ascfield8", 8),
+ ascfield4 = newField("ascfield4", 4),
+ ascfield2 = newField("ascfield2", 2);
+
// Add a series of noDocs docs with increasing int values
for (int l=0; l