Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 784883) +++ CHANGES.txt (working copy) @@ -405,8 +405,15 @@ via Mike McCandless) 26. LUCENE-1550: Added new n-gram based String distance measure for spell checking. - See the Javadocs for NGramDistance.java for a reference paper on why this is helpful (Tom Morton via Grant Ingersoll) - + See the Javadocs for NGramDistance.java for a reference paper on why + this is helpful (Tom Morton via Grant Ingersoll) + +27. LUCENE-1470, LUCENE-1582, LUCENE-1602, LUCENE-1673: Added + NumericRangeQuery and NumericRangeFilter, a fast alternative to + RangeQuery/RangeFilter for numeric searches. They depend on a specific + structure of terms in the index that can be created by indexing + using the new NumericTokenStream class. (Uwe Schindler, + Yonik Seeley, Mike McCandless) Optimizations Index: src/java/org/apache/lucene/analysis/NumericTokenStream.java =================================================================== --- src/java/org/apache/lucene/analysis/NumericTokenStream.java (revision 0) +++ src/java/org/apache/lucene/analysis/NumericTokenStream.java (revision 0) @@ -0,0 +1,241 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.TrieUtils; +import org.apache.lucene.search.NumericRangeQuery; // for javadocs +import org.apache.lucene.search.NumericRangeFilter; // for javadocs +import org.apache.lucene.analysis.tokenattributes.ShiftAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +/** + * This class provides a {@link TokenStream} for indexing numeric values + * that can be used by {@link NumericRangeQuery}/{@link NumericRangeFilter}. + * For more information, how to use this class and its configuration properties + * (precisionStep) read the docs of {@link NumericRangeQuery}. + * + *

This stream is not intended to be used in analyzers; it is intended for iterating over the + * different precisions when indexing a specific numeric value. + * A numeric value is indexed as multiple string encoded terms, each reduced + * by zeroing bits from the right. Each value is also prefixed (in the first char) by the + * shift value (number of bits removed) used during encoding. + * The number of bits removed from the right for each trie entry is called + * precisionStep in this API. + * + *

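As an illustrative sketch (not part of the patch), the terms emitted for a single value can be inspected like this; the value and precisionStep are arbitrary, and the attribute and setUseNewAPI calls mirror what TestNumericTokenStream does:

    // Print every prefix-encoded term produced for one long value.
    NumericTokenStream stream = new NumericTokenStream(8).setLongValue(1234567890L);
    stream.setUseNewAPI(true); // same transitional switch the unit tests use
    TermAttribute term = (TermAttribute) stream.getAttribute(TermAttribute.class);
    ShiftAttribute shift = (ShiftAttribute) stream.getAttribute(ShiftAttribute.class);
    while (stream.incrementToken()) {
      // each term is the original value with the lowest 'shift' bits removed
      System.out.println("shift=" + shift.getShift() + ", value="
          + TrieUtils.prefixCodedToLong(term.term()));
    }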
The usage pattern is (it is recommended to switch off norms and term frequencies + * for numeric fields; it does not make sense to have them): + *

+ *  Field field = new Field(name, new NumericTokenStream(precisionStep).set???Value(value));
+ *  field.setOmitNorms(true);
+ *  field.setOmitTermFreqAndPositions(true);
+ *  document.add(field);
+ * 
+ *

For optimal performance, re-use the TokenStream and Field instances + * for more than one document: + *

+ *  // init
+ *  NumericTokenStream stream = new NumericTokenStream(precisionStep);
+ *  Field field = new Field(name, stream);
+ *  field.setOmitNorms(true);
+ *  field.setOmitTermFreqAndPositions(true);
+ *  Document document = new Document();
+ *  document.add(field);
+ *  // use this code to index many documents:
+ *  stream.set???Value(value1);
+ *  writer.addDocument(document);
+ *  stream.set???Value(value2);
+ *  writer.addDocument(document);
+ *  ...
+ * 
+ *

Please note: Token streams are read, when the document is added to index. + * If you index more than one numeric field, use a separate instance for each. + * @since 2.9 + */ +public final class NumericTokenStream extends TokenStream { + + /** The full precision 64 bit token gets this token type assigned. */ + public static final String TOKEN_TYPE_FULL_PREC_64 = "fullPrecNumeric64"; + + /** The lower precision 64 bit tokens gets this token type assigned. */ + public static final String TOKEN_TYPE_LOWER_PREC_64 = "lowerPrecNumeric64"; + + /** The full precision 32 bit token gets this token type assigned. */ + public static final String TOKEN_TYPE_FULL_PREC_32 = "fullPrecNumeric32"; + + /** The lower precision 32 bit tokens gets this token type assigned. */ + public static final String TOKEN_TYPE_LOWER_PREC_32 = "lowerPrecNumeric32"; + + /** + * Creates a token stream for numeric values. The stream is not yet initialized, + * before using set a value using the various set???Value() methods. + */ + public NumericTokenStream(final int precisionStep) { + this.precisionStep = precisionStep; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + shiftAtt = (ShiftAttribute) addAttribute(ShiftAttribute.class); + } + + /** + * Initializes the token stream with the supplied long value. + * @param value the value, for which this TokenStream should enumerate tokens. + * @return this instance, because of this you can use it the following way: + * new Field(name, new NumericTokenStream(precisionStep).setLongValue(value)) + */ + public NumericTokenStream setLongValue(final long value) { + this.value = value; + valSize = 64; + shift = 0; + return this; + } + + /** + * Initializes the token stream with the supplied int value. + * @param value the value, for which this TokenStream should enumerate tokens. + * @return this instance, because of this you can use it the following way: + * new Field(name, new NumericTokenStream(precisionStep).setIntValue(value)) + */ + public NumericTokenStream setIntValue(final int value) { + this.value = (long) value; + valSize = 32; + shift = 0; + return this; + } + + /** + * Initializes the token stream with the supplied double value. + * @param value the value, for which this TokenStream should enumerate tokens. + * @return this instance, because of this you can use it the following way: + * new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value)) + */ + public NumericTokenStream setDoubleValue(final double value) { + this.value = TrieUtils.doubleToSortableLong(value); + valSize = 64; + shift = 0; + return this; + } + + /** + * Initializes the token stream with the supplied float value. + * @param value the value, for which this TokenStream should enumerate tokens. 
+ * @return this instance, because of this you can use it the following way: + * new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value)) + */ + public NumericTokenStream setFloatValue(final float value) { + this.value = (long) TrieUtils.floatToSortableInt(value); + valSize = 32; + shift = 0; + return this; + } + + // @Override + public void reset() { + if (valSize == 0) + throw new IllegalStateException("call set???Value() before usage"); + if (precisionStep < 1 || precisionStep > valSize) + throw new IllegalArgumentException("precisionStep may only be 1.."+valSize); + shift = 0; + } + + // @Override + public boolean incrementToken() { + if (valSize == 0) + throw new IllegalStateException("call set???Value() before usage"); + if (shift >= valSize) + return false; + + final char[] buffer; + switch (valSize) { + case 64: + buffer = termAtt.resizeTermBuffer(TrieUtils.LONG_BUF_SIZE); + termAtt.setTermLength(TrieUtils.longToPrefixCoded(value, shift, buffer)); + typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_64 : TOKEN_TYPE_LOWER_PREC_64); + break; + + case 32: + buffer = termAtt.resizeTermBuffer(TrieUtils.INT_BUF_SIZE); + termAtt.setTermLength(TrieUtils.intToPrefixCoded((int) value, shift, buffer)); + typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_32 : TOKEN_TYPE_LOWER_PREC_32); + break; + + default: + // should not happen + throw new IllegalArgumentException("valSize must be 32 or 64"); + } + + shiftAtt.setShift(shift); + posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0); + shift += precisionStep; + return true; + } + + // @Override + /** @deprecated Will be removed in Lucene 3.0 */ + public Token next(final Token reusableToken) { + if (valSize == 0) + throw new IllegalStateException("call set???Value() before usage"); + if (shift >= valSize) + return null; + + reusableToken.clear(); + + final char[] buffer; + switch (valSize) { + case 64: + buffer = reusableToken.resizeTermBuffer(TrieUtils.LONG_BUF_SIZE); + reusableToken.setTermLength(TrieUtils.longToPrefixCoded(value, shift, buffer)); + reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_64 : TOKEN_TYPE_LOWER_PREC_64); + break; + + case 32: + buffer = reusableToken.resizeTermBuffer(TrieUtils.INT_BUF_SIZE); + reusableToken.setTermLength(TrieUtils.intToPrefixCoded((int) value, shift, buffer)); + reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC_32 : TOKEN_TYPE_LOWER_PREC_32); + break; + + default: + // should not happen + throw new IllegalArgumentException("valSize must be 32 or 64"); + } + + reusableToken.setPositionIncrement((shift == 0) ? 
1 : 0); + shift += precisionStep; + return reusableToken; + } + + // @Override + public String toString() { + final StringBuffer sb = new StringBuffer("(numeric,valSize=").append(valSize); + sb.append(",precisionStep=").append(precisionStep).append(')'); + return sb.toString(); + } + + // members + private final TermAttribute termAtt; + private final TypeAttribute typeAtt; + private final PositionIncrementAttribute posIncrAtt; + private final ShiftAttribute shiftAtt; + + private int shift = 0, valSize = 0; // valSize==0 means not initialized + private final int precisionStep; + + private long value = 0L; +} Property changes on: src\java\org\apache\lucene\analysis\NumericTokenStream.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/analysis/tokenattributes/ShiftAttribute.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/ShiftAttribute.java (revision 0) +++ src/java/org/apache/lucene/analysis/tokenattributes/ShiftAttribute.java (revision 0) @@ -0,0 +1,72 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.util.Attribute; + +/** + * This attribute is updated by {@link NumericTokenStream} + * to the shift value of the current prefix-encoded token. + * It may be used by filters or consumers to e.g. distribute the values to various fields. + * @since 2.9 + */ +public final class ShiftAttribute extends Attribute implements Cloneable, Serializable { + private int shift = 0; + + /** + * Returns the shift value of the current prefix encoded token. + */ + public int getShift() { + return shift; + } + + /** + * Sets the shift value. 
+ */ + public void setShift(final int shift) { + this.shift = shift; + } + + public void clear() { + shift = 0; + } + + public String toString() { + return "shift=" + shift; + } + + public boolean equals(Object other) { + if (this == other) return true; + if (other instanceof ShiftAttribute) { + return ((ShiftAttribute) other).shift == shift; + } + return false; + } + + public int hashCode() { + return shift; + } + + public void copyTo(Attribute target) { + final ShiftAttribute t = (ShiftAttribute) target; + t.setShift(shift); + } +} Property changes on: src\java\org\apache\lucene\analysis\tokenattributes\ShiftAttribute.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/NumericRangeFilter.java =================================================================== --- src/java/org/apache/lucene/search/NumericRangeFilter.java (revision 0) +++ src/java/org/apache/lucene/search/NumericRangeFilter.java (revision 0) @@ -0,0 +1,84 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs + +/** + * Implementation of a Lucene {@link Filter} that implements trie-based range filtering + * for numeric values. For more information about the algorithm look into the docs of + * {@link NumericRangeQuery}. + * + *

This filter depends on a specific structure of terms in the index that can only be created + * by indexing using {@link NumericTokenStream}. + * @since 2.9 + **/ +public final class NumericRangeFilter extends MultiTermQueryWrapperFilter { + + private NumericRangeFilter(final NumericRangeQuery query) { + super(query); + } + + public static NumericRangeFilter newLongRange(final String field, final int precisionStep, + Long min, Long max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newLongRange(field, precisionStep, min, max, minInclusive, maxInclusive) + ); + } + + public static NumericRangeFilter newIntRange(final String field, final int precisionStep, + Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newIntRange(field, precisionStep, min, max, minInclusive, maxInclusive) + ); + } + + public static NumericRangeFilter newDoubleRange(final String field, final int precisionStep, + Double min, Double max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newDoubleRange(field, precisionStep, min, max, minInclusive, maxInclusive) + ); + } + + public static NumericRangeFilter newFloatRange(final String field, final int precisionStep, + Float min, Float max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeFilter( + NumericRangeQuery.newFloatRange(field, precisionStep, min, max, minInclusive, maxInclusive) + ); + } + + /** Returns the field name for this filter */ + public String getField() { return ((NumericRangeQuery)query).getField(); } + + /** Returns true if the lower endpoint is inclusive */ + public boolean includesMin() { return ((NumericRangeQuery)query).includesMin(); } + + /** Returns true if the upper endpoint is inclusive */ + public boolean includesMax() { return ((NumericRangeQuery)query).includesMax(); } + + /** Returns the lower value of this range filter */ + public Number getMin() { return ((NumericRangeQuery)query).getMin(); } + + /** Returns the upper value of this range filter */ + public Number getMax() { return ((NumericRangeQuery)query).getMax(); } + +} Property changes on: src\java\org\apache\lucene\search\NumericRangeFilter.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/NumericRangeQuery.java =================================================================== --- src/java/org/apache/lucene/search/NumericRangeQuery.java (revision 0) +++ src/java/org/apache/lucene/search/NumericRangeQuery.java (revision 0) @@ -0,0 +1,359 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.LinkedList; + +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.util.TrieUtils; +import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +/** + * Implementation of a Lucene {@link Query} that implements trie-based range querying + * for numeric values. This query depends on a specific structure of terms in the index + * that can only be created by indexing using {@link NumericTokenStream}. + * + *

See the publication about panFMP, + * where this algorithm was described: + * + *

Schindler, U, Diepenbroek, M, 2008. + * Generic XML-based Framework for Metadata Portals. + * Computers & Geosciences 34 (12), 1947-1955. + * doi:10.1016/j.cageo.2008.02.023
+ * + *

A quote from this paper: Because Apache Lucene is a full-text + * search engine and not a conventional database, it cannot handle numerical ranges + * (e.g., field value is inside user defined bounds, even dates are numerical values). + * We have developed an extension to Apache Lucene that stores + * the numerical values in a special string-encoded format with variable precision + * (all numerical values like doubles, longs, floats, and ints are converted to + * lexicographic sortable string representations and stored with different precisions). + * For a more detailed description of how the values are stored, + * see {@link TrieUtils}. A range is then divided recursively into multiple intervals for searching: + * The center of the range is searched only with the lowest possible precision in the trie, + * while the boundaries are matched more exactly. This reduces the number of terms dramatically.

+ * + *

For the variant that stores long values in 8 different precisions (each reduced by 8 bits) and + * uses a lowest precision of 1 byte, the index contains only a maximum of 256 distinct values in the + * lowest precision. Overall, a range could consist of a theoretical maximum of + * 7*255*2 + 255 = 3825 distinct terms (when there is a term for every distinct value of an + * 8-byte-number in the index and the range covers almost all of them; a maximum of 255 distinct values is used + * because it would always be possible to reduce the full 256 values to one term with degraded precision). + * In practice, we have seen up to 300 terms in most cases (index with 500,000 metadata records + * and a uniform value distribution).

+ * + *

You can choose any precisionStep when encoding values. + * Lower step values mean more precisions and thus more terms in the index (and a larger index). + * On the other hand, the maximum number of terms to match is reduced, which improves query speed. + * The formula to calculate the maximum term count is: + *

+ *  n = [ (bitsPerValue/precisionStep - 1) * (2^precisionStep - 1 ) * 2 ] + (2^precisionStep - 1 )
+ * 
+ *

(This formula is only correct when bitsPerValue/precisionStep is an integer; + * in other cases, the value must be rounded up and the last summand must use the remainder of the division as its + * precision step.) + * For longs stored using a precision step of 4, n = 15*15*2 + 15 = 465, and for a precision + * step of 2, n = 31*3*2 + 3 = 189. However, the gain in search speed is partly offset by additional seeking + * in the term enum of the index. Because of this, the ideal precisionStep value can only + * be determined by testing. Important: You can index with a lower precision step value and test search speed + * using a multiple of the original step value.

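As a worked check of this formula (not in the original javadoc): for 64 bit values and a precision step of 8, n = (64/8 - 1) * (2^8 - 1) * 2 + (2^8 - 1) = 7*255*2 + 255 = 3825, which matches the theoretical maximum quoted earlier for the 8 bit variant.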
+ * + *

This dramatically improves the performance of Apache Lucene with range queries, which + * are no longer dependent on the index size and the number of distinct values because there is + * an upper limit unrelated to either of these properties.

+ * + *

Comparisons of the different types of RangeQueries on an index with about 500,000 docs showed + * that the old {@link RangeQuery} (with raised {@link BooleanQuery} clause count) took about 30-40 + * secs to complete, {@link ConstantScoreRangeQuery} took 5 secs and executing + * this class took <100ms to complete (on an Opteron64 machine, Java 1.5, 8 bit precision step). + * This query type was developed for a geographic portal, where the performance for + * e.g. bounding boxes or exact date/time stamps is important.

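A minimal usage sketch (not from the patch; the field name, bounds, and searcher are placeholders) showing how such a query is built with the factory methods below:

    // Match documents whose "price" field, indexed with precisionStep=8,
    // lies in the inclusive range [10, 500]; pass null for an open-ended bound.
    Query q = NumericRangeQuery.newIntRange("price", 8,
        new Integer(10), new Integer(500), true, true);
    TopDocs hits = searcher.search(q, 10);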
+ * + *

The query is in {@linkplain #setConstantScoreRewrite constant score mode} per default. + * With precision steps of ≤4, this query can be run in conventional {@link BooleanQuery} + * rewrite mode without changing the max clause count. + * @since 2.9 + **/ +public final class NumericRangeQuery extends MultiTermQuery { + + private NumericRangeQuery(final String field, final int precisionStep, final int valSize, + Number min, Number max, final boolean minInclusive, final boolean maxInclusive + ) { + assert (valSize == 32 || valSize == 64); + if (precisionStep < 1 || precisionStep > valSize) + throw new IllegalArgumentException("precisionStep may only be 1.."+valSize); + this.field = field.intern(); + this.precisionStep = precisionStep; + this.valSize = valSize; + this.min = min; + this.max = max; + this.minInclusive = minInclusive; + this.maxInclusive = maxInclusive; + setConstantScoreRewrite(true); + } + + public static NumericRangeQuery newLongRange(final String field, final int precisionStep, + Long min, Long max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, precisionStep, 64, min, max, minInclusive, maxInclusive); + } + + public static NumericRangeQuery newIntRange(final String field, final int precisionStep, + Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, precisionStep, 32, min, max, minInclusive, maxInclusive); + } + + public static NumericRangeQuery newDoubleRange(final String field, final int precisionStep, + Double min, Double max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, precisionStep, 64, min, max, minInclusive, maxInclusive); + } + + public static NumericRangeQuery newFloatRange(final String field, final int precisionStep, + Float min, Float max, final boolean minInclusive, final boolean maxInclusive + ) { + return new NumericRangeQuery(field, precisionStep, 32, min, max, minInclusive, maxInclusive); + } + + //@Override + protected FilteredTermEnum getEnum(final IndexReader reader) throws IOException { + return new NumericRangeTermEnum(reader); + } + + /** Returns the field name for this query */ + public String getField() { return field; } + + /** Returns true if the lower endpoint is inclusive */ + public boolean includesMin() { return minInclusive; } + + /** Returns true if the upper endpoint is inclusive */ + public boolean includesMax() { return maxInclusive; } + + /** Returns the lower value of this range query */ + public Number getMin() { return min; } + + /** Returns the upper value of this range query */ + public Number getMax() { return max; } + + //@Override + public String toString(final String field) { + final StringBuffer sb = new StringBuffer(); + if (!this.field.equals(field)) sb.append(this.field).append(':'); + return sb.append(minInclusive ? '[' : '{') + .append((min == null) ? "*" : min.toString()) + .append(" TO ") + .append((max == null) ? "*" : max.toString()) + .append(maxInclusive ? ']' : '}') + .append(ToStringUtils.boost(getBoost())) + .toString(); + } + + //@Override + public final boolean equals(final Object o) { + if (o==this) return true; + if (o==null) return false; + if (o instanceof NumericRangeQuery) { + final NumericRangeQuery q=(NumericRangeQuery)o; + return ( + field==q.field && + (q.min == null ? min == null : q.min.equals(min)) && + (q.max == null ? 
max == null : q.max.equals(max)) && + minInclusive == q.minInclusive && + maxInclusive == q.maxInclusive && + precisionStep == q.precisionStep && + getBoost() == q.getBoost() + ); + } + return false; + } + + //@Override + public final int hashCode() { + int hash = Float.floatToIntBits(getBoost()) ^ field.hashCode(); + hash += precisionStep^0x64365465; + if (min != null) hash += min.hashCode()^0x14fa55fb; + if (max != null) hash += max.hashCode()^0x733fa5fe; + return hash+ + (Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+ + (Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe); + } + + // members (package private, to be also fast accessible by NumericRangeTermEnum) + final String field; + final int precisionStep, valSize; + final Number min, max; + final boolean minInclusive,maxInclusive; + + /** + * Subclass of FilteredTermEnum for enumerating all terms that match the + * sub-ranges for trie range queries. + *

+ * WARNING: This term enumeration is not guaranteed to be always ordered by + * {@link Term#compareTo}. + * The ordering depends on how {@link TrieUtils#splitLongRange} and + * {@link TrieUtils#splitIntRange} generates the sub-ranges. For + * {@link MultiTermQuery} ordering is not relevant. + */ + private final class NumericRangeTermEnum extends FilteredTermEnum { + + private final IndexReader reader; + private final LinkedList/**/ rangeBounds = new LinkedList/**/(); + private String currentUpperBound = null; + + NumericRangeTermEnum(final IndexReader reader) throws IOException { + this.reader = reader; + + switch (valSize) { + case 64: { + // lower + long minBound = Long.MIN_VALUE; + if (min instanceof Long) { + minBound = min.longValue(); + } else if (min instanceof Double) { + minBound = TrieUtils.doubleToSortableLong(min.doubleValue()); + } + if (!minInclusive && min != null) minBound++; + + // upper + long maxBound = Long.MAX_VALUE; + if (max instanceof Long) { + maxBound = max.longValue(); + } else if (max instanceof Double) { + maxBound = TrieUtils.doubleToSortableLong(max.doubleValue()); + } + if (!maxInclusive && max != null) maxBound--; + + TrieUtils.splitLongRange(new TrieUtils.LongRangeBuilder() { + //@Override + public final void addRange(String minPrefixCoded, String maxPrefixCoded) { + rangeBounds.add(minPrefixCoded); + rangeBounds.add(maxPrefixCoded); + } + }, precisionStep, minBound, maxBound); + break; + } + + case 32: { + // lower + int minBound = Integer.MIN_VALUE; + if (min instanceof Integer) { + minBound = min.intValue(); + } else if (min instanceof Float) { + minBound = TrieUtils.floatToSortableInt(min.floatValue()); + } + if (!minInclusive && min != null) minBound++; + + // upper + int maxBound = Integer.MAX_VALUE; + if (max instanceof Integer) { + maxBound = max.intValue(); + } else if (max instanceof Float) { + maxBound = TrieUtils.floatToSortableInt(max.floatValue()); + } + if (!maxInclusive && max != null) maxBound--; + + TrieUtils.splitIntRange(new TrieUtils.IntRangeBuilder() { + //@Override + public final void addRange(String minPrefixCoded, String maxPrefixCoded) { + rangeBounds.add(minPrefixCoded); + rangeBounds.add(maxPrefixCoded); + } + }, precisionStep, minBound, maxBound); + break; + } + + default: + // should never happen + throw new IllegalArgumentException("valSize must be 32 or 64"); + } + + // seek to first term + next(); + } + + //@Override + public float difference() { + return 1.0f; + } + + /** this is a dummy, it is not used by this class. */ + //@Override + protected boolean endEnum() { + assert false; // should never be called + return (currentTerm != null); + } + + /** + * Compares if current upper bound is reached, + * this also updates the term count for statistics. + * In contrast to {@link FilteredTermEnum}, a return value + * of false ends iterating the current enum + * and forwards to the next sub-range. + */ + //@Override + protected boolean termCompare(Term term) { + return (term.field() == field && term.text().compareTo(currentUpperBound) <= 0); + } + + /** Increments the enumeration to the next element. True if one exists. 
*/ + //@Override + public boolean next() throws IOException { + // if a current term exists, the actual enum is initialized: + // try change to next term, if no such term exists, fall-through + if (currentTerm != null) { + assert actualEnum!=null; + if (actualEnum.next()) { + currentTerm = actualEnum.term(); + if (termCompare(currentTerm)) return true; + } + } + // if all above fails, we go forward to the next enum, + // if one is available + currentTerm = null; + if (rangeBounds.size() < 2) return false; + // close the current enum and read next bounds + if (actualEnum != null) { + actualEnum.close(); + actualEnum = null; + } + final String lowerBound = (String)rangeBounds.removeFirst(); + this.currentUpperBound = (String)rangeBounds.removeFirst(); + // this call recursively uses next(), if no valid term in + // next enum found. + // if this behavior is changed/modified in the superclass, + // this enum will not work anymore! + setEnum(reader.terms(new Term(field, lowerBound))); + return (currentTerm != null); + } + + /** Closes the enumeration to further activity, freeing resources. */ + //@Override + public void close() throws IOException { + rangeBounds.clear(); + currentUpperBound = null; + super.close(); + } + + } + +} Property changes on: src\java\org\apache\lucene\search\NumericRangeQuery.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/search/RangeFilter.java =================================================================== --- src/java/org/apache/lucene/search/RangeFilter.java (revision 784883) +++ src/java/org/apache/lucene/search/RangeFilter.java (working copy) @@ -22,8 +22,12 @@ /** * A Filter that restricts search results to a range of values in a given * field. - * - * If you construct a large number of range filters with different ranges but on the + * + *

This filter matches documents containing terms that fall within the + * supplied range according to {@link String#compareTo(String)}. It is not intended + * for numerical ranges; use {@link NumericRangeFilter} instead. + * + *

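A minimal sketch of the numeric counterpart (field name, bounds, and searcher are placeholders):

    Filter f = NumericRangeFilter.newIntRange("price", 8,
        new Integer(10), new Integer(500), true, true);
    TopDocs hits = searcher.search(new MatchAllDocsQuery(), f, 10);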
If you construct a large number of range filters with different ranges but on the * same field, {@link FieldCacheRangeFilter} may have significantly better performance. */ public class RangeFilter extends MultiTermQueryWrapperFilter { Index: src/java/org/apache/lucene/search/RangeQuery.java =================================================================== --- src/java/org/apache/lucene/search/RangeQuery.java (revision 784883) +++ src/java/org/apache/lucene/search/RangeQuery.java (working copy) @@ -26,7 +26,11 @@ /** * A Query that matches documents within an exclusive range. * - * See {@link MultiTermQuery#setConstantScoreRewrite} for the tradeoffs between + *

This query matches documents containing terms that fall within the + * supplied range according to {@link String#compareTo(String)}. It is not intended + * for numerical ranges; use {@link NumericRangeQuery} instead. + * + *

See {@link MultiTermQuery#setConstantScoreRewrite} for the tradeoffs between * enabling and disabling constantScoreRewrite mode. */ Index: src/java/org/apache/lucene/util/TrieUtils.java =================================================================== --- src/java/org/apache/lucene/util/TrieUtils.java (revision 0) +++ src/java/org/apache/lucene/util/TrieUtils.java (revision 0) @@ -0,0 +1,492 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.NumericTokenStream; // for javadocs +import org.apache.lucene.search.NumericRangeQuery; // for javadocs +import org.apache.lucene.search.NumericRangeFilter; // for javadocs +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.ExtendedFieldCache; + +/** + * This is a helper class to generate prefix-encoded representations for numerical values + * and supplies converters to represent float/double values as sortable integers/longs. + * + *

To quickly execute range queries in Apache Lucene, a range is divided recursively + * into multiple intervals for searching: The center of the range is searched only with + * the lowest possible precision in the trie, while the boundaries are matched + * more exactly. This reduces the number of terms dramatically. + * + *

This class generates terms to achieve this: First the numerical integer values need to + * be converted to strings. To do that, integer values (32 bit or 64 bit) are made unsigned + * and the bits are converted to ASCII chars, 7 bits per char. The resulting string is + * sortable like the original integer value. Each value is also prefixed + * (in the first char) by the shift value (number of bits removed) used + * during encoding. + * + *

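A small round-trip sketch (the values are arbitrary, not from the patch):

    String full   = TrieUtils.longToPrefixCoded(1234L);     // full precision, shift 0
    long decoded  = TrieUtils.prefixCodedToLong(full);      // == 1234L
    String coarse = TrieUtils.longToPrefixCoded(1234L, 8);  // lowest 8 bits shifted away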
To also index floating point numbers, this class supplies two methods to convert them + * to integer values by changing their bit layout: {@link #doubleToSortableLong}, + * {@link #floatToSortableInt}. There is no precision loss when + * converting floating point numbers to integers and back (only that the integer form + * is not directly usable). Other data types like dates can easily be converted to longs or ints (e.g. + * date to long: {@link java.util.Date#getTime}). + * + *

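A sketch of the lossless, order-preserving conversion (values are arbitrary):

    long a = TrieUtils.doubleToSortableLong(-2.5);
    long b = TrieUtils.doubleToSortableLong(1.5);
    // a < b holds, and the original value is recovered exactly:
    double restored = TrieUtils.sortableLongToDouble(a);    // == -2.5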
For easy usage, the trie algorithm is implemented for indexing inside + * {@link NumericTokenStream} that can index int, long, + * float, and double. For querying, + * {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part + * for the same data types. + * + *

This class can also be used to generate lexicographically sortable (according to + * {@link String#compareTo(String)}) representations of numeric data types for other + * usages (e.g. sorting). + * + *

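A minimal sorting sketch (field name, query, and searcher are placeholders), using the SortField factories described next:

    Sort sort = new Sort(TrieUtils.getLongSortField("timestamp", true)); // reverse order
    TopDocs hits = searcher.search(query, null, 10, sort);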
Prefix encoded fields can also be sorted using the {@link SortField} factories + * {@link #getLongSortField} or {@link #getIntSortField}. + * @since 2.9 + */ +public final class TrieUtils { + + private TrieUtils() {} // no instance! + + /** + * Longs are stored at lower precision by shifting off lower bits. The shift count is + * stored as SHIFT_START_LONG+shift in the first character + */ + public static final char SHIFT_START_LONG = (char)0x20; + + /** + * Expert: The maximum term length (used for char[] buffer size) + * for encoding long values. + * @see #longToPrefixCoded(long,int,char[]) + */ + public static final int LONG_BUF_SIZE = 63/7 + 2; + + /** + * Integers are stored at lower precision by shifting off lower bits. The shift count is + * stored as SHIFT_START_INT+shift in the first character + */ + public static final char SHIFT_START_INT = (char)0x60; + + /** + * Expert: The maximum term length (used for char[] buffer size) + * for encoding int values. + * @see #intToPrefixCoded(int,int,char[]) + */ + public static final int INT_BUF_SIZE = 31/7 + 2; + + /** + * A parser instance for filling a {@link ExtendedFieldCache}, that parses prefix encoded fields as longs. + */ + public static final ExtendedFieldCache.LongParser FIELD_CACHE_LONG_PARSER=new ExtendedFieldCache.LongParser(){ + public final long parseLong(final String val) { + final int shift = val.charAt(0)-SHIFT_START_LONG; + if (shift>0 && shift<=63) + throw new FieldCache.StopFillCacheException(); + return prefixCodedToLong(val); + } + }; + + /** + * A parser instance for filling a {@link FieldCache}, that parses prefix encoded fields as ints. + */ + public static final FieldCache.IntParser FIELD_CACHE_INT_PARSER=new FieldCache.IntParser(){ + public final int parseInt(final String val) { + final int shift = val.charAt(0)-SHIFT_START_INT; + if (shift>0 && shift<=31) + throw new FieldCache.StopFillCacheException(); + return prefixCodedToInt(val); + } + }; + + /** + * A parser instance for filling a {@link ExtendedFieldCache}, that parses prefix encoded fields as doubles. + * This uses {@link #sortableLongToDouble} to convert the encoded long to a double. + */ + public static final ExtendedFieldCache.DoubleParser FIELD_CACHE_DOUBLE_PARSER=new ExtendedFieldCache.DoubleParser(){ + public final double parseDouble(final String val) { + final int shift = val.charAt(0)-SHIFT_START_LONG; + if (shift>0 && shift<=63) + throw new FieldCache.StopFillCacheException(); + return sortableLongToDouble(prefixCodedToLong(val)); + } + }; + + /** + * A parser instance for filling a {@link FieldCache}, that parses prefix encoded fields as floats. + * This uses {@link #sortableIntToFloat} to convert the encoded int to a float. + */ + public static final FieldCache.FloatParser FIELD_CACHE_FLOAT_PARSER=new FieldCache.FloatParser(){ + public final float parseFloat(final String val) { + final int shift = val.charAt(0)-SHIFT_START_INT; + if (shift>0 && shift<=31) + throw new FieldCache.StopFillCacheException(); + return sortableIntToFloat(prefixCodedToInt(val)); + } + }; + + /** + * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * This is method is used by {@link NumericTokenStream}. 
+ * @param val the numeric value + * @param shift how many bits to strip from the right + * @param buffer that will contain the encoded chars, must be at least of {@link #LONG_BUF_SIZE} + * length + * @return number of chars written to buffer + */ + public static int longToPrefixCoded(final long val, final int shift, final char[] buffer) { + int nChars = (63-shift)/7 + 1, len = nChars+1; + buffer[0] = (char)(SHIFT_START_LONG + shift); + long sortableBits = val ^ 0x8000000000000000L; + sortableBits >>>= shift; + while (nChars>=1) { + // Store 7 bits per character for good efficiency when UTF-8 encoding. + // The whole number is right-justified so that lucene can prefix-encode + // the terms more efficiently. + buffer[nChars--] = (char)(sortableBits & 0x7f); + sortableBits >>>= 7; + } + return len; + } + + /** + * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * This is method is used by {@link LongRangeBuilder}. + * @param val the numeric value + * @param shift how many bits to strip from the right + */ + public static String longToPrefixCoded(final long val, final int shift) { + if (shift>63 || shift<0) + throw new IllegalArgumentException("Illegal shift value, must be 0..63"); + final char[] buffer = new char[LONG_BUF_SIZE]; + final int len = longToPrefixCoded(val, shift, buffer); + return new String(buffer, 0, len); + } + + /** + * This is a convenience method, that returns prefix coded bits of a long without + * reducing the precision. It can be used to store the full precision value as a + * stored field in index. + *

To decode, use {@link #prefixCodedToLong}. + */ + public static String longToPrefixCoded(final long val) { + return longToPrefixCoded(val, 0); + } + + /** + * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * This is method is used by {@link NumericTokenStream}. + * @param val the numeric value + * @param shift how many bits to strip from the right + * @param buffer that will contain the encoded chars, must be at least of {@link #INT_BUF_SIZE} + * length + * @return number of chars written to buffer + */ + public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) { + int nChars = (31-shift)/7 + 1, len = nChars+1; + buffer[0] = (char)(SHIFT_START_INT + shift); + int sortableBits = val ^ 0x80000000; + sortableBits >>>= shift; + while (nChars>=1) { + // Store 7 bits per character for good efficiency when UTF-8 encoding. + // The whole number is right-justified so that lucene can prefix-encode + // the terms more efficiently. + buffer[nChars--] = (char)(sortableBits & 0x7f); + sortableBits >>>= 7; + } + return len; + } + + /** + * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * This is method is used by {@link IntRangeBuilder}. + * @param val the numeric value + * @param shift how many bits to strip from the right + */ + public static String intToPrefixCoded(final int val, final int shift) { + if (shift>31 || shift<0) + throw new IllegalArgumentException("Illegal shift value, must be 0..31"); + final char[] buffer = new char[INT_BUF_SIZE]; + final int len = intToPrefixCoded(val, shift, buffer); + return new String(buffer, 0, len); + } + + /** + * This is a convenience method, that returns prefix coded bits of an int without + * reducing the precision. It can be used to store the full precision value as a + * stored field in index. + *

To decode, use {@link #prefixCodedToInt}. + */ + public static String intToPrefixCoded(final int val) { + return intToPrefixCoded(val, 0); + } + + /** + * Returns a long from prefixCoded characters. + * Rightmost bits will be zero for lower precision codes. + * This method can be used to decode e.g. a stored field. + * @throws NumberFormatException if the supplied string is + * not correctly prefix encoded. + * @see #longToPrefixCoded(long) + */ + public static long prefixCodedToLong(final String prefixCoded) { + final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG; + if (shift>63 || shift<0) + throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)"); + long sortableBits = 0L; + for (int i=1, len=prefixCoded.length(); i0x7f) { + throw new NumberFormatException( + "Invalid prefixCoded numerical value representation (char "+ + Integer.toHexString((int)ch)+" at position "+i+" is invalid)" + ); + } + sortableBits |= (long)ch; + } + return (sortableBits << shift) ^ 0x8000000000000000L; + } + + /** + * Returns an int from prefixCoded characters. + * Rightmost bits will be zero for lower precision codes. + * This method can be used to decode e.g. a stored field. + * @throws NumberFormatException if the supplied string is + * not correctly prefix encoded. + * @see #intToPrefixCoded(int) + */ + public static int prefixCodedToInt(final String prefixCoded) { + final int shift = prefixCoded.charAt(0)-SHIFT_START_INT; + if (shift>31 || shift<0) + throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)"); + int sortableBits = 0; + for (int i=1, len=prefixCoded.length(); i0x7f) { + throw new NumberFormatException( + "Invalid prefixCoded numerical value representation (char "+ + Integer.toHexString((int)ch)+" at position "+i+" is invalid)" + ); + } + sortableBits |= (int)ch; + } + return (sortableBits << shift) ^ 0x80000000; + } + + /** + * Converts a double value to a sortable signed long. + * The value is converted by getting their IEEE 754 floating-point "double format" + * bit layout and then some bits are swapped, to be able to compare the result as long. + * By this the precision is not reduced, but the value can easily used as a long. + * @see #sortableLongToDouble + */ + public static long doubleToSortableLong(double val) { + long f = Double.doubleToLongBits(val); + if (f<0) f ^= 0x7fffffffffffffffL; + return f; + } + + /** + * Converts a sortable long back to a double. + * @see #doubleToSortableLong + */ + public static double sortableLongToDouble(long val) { + if (val<0) val ^= 0x7fffffffffffffffL; + return Double.longBitsToDouble(val); + } + + /** + * Converts a float value to a sortable signed int. + * The value is converted by getting their IEEE 754 floating-point "float format" + * bit layout and then some bits are swapped, to be able to compare the result as int. + * By this the precision is not reduced, but the value can easily used as an int. + * @see #sortableIntToFloat + */ + public static int floatToSortableInt(float val) { + int f = Float.floatToIntBits(val); + if (f<0) f ^= 0x7fffffff; + return f; + } + + /** + * Converts a sortable int back to a float. + * @see #floatToSortableInt + */ + public static float sortableIntToFloat(int val) { + if (val<0) val ^= 0x7fffffff; + return Float.intBitsToFloat(val); + } + + /** A factory method, that generates a {@link SortField} instance for sorting prefix encoded long values. 
*/ + public static SortField getLongSortField(final String field, final boolean reverse) { + return new SortField(field, FIELD_CACHE_LONG_PARSER, reverse); + } + + /** A factory method, that generates a {@link SortField} instance for sorting prefix encoded int values. */ + public static SortField getIntSortField(final String field, final boolean reverse) { + return new SortField(field, FIELD_CACHE_INT_PARSER, reverse); + } + + /** + * Expert: Splits a long range recursively. + * You may implement a builder that adds clauses to a + * {@link org.apache.lucene.search.BooleanQuery} for each call to its + * {@link LongRangeBuilder#addRange(String,String)} + * method. + *

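A sketch of the builder described above (the field name, bounds, and precision step are placeholders); it collects each sub-range as a clause of a BooleanQuery, the way a hand-rolled trie query could be assembled:

    final BooleanQuery bq = new BooleanQuery(true);
    TrieUtils.splitLongRange(new TrieUtils.LongRangeBuilder() {
      public void addRange(String minPrefixCoded, String maxPrefixCoded) {
        // the bounds arrive already prefix-encoded, ready for term-based queries
        bq.add(new RangeQuery("field", minPrefixCoded, maxPrefixCoded, true, true),
            BooleanClause.Occur.SHOULD);
      }
    }, 8, lowerBound, upperBound);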
This method is used by {@link NumericRangeQuery}. + */ + public static void splitLongRange(final LongRangeBuilder builder, + final int precisionStep, final long minBound, final long maxBound + ) { + if (precisionStep<1 || precisionStep>64) + throw new IllegalArgumentException("precisionStep may only be 1..64"); + splitRange(builder, 64, precisionStep, minBound, maxBound); + } + + /** + * Expert: Splits an int range recursively. + * You may implement a builder that adds clauses to a + * {@link org.apache.lucene.search.BooleanQuery} for each call to its + * {@link IntRangeBuilder#addRange(String,String)} + * method. + *

This method is used by {@link NumericRangeQuery}. + */ + public static void splitIntRange(final IntRangeBuilder builder, + final int precisionStep, final int minBound, final int maxBound + ) { + if (precisionStep<1 || precisionStep>32) + throw new IllegalArgumentException("precisionStep may only be 1..32"); + splitRange(builder, 32, precisionStep, (long)minBound, (long)maxBound); + } + + /** This helper does the splitting for both 32 and 64 bit. */ + private static void splitRange( + final Object builder, final int valSize, + final int precisionStep, long minBound, long maxBound + ) { + if (minBound > maxBound) return; + for (int shift=0; ; shift += precisionStep) { + // calculate new bounds for inner precision + final long diff = 1L << (shift+precisionStep), + mask = ((1L<<precisionStep) - 1L) << shift; + final boolean + hasLower = (minBound & mask) != 0L, + hasUpper = (maxBound & mask) != mask; + final long + nextMinBound = (hasLower ? (minBound + diff) : minBound) & ~mask, + nextMaxBound = (hasUpper ? (maxBound - diff) : maxBound) & ~mask; + + if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound) { + // We are in the lowest precision or the next precision is not available. + addRange(builder, valSize, minBound, maxBound, shift); + // exit the split recursion loop + break; + } + + if (hasLower) + addRange(builder, valSize, minBound, minBound | mask, shift); + if (hasUpper) + addRange(builder, valSize, maxBound & ~mask, maxBound, shift); + + // recurse to next precision + minBound = nextMinBound; + maxBound = nextMaxBound; + } + } + + /** Helper that delegates to correct range builder */ + private static void addRange( + final Object builder, final int valSize, + long minBound, long maxBound, + final int shift + ) { + // for the max bound set all lower bits (that were shifted away): + // this is important for testing or other usages of the splitted range + // (e.g. to reconstruct the full range). The prefixEncoding will remove + // the bits anyway, so they do not hurt! + maxBound |= (1L << shift) - 1L; + // delegate to correct range builder + switch(valSize) { + case 64: + ((LongRangeBuilder)builder).addRange(minBound, maxBound, shift); + break; + case 32: + ((IntRangeBuilder)builder).addRange((int)minBound, (int)maxBound, shift); + break; + default: + // Should not happen! + throw new IllegalArgumentException("valSize must be 32 or 64."); + } + } + + /** + * Expert: Callback for {@link #splitLongRange}. + * You need to overwrite only one of the methods. + *

WARNING: This is a very low-level interface, + * the method signatures may change in later versions. + */ + public static abstract class LongRangeBuilder { + + /** + * Overwrite this method, if you like to receive the already prefix encoded range bounds. + * You can directly build classical (inclusive) range queries from them. + */ + public void addRange(String minPrefixCoded, String maxPrefixCoded) { + throw new UnsupportedOperationException(); + } + + /** + * Overwrite this method, if you like to receive the raw long range bounds. + * You can use this for e.g. debugging purposes (print out range bounds). + */ + public void addRange(final long min, final long max, final int shift) { + addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift)); + } + + } + + /** + * Expert: Callback for {@link #splitIntRange}. + * You need to overwrite only one of the methods. + *

WARNING: This is a very low-level interface, + * the method signatures may change in later versions. + */ + public static abstract class IntRangeBuilder { + + /** + * Overwrite this method, if you like to receive the already prefix encoded range bounds. + * You can directly build classical range (inclusive) queries from them. + */ + public void addRange(String minPrefixCoded, String maxPrefixCoded) { + throw new UnsupportedOperationException(); + } + + /** + * Overwrite this method, if you like to receive the raw int range bounds. + * You can use this for e.g. debugging purposes (print out range bounds). + */ + public void addRange(final int min, final int max, final int shift) { + addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift)); + } + + } + +} Property changes on: src\java\org\apache\lucene\util\TrieUtils.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/test/org/apache/lucene/analysis/TestNumericTokenStream.java =================================================================== --- src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (revision 0) +++ src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (revision 0) @@ -0,0 +1,108 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TrieUtils; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.ShiftAttribute; + +public class TestNumericTokenStream extends LuceneTestCase { + + static final int precisionStep = 8; + static final long lvalue = 4573245871874382L; + static final int ivalue = 123456; + + public void testLongStreamNewAPI() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep).setLongValue(lvalue); + stream.setUseNewAPI(true); + // use getAttribute to test if attributes really exist, if not an IAE will be throwed + final ShiftAttribute shiftAtt = (ShiftAttribute) stream.getAttribute(ShiftAttribute.class); + final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); + for (int shift=0; shift<64; shift+=precisionStep) { + assertTrue("New token is available", stream.incrementToken()); + assertEquals("Shift value", shift, shiftAtt.getShift()); + assertEquals("Term is correctly encoded", TrieUtils.longToPrefixCoded(lvalue, shift), termAtt.term()); + } + assertFalse("No more tokens available", stream.incrementToken()); + } + + public void testLongStreamOldAPI() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep).setLongValue(lvalue); + stream.setUseNewAPI(false); + Token tok=new Token(); + for (int shift=0; shift<64; shift+=precisionStep) { + assertNotNull("New token is available", tok=stream.next(tok)); + assertEquals("Term is correctly encoded", TrieUtils.longToPrefixCoded(lvalue, shift), tok.term()); + } + assertNull("No more tokens available", stream.next(tok)); + } + + public void testIntStreamNewAPI() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep).setIntValue(ivalue); + stream.setUseNewAPI(true); + // use getAttribute to test if attributes really exist, if not an IAE will be throwed + final ShiftAttribute shiftAtt = (ShiftAttribute) stream.getAttribute(ShiftAttribute.class); + final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); + for (int shift=0; shift<32; shift+=precisionStep) { + assertTrue("New token is available", stream.incrementToken()); + assertEquals("Shift value", shift, shiftAtt.getShift()); + assertEquals("Term is correctly encoded", TrieUtils.intToPrefixCoded(ivalue, shift), termAtt.term()); + } + assertFalse("No more tokens available", stream.incrementToken()); + } + + public void testIntStreamOldAPI() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep).setIntValue(ivalue); + stream.setUseNewAPI(false); + Token tok=new Token(); + for (int shift=0; shift<32; shift+=precisionStep) { + assertNotNull("New token is available", tok=stream.next(tok)); + assertEquals("Term is correctly encoded", TrieUtils.intToPrefixCoded(ivalue, shift), tok.term()); + } + assertNull("No more tokens available", stream.next(tok)); + } + + public void testNotInitialized() throws Exception { + final NumericTokenStream stream=new NumericTokenStream(precisionStep); + + try { + stream.reset(); + fail("reset() should not succeed."); + } catch (IllegalStateException e) { + // pass + } + + stream.setUseNewAPI(true); + try { + stream.incrementToken(); + fail("incrementToken() should not succeed."); + } catch (IllegalStateException e) { + // pass + } + + stream.setUseNewAPI(false); + try { + stream.next(new Token()); + fail("next() should not succeed."); + } catch 
(IllegalStateException e) { + // pass + } + } + +} Property changes on: src\test\org\apache\lucene\analysis\TestNumericTokenStream.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/test/org/apache/lucene/search/TestNumericRangeQuery32.java =================================================================== --- src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 0) +++ src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 0) @@ -0,0 +1,431 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Random; + +import org.apache.lucene.analysis.NumericTokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TrieUtils; + +public class TestNumericRangeQuery32 extends LuceneTestCase { + // distance of entries + private static final int distance = 6666; + // shift the starting of the values to the left, to also have negative values: + private static final int startOffset = - 1 << 15; + // number of docs to generate for testing + private static final int noDocs = 10000; + + private static Field newField(String name, int precisionStep) { + NumericTokenStream stream = new NumericTokenStream(precisionStep); + stream.setUseNewAPI(true); + Field f=new Field(name, stream); + f.setOmitTermFreqAndPositions(true); + f.setOmitNorms(true); + return f; + } + + private static final RAMDirectory directory; + private static final IndexSearcher searcher; + static { + try { + // set the theoretical maximum term count for 8bit (see docs for the number) + BooleanQuery.setMaxClauseCount(3*255*2 + 255); + + directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), + true, MaxFieldLength.UNLIMITED); + + Field + field8 = newField("field8", 8), + field4 = newField("field4", 4), + field2 = newField("field2", 2), + ascfield8 = newField("ascfield8", 8), + ascfield4 = newField("ascfield4", 4), + ascfield2 = newField("ascfield2", 2); + + // Add a series of noDocs docs with increasing int values + for (int l=0; l0) { + assertEquals("Distinct term number is equal for all query types", lastTerms, terms); + } + lastTerms = terms; + } + } + + public void testRange_8bit() throws Exception { + testRange(8); + } + + public void testRange_4bit() throws Exception { + testRange(4); + } + + public void testRange_2bit() throws Exception { + testRange(2); + } + + public 
void testInverseRange() throws Exception { + NumericRangeFilter f = NumericRangeFilter.newIntRange("field8", 8, new Integer(1000), new Integer(-1000), true, true); + assertSame("A inverse range should return the EMPTY_DOCIDSET instance", DocIdSet.EMPTY_DOCIDSET, f.getDocIdSet(searcher.getIndexReader())); + } + + private void testLeftOpenRange(int precisionStep) throws Exception { + String field="field"+precisionStep; + int count=3000; + int upper=(count-1)*distance + (distance/3) + startOffset; + NumericRangeQuery q=NumericRangeQuery.newIntRange(field, precisionStep, null, new Integer(upper), true, true); + TopDocs topDocs = searcher.search(q, null, noDocs, Sort.INDEXORDER); + System.out.println("Found "+q.getTotalNumberOfTerms()+" distinct terms in left open range for field '"+field+"'."); + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + assertEquals("Score doc count", count, sd.length ); + Document doc=searcher.doc(sd[0].doc); + assertEquals("First doc", startOffset, Integer.parseInt(doc.get("value")) ); + doc=searcher.doc(sd[sd.length-1].doc); + assertEquals("Last doc", (count-1)*distance+startOffset, Integer.parseInt(doc.get("value")) ); + } + + public void testLeftOpenRange_8bit() throws Exception { + testLeftOpenRange(8); + } + + public void testLeftOpenRange_4bit() throws Exception { + testLeftOpenRange(4); + } + + public void testLeftOpenRange_2bit() throws Exception { + testLeftOpenRange(2); + } + + private void testRightOpenRange(int precisionStep) throws Exception { + String field="field"+precisionStep; + int count=3000; + int lower=(count-1)*distance + (distance/3) +startOffset; + NumericRangeQuery q=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), null, true, true); + TopDocs topDocs = searcher.search(q, null, noDocs, Sort.INDEXORDER); + System.out.println("Found "+q.getTotalNumberOfTerms()+" distinct terms in right open range for field '"+field+"'."); + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + assertEquals("Score doc count", noDocs-count, sd.length ); + Document doc=searcher.doc(sd[0].doc); + assertEquals("First doc", count*distance+startOffset, Integer.parseInt(doc.get("value")) ); + doc=searcher.doc(sd[sd.length-1].doc); + assertEquals("Last doc", (noDocs-1)*distance+startOffset, Integer.parseInt(doc.get("value")) ); + } + + public void testRightOpenRange_8bit() throws Exception { + testRightOpenRange(8); + } + + public void testRightOpenRange_4bit() throws Exception { + testRightOpenRange(4); + } + + public void testRightOpenRange_2bit() throws Exception { + testRightOpenRange(2); + } + + private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="field"+precisionStep; + int termCountT=0,termCountC=0; + for (int i=0; i<50; i++) { + int lower=(int)(rnd.nextDouble()*noDocs*distance)+startOffset; + int upper=(int)(rnd.nextDouble()*noDocs*distance)+startOffset; + if (lower>upper) { + int a=lower; lower=upper; upper=a; + } + // test inclusive range + NumericRangeQuery tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, true); + RangeQuery cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), true, true); + cq.setConstantScoreRewrite(true); + TopDocs tTopDocs = searcher.search(tq, 1); + TopDocs cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += 
tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), false, false); + cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), false, false); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test left exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), false, true); + cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), false, true); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test right exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, false); + cq=new RangeQuery(field, TrieUtils.intToPrefixCoded(lower), TrieUtils.intToPrefixCoded(upper), true, false); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + } + System.out.println("Average number of terms during random search on '" + field + "':"); + System.out.println(" Trie query: " + (((double)termCountT)/(50*4))); + System.out.println(" Classical query: " + (((double)termCountC)/(50*4))); + } + + public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception { + testRandomTrieAndClassicRangeQuery(8); + } + + public void testRandomTrieAndClassicRangeQuery_4bit() throws Exception { + testRandomTrieAndClassicRangeQuery(4); + } + + public void testRandomTrieAndClassicRangeQuery_2bit() throws Exception { + testRandomTrieAndClassicRangeQuery(2); + } + + private void testRangeSplit(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="ascfield"+precisionStep; + // 50 random tests + for (int i=0; i<50; i++) { + int lower=(int)(rnd.nextDouble()*noDocs - noDocs/2); + int upper=(int)(rnd.nextDouble()*noDocs - noDocs/2); + if (lower>upper) { + int a=lower; lower=upper; upper=a; + } + // test inclusive range + Query tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, true); + TopDocs tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + // test exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), false, false); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to exclusive range length", Math.max(upper-lower-1, 0), tTopDocs.totalHits ); + // test left exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), false, true); + tTopDocs 
= searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits ); + // test right exclusive range + tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, false); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits ); + } + } + + public void testRangeSplit_8bit() throws Exception { + testRangeSplit(8); + } + + public void testRangeSplit_4bit() throws Exception { + testRangeSplit(4); + } + + public void testRangeSplit_2bit() throws Exception { + testRangeSplit(2); + } + + /** we fake a float test using int2float conversion of TrieUtils */ + private void testFloatRange(int precisionStep) throws Exception { + final String field="ascfield"+precisionStep; + final int lower=-1000, upper=+2000; + + Query tq=NumericRangeQuery.newFloatRange(field, precisionStep, + new Float(TrieUtils.sortableIntToFloat(lower)), new Float(TrieUtils.sortableIntToFloat(upper)), true, true); + TopDocs tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + + Filter tf=NumericRangeFilter.newFloatRange(field, precisionStep, + new Float(TrieUtils.sortableIntToFloat(lower)), new Float(TrieUtils.sortableIntToFloat(upper)), true, true); + tTopDocs = searcher.search(new MatchAllDocsQuery(), tf, 1); + assertEquals("Returned count of range filter must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + } + + public void testFloatRange_8bit() throws Exception { + testFloatRange(8); + } + + public void testFloatRange_4bit() throws Exception { + testFloatRange(4); + } + + public void testFloatRange_2bit() throws Exception { + testFloatRange(2); + } + + private void testSorting(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="field"+precisionStep; + // 10 random tests, the index order is ascending, + // so using a reverse sort field should return descending documents + for (int i=0; i<10; i++) { + int lower=(int)(rnd.nextDouble()*noDocs*distance)+startOffset; + int upper=(int)(rnd.nextDouble()*noDocs*distance)+startOffset; + if (lower>upper) { + int a=lower; lower=upper; upper=a; + } + Query tq=NumericRangeQuery.newIntRange(field, precisionStep, new Integer(lower), new Integer(upper), true, true); + TopDocs topDocs = searcher.search(tq, null, noDocs, new Sort(TrieUtils.getIntSortField(field, true))); + if (topDocs.totalHits==0) continue; + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + int last=Integer.parseInt(searcher.doc(sd[0].doc).get("value")); + for (int j=1; j<sd.length; j++) { + int act=Integer.parseInt(searcher.doc(sd[j].doc).get("value")); + assertTrue("Docs should be sorted backwards", last>act ); + last=act; + } + } + } + + public void testSorting_8bit() throws Exception { + testSorting(8); + } + + public void testSorting_4bit() throws Exception { + testSorting(4); + } + + public void testSorting_2bit() throws Exception { + testSorting(2); + } + + public void testEqualsAndHash() throws Exception { + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test1", 4, new Integer(10), new Integer(20), true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test2", 4, new Integer(10), new Integer(20), false, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test3", 4, new Integer(10), new Integer(20), true, false)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test4", 4, new Integer(10), new 
Integer(20), false, false)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test5", 4, new Integer(10), null, true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test6", 4, null, new Integer(20), true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newIntRange("test7", 4, null, null, true, true)); + QueryUtils.checkEqual( + NumericRangeQuery.newIntRange("test8", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test8", 4, new Integer(10), new Integer(20), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test9", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test9", 8, new Integer(10), new Integer(20), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test10a", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test10b", 4, new Integer(10), new Integer(20), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test11", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test11", 4, new Integer(20), new Integer(10), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test12", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newIntRange("test12", 4, new Integer(10), new Integer(20), false, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newIntRange("test13", 4, new Integer(10), new Integer(20), true, true), + NumericRangeQuery.newFloatRange("test13", 4, new Float(10f), new Float(20f), true, true) + ); + // the following produces a hash collision, because Long and Integer have the same hashcode, so only test equality: + Query q1 = NumericRangeQuery.newIntRange("test14", 4, new Integer(10), new Integer(20), true, true); + Query q2 = NumericRangeQuery.newLongRange("test14", 4, new Long(10L), new Long(20L), true, true); + assertFalse(q1.equals(q2)); + assertFalse(q2.equals(q1)); + } + +} Property changes on: src\test\org\apache\lucene\search\TestNumericRangeQuery32.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/test/org/apache/lucene/search/TestNumericRangeQuery64.java =================================================================== --- src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (revision 0) +++ src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (revision 0) @@ -0,0 +1,427 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Random; + +import org.apache.lucene.analysis.NumericTokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TrieUtils; + +public class TestNumericRangeQuery64 extends LuceneTestCase { + // distance of entries + private static final long distance = 66666L; + // shift the starting of the values to the left, to also have negative values: + private static final long startOffset = - 1L << 31; + // number of docs to generate for testing + private static final int noDocs = 10000; + + private static Field newField(String name, int precisionStep) { + NumericTokenStream stream = new NumericTokenStream(precisionStep); + stream.setUseNewAPI(true); + Field f=new Field(name, stream); + f.setOmitTermFreqAndPositions(true); + f.setOmitNorms(true); + return f; + } + + private static final RAMDirectory directory; + private static final IndexSearcher searcher; + static { + try { + // set the theoretical maximum term count for 8bit (see docs for the number) + BooleanQuery.setMaxClauseCount(7*255*2 + 255); + + directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), + true, MaxFieldLength.UNLIMITED); + + Field + field8 = newField("field8", 8), + field4 = newField("field4", 4), + field2 = newField("field2", 2), + ascfield8 = newField("ascfield8", 8), + ascfield4 = newField("ascfield4", 4), + ascfield2 = newField("ascfield2", 2); + + // Add a series of noDocs docs with increasing long values + for (int l=0; l0) { + assertEquals("Distinct term number is equal for all query types", lastTerms, terms); + } + lastTerms = terms; + } + } + + public void testRange_8bit() throws Exception { + testRange(8); + } + + public void testRange_4bit() throws Exception { + testRange(4); + } + + public void testRange_2bit() throws Exception { + testRange(2); + } + + public void testInverseRange() throws Exception { + NumericRangeFilter f = NumericRangeFilter.newLongRange("field8", 8, new Long(1000L), new Long(-1000L), true, true); + assertSame("A inverse range should return the EMPTY_DOCIDSET instance", DocIdSet.EMPTY_DOCIDSET, f.getDocIdSet(searcher.getIndexReader())); + } + + private void testLeftOpenRange(int precisionStep) throws Exception { + String field="field"+precisionStep; + int count=3000; + long upper=(count-1)*distance + (distance/3) + startOffset; + NumericRangeQuery q=NumericRangeQuery.newLongRange(field, precisionStep, null, new Long(upper), true, true); + TopDocs topDocs = searcher.search(q, null, noDocs, Sort.INDEXORDER); + System.out.println("Found "+q.getTotalNumberOfTerms()+" distinct terms in left open range for field '"+field+"'."); + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + assertEquals("Score doc count", count, sd.length ); + Document doc=searcher.doc(sd[0].doc); + assertEquals("First doc", startOffset, Long.parseLong(doc.get("value")) ); + doc=searcher.doc(sd[sd.length-1].doc); + assertEquals("Last doc", (count-1)*distance+startOffset, Long.parseLong(doc.get("value")) ); + } + + public void testLeftOpenRange_8bit() throws Exception { + testLeftOpenRange(8); + } + + public void testLeftOpenRange_4bit() throws Exception { + testLeftOpenRange(4); + } + + public void testLeftOpenRange_2bit() 
throws Exception { + testLeftOpenRange(2); + } + + private void testRightOpenRange(int precisionStep) throws Exception { + String field="field"+precisionStep; + int count=3000; + long lower=(count-1)*distance + (distance/3) +startOffset; + NumericRangeQuery q=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), null, true, true); + TopDocs topDocs = searcher.search(q, null, noDocs, Sort.INDEXORDER); + System.out.println("Found "+q.getTotalNumberOfTerms()+" distinct terms in right open range for field '"+field+"'."); + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + assertEquals("Score doc count", noDocs-count, sd.length ); + Document doc=searcher.doc(sd[0].doc); + assertEquals("First doc", count*distance+startOffset, Long.parseLong(doc.get("value")) ); + doc=searcher.doc(sd[sd.length-1].doc); + assertEquals("Last doc", (noDocs-1)*distance+startOffset, Long.parseLong(doc.get("value")) ); + } + + public void testRightOpenRange_8bit() throws Exception { + testRightOpenRange(8); + } + + public void testRightOpenRange_4bit() throws Exception { + testRightOpenRange(4); + } + + public void testRightOpenRange_2bit() throws Exception { + testRightOpenRange(2); + } + + private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="field"+precisionStep; + int termCountT=0,termCountC=0; + for (int i=0; i<50; i++) { + long lower=(long)(rnd.nextDouble()*noDocs*distance)+startOffset; + long upper=(long)(rnd.nextDouble()*noDocs*distance)+startOffset; + if (lower>upper) { + long a=lower; lower=upper; upper=a; + } + // test inclusive range + NumericRangeQuery tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, true); + RangeQuery cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), true, true); + cq.setConstantScoreRewrite(true); + TopDocs tTopDocs = searcher.search(tq, 1); + TopDocs cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), false, false); + cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), false, false); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test left exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), false, true); + cq=new RangeQuery(field, TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), false, true); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + // test right exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, false); + cq=new RangeQuery(field, 
TrieUtils.longToPrefixCoded(lower), TrieUtils.longToPrefixCoded(upper), true, false); + cq.setConstantScoreRewrite(true); + tTopDocs = searcher.search(tq, 1); + cTopDocs = searcher.search(cq, 1); + assertEquals("Returned count for NumericRangeQuery and RangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); + termCountT += tq.getTotalNumberOfTerms(); + termCountC += cq.getTotalNumberOfTerms(); + } + System.out.println("Average number of terms during random search on '" + field + "':"); + System.out.println(" Trie query: " + (((double)termCountT)/(50*4))); + System.out.println(" Classical query: " + (((double)termCountC)/(50*4))); + } + + public void testRandomTrieAndClassicRangeQuery_8bit() throws Exception { + testRandomTrieAndClassicRangeQuery(8); + } + + public void testRandomTrieAndClassicRangeQuery_4bit() throws Exception { + testRandomTrieAndClassicRangeQuery(4); + } + + public void testRandomTrieAndClassicRangeQuery_2bit() throws Exception { + testRandomTrieAndClassicRangeQuery(2); + } + + private void testRangeSplit(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="ascfield"+precisionStep; + // 50 random tests + for (int i=0; i<50; i++) { + long lower=(long)(rnd.nextDouble()*noDocs - noDocs/2); + long upper=(long)(rnd.nextDouble()*noDocs - noDocs/2); + if (lower>upper) { + long a=lower; lower=upper; upper=a; + } + // test inclusive range + Query tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, true); + TopDocs tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + // test exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), false, false); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to exclusive range length", Math.max(upper-lower-1, 0), tTopDocs.totalHits ); + // test left exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), false, true); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits ); + // test right exclusive range + tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, false); + tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to half exclusive range length", upper-lower, tTopDocs.totalHits ); + } + } + + public void testRangeSplit_8bit() throws Exception { + testRangeSplit(8); + } + + public void testRangeSplit_4bit() throws Exception { + testRangeSplit(4); + } + + public void testRangeSplit_2bit() throws Exception { + testRangeSplit(2); + } + + /** we fake a double test using long2double conversion of TrieUtils */ + private void testDoubleRange(int precisionStep) throws Exception { + final String field="ascfield"+precisionStep; + final long lower=-1000L, upper=+2000L; + + Query tq=NumericRangeQuery.newDoubleRange(field, precisionStep, + new Double(TrieUtils.sortableLongToDouble(lower)), new Double(TrieUtils.sortableLongToDouble(upper)), true, true); + TopDocs tTopDocs = searcher.search(tq, 1); + assertEquals("Returned count of range query must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + + Filter tf=NumericRangeFilter.newDoubleRange(field, precisionStep, + new 
Double(TrieUtils.sortableLongToDouble(lower)), new Double(TrieUtils.sortableLongToDouble(upper)), true, true); + tTopDocs = searcher.search(new MatchAllDocsQuery(), tf, 1); + assertEquals("Returned count of range filter must be equal to inclusive range length", upper-lower+1, tTopDocs.totalHits ); + } + + public void testDoubleRange_8bit() throws Exception { + testDoubleRange(8); + } + + public void testDoubleRange_4bit() throws Exception { + testDoubleRange(4); + } + + public void testDoubleRange_2bit() throws Exception { + testDoubleRange(2); + } + + private void testSorting(int precisionStep) throws Exception { + final Random rnd=newRandom(); + String field="field"+precisionStep; + // 10 random tests, the index order is ascending, + // so using a reverse sort field should return descending documents + for (int i=0; i<10; i++) { + long lower=(long)(rnd.nextDouble()*noDocs*distance)+startOffset; + long upper=(long)(rnd.nextDouble()*noDocs*distance)+startOffset; + if (lower>upper) { + long a=lower; lower=upper; upper=a; + } + Query tq=NumericRangeQuery.newLongRange(field, precisionStep, new Long(lower), new Long(upper), true, true); + TopDocs topDocs = searcher.search(tq, null, noDocs, new Sort(TrieUtils.getLongSortField(field, true))); + if (topDocs.totalHits==0) continue; + ScoreDoc[] sd = topDocs.scoreDocs; + assertNotNull(sd); + long last=Long.parseLong(searcher.doc(sd[0].doc).get("value")); + for (int j=1; j<sd.length; j++) { + long act=Long.parseLong(searcher.doc(sd[j].doc).get("value")); + assertTrue("Docs should be sorted backwards", last>act ); + last=act; + } + } + } + + public void testSorting_8bit() throws Exception { + testSorting(8); + } + + public void testSorting_4bit() throws Exception { + testSorting(4); + } + + public void testSorting_2bit() throws Exception { + testSorting(2); + } + + public void testEqualsAndHash() throws Exception { + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test1", 4, new Long(10L), new Long(20L), true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test2", 4, new Long(10L), new Long(20L), false, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test3", 4, new Long(10L), new Long(20L), true, false)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test4", 4, new Long(10L), new Long(20L), false, false)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test5", 4, new Long(10L), null, true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test6", 4, null, new Long(20L), true, true)); + QueryUtils.checkHashEquals(NumericRangeQuery.newLongRange("test7", 4, null, null, true, true)); + QueryUtils.checkEqual( + NumericRangeQuery.newLongRange("test8", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test8", 4, new Long(10L), new Long(20L), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test9", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test9", 8, new Long(10L), new Long(20L), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test10a", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test10b", 4, new Long(10L), new Long(20L), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test11", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test11", 4, new Long(20L), new Long(10L), true, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test12", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newLongRange("test12", 4, new Long(10L), new Long(20L), 
false, true) + ); + QueryUtils.checkUnequal( + NumericRangeQuery.newLongRange("test13", 4, new Long(10L), new Long(20L), true, true), + NumericRangeQuery.newFloatRange("test13", 4, new Float(10f), new Float(20f), true, true) + ); + // difference to int range is tested in TestNumericRangeQuery32 + } + +} Property changes on: src\test\org\apache\lucene\search\TestNumericRangeQuery64.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/test/org/apache/lucene/util/TestTrieUtils.java =================================================================== --- src/test/org/apache/lucene/util/TestTrieUtils.java (revision 0) +++ src/test/org/apache/lucene/util/TestTrieUtils.java (revision 0) @@ -0,0 +1,339 @@ +package org.apache.lucene.util; + +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; + +public class TestTrieUtils extends LuceneTestCase { + + public void testLongConversionAndOrdering() throws Exception { + // generate a series of encoded longs, each numerical one bigger than the one before + String last=null; + for (long l=-100000L; l<100000L; l++) { + String act=TrieUtils.longToPrefixCoded(l); + if (last!=null) { + // test if smaller + assertTrue("actual bigger than last", last.compareTo(act) < 0 ); + } + // test is back and forward conversion works + assertEquals("forward and back conversion should generate same long", l, TrieUtils.prefixCodedToLong(act)); + // next step + last=act; + } + } + + public void testIntConversionAndOrdering() throws Exception { + // generate a series of encoded ints, each numerical one bigger than the one before + String last=null; + for (int i=-100000; i<100000; i++) { + String act=TrieUtils.intToPrefixCoded(i); + if (last!=null) { + // test if smaller + assertTrue("actual bigger than last", last.compareTo(act) < 0 ); + } + // test is back and forward conversion works + assertEquals("forward and back conversion should generate same int", i, TrieUtils.prefixCodedToInt(act)); + // next step + last=act; + } + } + + public void testLongSpecialValues() throws Exception { + long[] vals=new long[]{ + Long.MIN_VALUE, Long.MIN_VALUE+1, Long.MIN_VALUE+2, -5003400000000L, + -4000L, -3000L, -2000L, -1000L, -1L, 0L, 1L, 10L, 300L, 50006789999999999L, Long.MAX_VALUE-2, Long.MAX_VALUE-1, Long.MAX_VALUE + }; + String[] prefixVals=new String[vals.length]; + + for (int i=0; i=lower && min<=upper && max>=lower && max<=upper); + if (useBitSet) for (long l=min; l<=max; l++) { + assertFalse("ranges should not overlap", bits.getAndSet(l-lower) ); + } + // make unsigned longs for easier display and understanding + min 
^= 0x8000000000000000L; + max ^= 0x8000000000000000L; + //System.out.println("new Long(0x"+Long.toHexString(min>>>shift)+"L),new Long(0x"+Long.toHexString(max>>>shift)+"L),"); + assertEquals( "inner min bound", ((Long)neededBounds.next()).longValue(), min>>>shift); + assertEquals( "inner max bound", ((Long)neededBounds.next()).longValue(), max>>>shift); + } + }, precisionStep, lower, upper); + + if (useBitSet) { + // after flipping all bits in the range, the cardinality should be zero + bits.flip(0,upper-lower+1); + assertTrue("The sub-range concenated should match the whole range", bits.isEmpty()); + } + } + + public void testSplitLongRange() throws Exception { + // a hard-coded "standard" range + assertLongRangeSplit(-5000L, 9500L, 4, true, Arrays.asList(new Long[]{ + new Long(0x7fffffffffffec78L),new Long(0x7fffffffffffec7fL), + new Long(0x8000000000002510L),new Long(0x800000000000251cL), + new Long(0x7fffffffffffec8L), new Long(0x7fffffffffffecfL), + new Long(0x800000000000250L), new Long(0x800000000000250L), + new Long(0x7fffffffffffedL), new Long(0x7fffffffffffefL), + new Long(0x80000000000020L), new Long(0x80000000000024L), + new Long(0x7ffffffffffffL), new Long(0x8000000000001L) + }).iterator()); + + // the same with no range splitting + assertLongRangeSplit(-5000L, 9500L, 64, true, Arrays.asList(new Long[]{ + new Long(0x7fffffffffffec78L),new Long(0x800000000000251cL) + }).iterator()); + + // this tests optimized range splitting, if one of the inner bounds + // is also the bound of the next lower precision, it should be used completely + assertLongRangeSplit(0L, 1024L+63L, 4, true, Arrays.asList(new Long[]{ + new Long(0x800000000000040L), new Long(0x800000000000043L), + new Long(0x80000000000000L), new Long(0x80000000000003L) + }).iterator()); + + // the full long range should only consist of a lowest precision range; no bitset testing here, as too much memory needed :-) + assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 8, false, Arrays.asList(new Long[]{ + new Long(0x00L),new Long(0xffL) + }).iterator()); + + // the same with precisionStep=4 + assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 4, false, Arrays.asList(new Long[]{ + new Long(0x0L),new Long(0xfL) + }).iterator()); + + // the same with precisionStep=2 + assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 2, false, Arrays.asList(new Long[]{ + new Long(0x0L),new Long(0x3L) + }).iterator()); + + // the same with precisionStep=1 + assertLongRangeSplit(Long.MIN_VALUE, Long.MAX_VALUE, 1, false, Arrays.asList(new Long[]{ + new Long(0x0L),new Long(0x1L) + }).iterator()); + + // a inverse range should produce no sub-ranges + assertLongRangeSplit(9500L, -5000L, 4, false, Collections.EMPTY_LIST.iterator()); + + // a 0-length range should reproduce the range itsself + assertLongRangeSplit(9500L, 9500L, 4, false, Arrays.asList(new Long[]{ + new Long(0x800000000000251cL),new Long(0x800000000000251cL) + }).iterator()); + } + + /** Note: The neededBounds iterator must be unsigned (easier understanding what's happening) */ + protected void assertIntRangeSplit(final int lower, final int upper, int precisionStep, + final boolean useBitSet, final Iterator neededBounds + ) throws Exception { + final OpenBitSet bits=useBitSet ? 
new OpenBitSet(upper-lower+1) : null; + + TrieUtils.splitIntRange(new TrieUtils.IntRangeBuilder() { + //@Override + public void addRange(int min, int max, int shift) { + assertTrue("min, max should be inside bounds", min>=lower && min<=upper && max>=lower && max<=upper); + if (useBitSet) for (int i=min; i<=max; i++) { + assertFalse("ranges should not overlap", bits.getAndSet(i-lower) ); + } + // make unsigned ints for easier display and understanding + min ^= 0x80000000; + max ^= 0x80000000; + //System.out.println("new Integer(0x"+Integer.toHexString(min>>>shift)+"),new Integer(0x"+Integer.toHexString(max>>>shift)+"),"); + assertEquals( "inner min bound", ((Integer)neededBounds.next()).intValue(), min>>>shift); + assertEquals( "inner max bound", ((Integer)neededBounds.next()).intValue(), max>>>shift); + } + }, precisionStep, lower, upper); + + if (useBitSet) { + // after flipping all bits in the range, the cardinality should be zero + bits.flip(0,upper-lower+1); + assertTrue("The sub-range concenated should match the whole range", bits.isEmpty()); + } + } + + public void testSplitIntRange() throws Exception { + // a hard-coded "standard" range + assertIntRangeSplit(-5000, 9500, 4, true, Arrays.asList(new Integer[]{ + new Integer(0x7fffec78),new Integer(0x7fffec7f), + new Integer(0x80002510),new Integer(0x8000251c), + new Integer(0x7fffec8), new Integer(0x7fffecf), + new Integer(0x8000250), new Integer(0x8000250), + new Integer(0x7fffed), new Integer(0x7fffef), + new Integer(0x800020), new Integer(0x800024), + new Integer(0x7ffff), new Integer(0x80001) + }).iterator()); + + // the same with no range splitting + assertIntRangeSplit(-5000, 9500, 32, true, Arrays.asList(new Integer[]{ + new Integer(0x7fffec78),new Integer(0x8000251c) + }).iterator()); + + // this tests optimized range splitting, if one of the inner bounds + // is also the bound of the next lower precision, it should be used completely + assertIntRangeSplit(0, 1024+63, 4, true, Arrays.asList(new Integer[]{ + new Integer(0x8000040), new Integer(0x8000043), + new Integer(0x800000), new Integer(0x800003) + }).iterator()); + + // the full int range should only consist of a lowest precision range; no bitset testing here, as too much memory needed :-) + assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 8, false, Arrays.asList(new Integer[]{ + new Integer(0x00),new Integer(0xff) + }).iterator()); + + // the same with precisionStep=4 + assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 4, false, Arrays.asList(new Integer[]{ + new Integer(0x0),new Integer(0xf) + }).iterator()); + + // the same with precisionStep=2 + assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 2, false, Arrays.asList(new Integer[]{ + new Integer(0x0),new Integer(0x3) + }).iterator()); + + // the same with precisionStep=1 + assertIntRangeSplit(Integer.MIN_VALUE, Integer.MAX_VALUE, 1, false, Arrays.asList(new Integer[]{ + new Integer(0x0),new Integer(0x1) + }).iterator()); + + // a inverse range should produce no sub-ranges + assertIntRangeSplit(9500, -5000, 4, false, Collections.EMPTY_LIST.iterator()); + + // a 0-length range should reproduce the range itsself + assertIntRangeSplit(9500, 9500, 4, false, Arrays.asList(new Integer[]{ + new Integer(0x8000251c),new Integer(0x8000251c) + }).iterator()); + } + +} Property changes on: src\test\org\apache\lucene\util\TestTrieUtils.java ___________________________________________________________________ Added: svn:eol-style + native
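
The tests above run against an index that is built once in a static initializer, so the indexing side is easy to miss. The sketch below shows the same pattern end to end with made-up field names: a reusable Field backed by a NumericTokenStream at a fixed precisionStep, and a NumericRangeQuery over that field. It is an illustration only, not part of the patch; the per-document setIntValue(...) call is an assumption based on how the stream is reused in the tests, and all error handling is omitted.

    import org.apache.lucene.analysis.NumericTokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriter.MaxFieldLength;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.NumericRangeQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.RAMDirectory;

    public class NumericRangeExample {
      public static void main(String[] args) throws Exception {
        final int precisionStep = 8;
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(),
            true, MaxFieldLength.UNLIMITED);

        // One reusable stream/field pair per numeric field, mirroring newField() in the tests:
        // norms and term frequencies carry no useful information for trie terms.
        NumericTokenStream stream = new NumericTokenStream(precisionStep);
        stream.setUseNewAPI(true);
        Field price = new Field("price", stream);
        price.setOmitTermFreqAndPositions(true);
        price.setOmitNorms(true);

        for (int value = 0; value < 1000; value += 7) {
          Document doc = new Document();
          stream.setIntValue(value); // assumed per-document setter; the tests' indexing loop is not visible above
          doc.add(price);
          writer.addDocument(doc);
        }
        writer.close();

        // Inclusive range [100..200] over the trie-encoded terms of the same field.
        IndexSearcher searcher = new IndexSearcher(dir);
        NumericRangeQuery q = NumericRangeQuery.newIntRange("price", precisionStep,
            new Integer(100), new Integer(200), true, true);
        TopDocs hits = searcher.search(q, null, 10);
        System.out.println(hits.totalHits + " hits, "
            + q.getTotalNumberOfTerms() + " distinct terms visited");
        searcher.close();
      }
    }

As the paired 8/4/2-bit tests suggest, the precisionStep passed to the query factory has to match the one used at indexing time; a smaller step indexes more terms per value but lets a range query get by with fewer visited terms.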