precisionStep
* {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized,
@@ -107,23 +194,15 @@
* before using set a value using the various set???Value() methods.
*/
public NumericTokenStream(final int precisionStep) {
- super();
- this.precisionStep = precisionStep;
- if (precisionStep < 1)
- throw new IllegalArgumentException("precisionStep must be >=1");
- }
+ super(new NumericAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
+ // we must do this after the super call :(
+ ((NumericAttributeFactory) getAttributeFactory()).ts = this;
+ addAttribute(NumericTermAttribute.class);
- /**
- * Expert: Creates a token stream for numeric values with the specified
- * precisionStep using the given {@link AttributeSource}.
- * The stream is not yet initialized,
- * before using set a value using the various set???Value() methods.
- */
- public NumericTokenStream(AttributeSource source, final int precisionStep) {
- super(source);
this.precisionStep = precisionStep;
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
+ shift = -precisionStep;
}
/**
@@ -134,10 +213,15 @@
* before using set a value using the various set???Value() methods.
*/
public NumericTokenStream(AttributeFactory factory, final int precisionStep) {
- super(factory);
+ super(new NumericAttributeFactory(factory));
+ // we must do this after the super call :(
+ ((NumericAttributeFactory) getAttributeFactory()).ts = this;
+ addAttribute(NumericTermAttribute.class);
+
this.precisionStep = precisionStep;
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
+ shift = -precisionStep;
}
/**
@@ -149,7 +233,7 @@
public NumericTokenStream setLongValue(final long value) {
this.value = value;
valSize = 64;
- shift = 0;
+ shift = -precisionStep;
return this;
}
@@ -162,7 +246,7 @@
public NumericTokenStream setIntValue(final int value) {
this.value = value;
valSize = 32;
- shift = 0;
+ shift = -precisionStep;
return this;
}
@@ -175,7 +259,7 @@
public NumericTokenStream setDoubleValue(final double value) {
this.value = NumericUtils.doubleToSortableLong(value);
valSize = 64;
- shift = 0;
+ shift = -precisionStep;
return this;
}
@@ -188,7 +272,7 @@
public NumericTokenStream setFloatValue(final float value) {
this.value = NumericUtils.floatToSortableInt(value);
valSize = 32;
- shift = 0;
+ shift = -precisionStep;
return this;
}
@@ -196,37 +280,24 @@
public void reset() {
if (valSize == 0)
throw new IllegalStateException("call set???Value() before usage");
- shift = 0;
+ shift = -precisionStep;
}
@Override
public boolean incrementToken() {
if (valSize == 0)
throw new IllegalStateException("call set???Value() before usage");
- if (shift >= valSize)
+ shift += precisionStep;
+ if (shift >= valSize) {
+ // undo the increment so the attribute still works after the stream is exhausted
+ shift -= precisionStep;
return false;
+ }
clearAttributes();
- final char[] buffer;
- switch (valSize) {
- case 64:
- buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG);
- termAtt.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer));
- break;
-
- case 32:
- buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_INT);
- termAtt.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer));
- break;
-
- default:
- // should not happen
- throw new IllegalArgumentException("valSize must be 32 or 64");
- }
-
+ // the TermToBytesRefAttribute is directly accessing shift & value.
typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC);
posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0);
- shift += precisionStep;
return true;
}
@@ -238,12 +309,11 @@
}
// members
- private final TermAttribute termAtt = addAttribute(TermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private int shift = 0, valSize = 0; // valSize==0 means not initialized
+ int shift, valSize = 0; // valSize==0 means not initialized
private final int precisionStep;
- private long value = 0L;
+ long value = 0L;
}
Index: src/java/org/apache/lucene/search/NumericRangeQuery.java
===================================================================
--- src/java/org/apache/lucene/search/NumericRangeQuery.java (revision 928371)
+++ src/java/org/apache/lucene/search/NumericRangeQuery.java (working copy)
@@ -379,9 +379,9 @@
*/
private final class NumericRangeTermsEnum extends FilteredTermsEnum {
- private final BytesRef currentLowerBound = new BytesRef(), currentUpperBound = new BytesRef();
+ private BytesRef currentLowerBound, currentUpperBound;
- private final LinkedListThis class generates terms to achieve this: First the numerical integer values need to
- * be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
- * and the bits are converted to ASCII chars with each 7 bit. The resulting string is
- * sortable like the original integer value. Each value is also prefixed
- * (in the first char) by the shift value (number of bits removed) used
+ * be converted to bytes. For that, integer values (32 bit or 64 bit) are made unsigned
+ * and the bits are converted to ASCII chars holding 7 bits each. The resulting byte[] is
+ * sortable like the original integer value (even using UTF-8 sort order). Each value is also
+ * prefixed (in the first byte) by the shift value (number of bits removed) used
* during encoding.
*
*
To also index floating point numbers, this class supplies two methods to convert them @@ -51,13 +51,13 @@ * {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part * for the same data types. * - *
This class can also be used, to generate lexicographically sortable (according - * {@link String#compareTo(String)}) representations of numeric data types for other - * usages (e.g. sorting). + *
This class can also be used, to generate lexicographically sortable (according to
+ * {@link BytesRef#getUTF8SortedAsUTF16Comparator()}) representations of numeric data
+ * types for other usages (e.g. sorting).
*
* @lucene.experimental
*
- * @since 2.9
+ * @since 2.9, API changed in a non-backwards-compatible way in 3.1
*/
public final class NumericUtils {
@@ -71,27 +71,27 @@
/**
* Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is
- * stored as SHIFT_START_LONG+shift in the first character
+ * stored as SHIFT_START_LONG+shift in the first byte
*/
- public static final char SHIFT_START_LONG = (char)0x20;
+ public static final byte SHIFT_START_LONG = 0x20;
/**
- * Expert: The maximum term length (used for char[] buffer size)
+ * Expert: The maximum term length (used for byte[] buffer size)
* for encoding long values.
- * @see #longToPrefixCoded(long,int,char[])
+ * @see #longToPrefixCoded(long,int,BytesRef)
*/
public static final int BUF_SIZE_LONG = 63/7 + 2;
/**
* Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is
- * stored as SHIFT_START_INT+shift in the first character
+ * stored as SHIFT_START_INT+shift in the first byte
*/
- public static final char SHIFT_START_INT = (char)0x60;
+ public static final byte SHIFT_START_INT = 0x60;
/**
- * Expert: The maximum term length (used for char[] buffer size)
+ * Expert: The maximum term length (used for byte[] buffer size)
* for encoding int values.
- * @see #intToPrefixCoded(int,int,char[])
+ * @see #intToPrefixCoded(int,int,BytesRef)
*/
public static final int BUF_SIZE_INT = 31/7 + 2;
@@ -100,25 +100,33 @@
* This is method is used by {@link NumericTokenStream}.
* @param val the numeric value
* @param shift how many bits to strip from the right
- * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_LONG}
- * length
- * @return number of chars written to buffer
+ * @param bytes will contain the encoded value
+ * @return the hash code for indexing (TermsHash)
*/
- public static int longToPrefixCoded(final long val, final int shift, final char[] buffer) {
+ public static int longToPrefixCoded(final long val, final int shift, final BytesRef bytes) {
if (shift>63 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
- int nChars = (63-shift)/7 + 1, len = nChars+1;
- buffer[0] = (char)(SHIFT_START_LONG + shift);
+ if (bytes.bytes == null) {
+ bytes.bytes = new byte[NumericUtils.BUF_SIZE_LONG];
+ } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_LONG) {
+ bytes.grow(NumericUtils.BUF_SIZE_LONG);
+ }
+ int nChars = (63-shift)/7 + 1;
+ bytes.length = nChars+1;
+ int hash = (bytes.bytes[0] = (byte) (SHIFT_START_LONG + shift));
long sortableBits = val ^ 0x8000000000000000L;
sortableBits >>>= shift;
while (nChars>=1) {
- // Store 7 bits per character for good efficiency when UTF-8 encoding.
- // The whole number is right-justified so that lucene can prefix-encode
- // the terms more efficiently.
- buffer[nChars--] = (char)(sortableBits & 0x7f);
+ // Store 7 bits per byte for compatibility
+ // with UTF-8 encoding of terms
+ bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
- return len;
+ // TODO: optimize this to do it in above loop
+ for (int i = 1; i < bytes.length; i++) {
+ hash = 31*hash + bytes.bytes[i];
+ }
+ return hash;
}
/**
@@ -126,11 +134,13 @@
* This is method is used by {@link LongRangeBuilder}.
* @param val the numeric value
* @param shift how many bits to strip from the right
+ * @deprecated This method is no longer needed! Use {@link #longToPrefixCoded(long,int,BytesRef)} instead.
*/
+ @Deprecated
public static String longToPrefixCoded(final long val, final int shift) {
- final char[] buffer = new char[BUF_SIZE_LONG];
- final int len = longToPrefixCoded(val, shift, buffer);
- return new String(buffer, 0, len);
+ final BytesRef buffer = new BytesRef(BUF_SIZE_LONG);
+ longToPrefixCoded(val, shift, buffer);
+ return buffer.utf8ToString();
}
/**
@@ -138,7 +148,9 @@
* reducing the precision. It can be used to store the full precision value as a
* stored field in index.
*
To decode, use {@link #prefixCodedToLong}. + * @deprecated This method is no longer needed! */ + @Deprecated public static String longToPrefixCoded(final long val) { return longToPrefixCoded(val, 0); } @@ -148,25 +160,33 @@ * This is method is used by {@link NumericTokenStream}. * @param val the numeric value * @param shift how many bits to strip from the right - * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_INT} - * length - * @return number of chars written to buffer + * @param bytes will contain the encoded value + * @return the hash code for indexing (TermsHash) */ - public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) { + public static int intToPrefixCoded(final int val, final int shift, final BytesRef bytes) { if (shift>31 || shift<0) throw new IllegalArgumentException("Illegal shift value, must be 0..31"); - int nChars = (31-shift)/7 + 1, len = nChars+1; - buffer[0] = (char)(SHIFT_START_INT + shift); + if (bytes.bytes == null) { + bytes.bytes = new byte[NumericUtils.BUF_SIZE_INT]; + } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_INT) { + bytes.grow(NumericUtils.BUF_SIZE_INT); + } + int nChars = (31-shift)/7 + 1; + bytes.length = nChars+1; + int hash = (bytes.bytes[0] = (byte)(SHIFT_START_INT + shift)); int sortableBits = val ^ 0x80000000; sortableBits >>>= shift; while (nChars>=1) { - // Store 7 bits per character for good efficiency when UTF-8 encoding. - // The whole number is right-justified so that lucene can prefix-encode - // the terms more efficiently. 
- buffer[nChars--] = (char)(sortableBits & 0x7f); + // Store 7 bits per byte for compatibility + // with UTF-8 encoding of terms + bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f); sortableBits >>>= 7; } - return len; + // TODO: optimize this to do it in above loop + for (int i = 1; i < bytes.length; i++) { + hash = 31*hash + bytes.bytes[i]; + } + return hash; } /** @@ -174,11 +194,13 @@ * This is method is used by {@link IntRangeBuilder}. * @param val the numeric value * @param shift how many bits to strip from the right + * @deprecated This method is no longer needed! */ + @Deprecated public static String intToPrefixCoded(final int val, final int shift) { - final char[] buffer = new char[BUF_SIZE_INT]; - final int len = intToPrefixCoded(val, shift, buffer); - return new String(buffer, 0, len); + final BytesRef buffer = new BytesRef(BUF_SIZE_INT); + intToPrefixCoded(val, shift, buffer); + return buffer.utf8ToString(); } /** @@ -186,7 +208,9 @@ * reducing the precision. It can be used to store the full precision value as a * stored field in index. *
To decode, use {@link #prefixCodedToInt}.
+ * @deprecated This method is no longer needed!
*/
+ @Deprecated
public static String intToPrefixCoded(final int val) {
return intToPrefixCoded(val, 0);
}
@@ -198,42 +222,36 @@
* @throws NumberFormatException if the supplied string is
* not correctly prefix encoded.
* @see #longToPrefixCoded(long)
+ * @deprecated This method is no longer needed!
*/
+ @Deprecated
public static long prefixCodedToLong(final String prefixCoded) {
- final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG;
- if (shift>63 || shift<0)
- throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
- long sortableBits = 0L;
- for (int i=1, len=prefixCoded.length(); i This method is used by {@link NumericRangeQuery}.
*/
@@ -454,8 +474,9 @@
/**
* Expert: Callback for {@link #splitLongRange}.
* You need to overwrite only one of the methods.
- * NOTE: This is a very low-level interface,
- * the method signatures may change in later versions.
+ * @lucene.experimental NOTE: This is a very low-level interface,
+ * the method signatures may change in later versions.
+ * @since 2.9, API changed in a non-backwards-compatible way in 3.1
*/
public static abstract class LongRangeBuilder {
@@ -463,7 +484,7 @@
* Overwrite this method, if you like to receive the already prefix encoded range bounds.
* You can directly build classical (inclusive) range queries from them.
*/
- public void addRange(String minPrefixCoded, String maxPrefixCoded) {
+ public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) {
throw new UnsupportedOperationException();
}
@@ -472,7 +493,10 @@
* You can use this for e.g. debugging purposes (print out range bounds).
*/
public void addRange(final long min, final long max, final int shift) {
- addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift));
+ final BytesRef minBytes = new BytesRef(BUF_SIZE_LONG), maxBytes = new BytesRef(BUF_SIZE_LONG);
+ longToPrefixCoded(min, shift, minBytes);
+ longToPrefixCoded(max, shift, maxBytes);
+ addRange(minBytes, maxBytes);
}
}
@@ -480,8 +504,9 @@
/**
* Expert: Callback for {@link #splitIntRange}.
* You need to overwrite only one of the methods.
- * NOTE: This is a very low-level interface,
- * the method signatures may change in later versions.
+ * @lucene.experimental NOTE: This is a very low-level interface,
+ * the method signatures may change in later versions.
+ * @since 2.9, API changed in a non-backwards-compatible way in 3.1
*/
public static abstract class IntRangeBuilder {
@@ -489,7 +514,7 @@
* Overwrite this method, if you like to receive the already prefix encoded range bounds.
* You can directly build classical range (inclusive) queries from them.
*/
- public void addRange(String minPrefixCoded, String maxPrefixCoded) {
+ public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) {
throw new UnsupportedOperationException();
}
@@ -498,7 +523,10 @@
* You can use this for e.g. debugging purposes (print out range bounds).
*/
public void addRange(final int min, final int max, final int shift) {
- addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift));
+ final BytesRef minBytes = new BytesRef(BUF_SIZE_INT), maxBytes = new BytesRef(BUF_SIZE_INT);
+ intToPrefixCoded(min, shift, minBytes);
+ intToPrefixCoded(max, shift, maxBytes);
+ addRange(minBytes, maxBytes);
}
}
Index: src/test/org/apache/lucene/analysis/TestNumericTokenStream.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (revision 928371)
+++ src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (working copy)
@@ -17,8 +17,9 @@
* limitations under the License.
*/
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class TestNumericTokenStream extends BaseTokenStreamTestCase {
@@ -29,27 +30,47 @@
public void testLongStream() throws Exception {
final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue);
// use getAttribute to test if attributes really exist, if not an IAE will be throwed
- final TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
+ final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
+ final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
+ final BytesRef bytes = new BytesRef();
+ stream.reset();
+ assertEquals(64, numericAtt.getValueSize());
+ assertEquals(lvalue, numericAtt.getRawValue());
for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) {
assertTrue("New token is available", stream.incrementToken());
- assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), termAtt.term());
- assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
+ assertEquals("Shift value wrong", shift, numericAtt.getShift());
+ final int hash = bytesAtt.toBytesRef(bytes);
+ assertEquals("Hash incorrect", bytes.hashCode(), hash);
+ assertEquals("Term is incorrectly encoded", lvalue & ~((1L << shift) - 1L), NumericUtils.prefixCodedToLong(bytes));
+ assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
}
- assertFalse("No more tokens available", stream.incrementToken());
+ assertFalse("More tokens available", stream.incrementToken());
+ stream.end();
+ stream.close();
}
public void testIntStream() throws Exception {
final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue);
// use getAttribute to test if attributes really exist, if not an IAE will be throwed
- final TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
+ final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
+ final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
+ final BytesRef bytes = new BytesRef();
+ stream.reset();
+ assertEquals(32, numericAtt.getValueSize());
+ assertEquals(ivalue, numericAtt.getRawValue());
for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) {
assertTrue("New token is available", stream.incrementToken());
- assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), termAtt.term());
- assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
+ assertEquals("Shift value wrong", shift, numericAtt.getShift());
+ final int hash = bytesAtt.toBytesRef(bytes);
+ assertEquals("Hash incorrect", bytes.hashCode(), hash);
+ assertEquals("Term is incorrectly encoded", ivalue & ~((1 << shift) - 1), NumericUtils.prefixCodedToInt(bytes));
+ assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
}
- assertFalse("No more tokens available", stream.incrementToken());
+ assertFalse("More tokens available", stream.incrementToken());
+ stream.end();
+ stream.close();
}
public void testNotInitialized() throws Exception {