Index: lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (revision 940481) +++ lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (working copy) @@ -126,10 +126,6 @@ // *** TermToBytesRefAttribute interface *** public final int toBytesRef(BytesRef target) { - // TODO: Maybe require that bytes is already initialized? TermsHashPerField ensures this. - if (target.bytes == null) { - target.bytes = new byte[termLength * 4]; - } return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target); } Index: lucene/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java (revision 940481) +++ lucene/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java (working copy) @@ -32,6 +32,8 @@ public interface TermToBytesRefAttribute extends Attribute { /** Copies the token's term text into the given {@link BytesRef}. * @param termBytes destination to write the bytes to (UTF-8 for text terms). + * The length of the BytesRef's buffer may be not large enough, so you need to grow. + * The parameters' {@code bytes} is guaranteed to be not {@code null}. * @return the hashcode as defined by {@link BytesRef#hashCode}: *
* int hash = 0;
Index: lucene/src/java/org/apache/lucene/util/BytesRef.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 940481)
+++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy)
@@ -24,22 +24,33 @@
* existing byte[].
*
* @lucene.experimental */
-public final class BytesRef {
+public final class BytesRef implements Comparable {
+ public static final byte[] EMPTY_BYTES = new byte[0];
+ /** The contents of the BytesRef. Should never be {@code null}. */
public byte[] bytes;
+
+ /** Offset of first valid byte. */
public int offset;
+
+ /** Length of used bytes. */
public int length;
public BytesRef() {
+ bytes = EMPTY_BYTES;
}
+ /** bytes[] should not be null */
public BytesRef(byte[] bytes, int offset, int length) {
+ assert bytes != null;
this.bytes = bytes;
this.offset = offset;
this.length = length;
}
+ /** bytes[] should not be null */
public BytesRef(byte[] bytes) {
+ assert bytes != null;
this.bytes = bytes;
this.offset = 0;
this.length = bytes.length;
@@ -55,10 +66,12 @@
* unicode text, with no unpaired surrogates or U+FFFF.
*/
public BytesRef(CharSequence text) {
+ this();
copy(text);
}
public BytesRef(BytesRef other) {
+ this();
copy(other);
}
@@ -69,13 +82,6 @@
* unpaired surrogates or invalid UTF16 code units.
*/
public void copy(CharSequence text) {
- // TODO: new byte[10] is waste of resources; it should
- // simply allocate text.length()*4 like UnicodeUtil.
- // Ideally, I would remove this here and add a
- // null-check in UnicodeUtil. (Uwe)
- if (bytes == null) {
- bytes = new byte[10];
- }
UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this);
}
@@ -178,10 +184,8 @@
}
public void copy(BytesRef other) {
- if (bytes == null) {
+ if (bytes.length < other.length) {
bytes = new byte[other.length];
- } else {
- bytes = ArrayUtil.grow(bytes, other.length);
}
System.arraycopy(other.bytes, other.offset, bytes, 0, other.length);
length = other.length;
@@ -198,6 +202,68 @@
return utf8SortedAsUTF16SortOrder;
}
+ /** Unsigned byte order comparison */
+ /*
+ public int compareTo(BytesRef other) {
+ if (this == other) return 0;
+
+ final byte[] aBytes = this.bytes;
+ int aUpto = this.offset;
+ final byte[] bBytes = other.bytes;
+ int bUpto = other.offset;
+
+ final int aStop = aUpto + Math.min(this.length, other.length);
+
+ while(aUpto < aStop) {
+ int aByte = aBytes[aUpto++] & 0xff;
+ int bByte = bBytes[bUpto++] & 0xff;
+ int diff = aByte - bByte;
+ if (diff != 0) return diff;
+ }
+
+ // One is a prefix of the other, or, they are equal:
+ return this.length - other.length;
+ }
+ */
+
+ /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change
+ * in the future to unsigned byte comparison. */
+ public int compareTo(BytesRef other) {
+ if (this == other) return 0;
+
+ final byte[] aBytes = this.bytes;
+ int aUpto = this.offset;
+ final byte[] bBytes = other.bytes;
+ int bUpto = other.offset;
+
+ final int aStop = aUpto + Math.min(this.length, other.length);
+
+ while(aUpto < aStop) {
+ int aByte = aBytes[aUpto++] & 0xff;
+ int bByte = bBytes[bUpto++] & 0xff;
+ if (aByte != bByte) {
+
+ // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
+
+ // We know the terms are not equal, but, we may
+ // have to carefully fixup the bytes at the
+ // difference to match UTF16's sort order:
+ if (aByte >= 0xee && bByte >= 0xee) {
+ if ((aByte & 0xfe) == 0xee) {
+ aByte += 0x10;
+ }
+ if ((bByte&0xfe) == 0xee) {
+ bByte += 0x10;
+ }
+ }
+ return aByte - bByte;
+ }
+ }
+
+ // One is a prefix of the other, or, they are equal:
+ return this.length - other.length;
+ }
+
private static class UTF8SortedAsUTF16Comparator implements Comparator {
// Only singleton
private UTF8SortedAsUTF16Comparator() {};
Index: lucene/src/java/org/apache/lucene/util/NumericUtils.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/NumericUtils.java (revision 940481)
+++ lucene/src/java/org/apache/lucene/util/NumericUtils.java (working copy)
@@ -107,13 +107,11 @@
public static int longToPrefixCoded(final long val, final int shift, final BytesRef bytes) {
if (shift>63 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
- if (bytes.bytes == null) {
- bytes.bytes = new byte[NumericUtils.BUF_SIZE_LONG];
- } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_LONG) {
- bytes.grow(NumericUtils.BUF_SIZE_LONG);
- }
int hash, nChars = (63-shift)/7 + 1;
bytes.length = nChars+1;
+ if (bytes.bytes.length < bytes.length) {
+ bytes.grow(NumericUtils.BUF_SIZE_LONG);
+ }
bytes.bytes[0] = (byte) (hash = (SHIFT_START_LONG + shift));
long sortableBits = val ^ 0x8000000000000000L;
sortableBits >>>= shift;
@@ -167,13 +165,11 @@
public static int intToPrefixCoded(final int val, final int shift, final BytesRef bytes) {
if (shift>31 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..31");
- if (bytes.bytes == null) {
- bytes.bytes = new byte[NumericUtils.BUF_SIZE_INT];
- } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_INT) {
- bytes.grow(NumericUtils.BUF_SIZE_INT);
- }
int hash, nChars = (31-shift)/7 + 1;
bytes.length = nChars+1;
+ if (bytes.bytes.length < bytes.length) {
+ bytes.grow(NumericUtils.BUF_SIZE_INT);
+ }
bytes.bytes[0] = (byte) (hash = (SHIFT_START_INT + shift));
int sortableBits = val ^ 0x80000000;
sortableBits >>>= shift;