Index: src/java/org/apache/lucene/index/ByteBlockPool.java
===================================================================
--- src/java/org/apache/lucene/index/ByteBlockPool.java (revision 1000834)
+++ src/java/org/apache/lucene/index/ByteBlockPool.java (working copy)
@@ -39,12 +39,12 @@
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
import org.apache.lucene.util.ArrayUtil;
-final class ByteBlockPool {
+public final class ByteBlockPool {
- abstract static class Allocator {
- abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
- abstract void recycleByteBlocks(List<byte[]> blocks);
- abstract byte[] getByteBlock();
+ public abstract static class Allocator {
+ public abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
+ public abstract void recycleByteBlocks(List<byte[]> blocks);
+ public abstract byte[] getByteBlock();
}
public byte[][] buffers = new byte[10][];
Index: src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 1000834)
+++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy)
@@ -1315,7 +1315,7 @@
/* Allocate another byte[] from the shared pool */
@Override
- byte[] getByteBlock() {
+ public byte[] getByteBlock() {
synchronized(DocumentsWriter.this) {
final int size = freeByteBlocks.size();
final byte[] b;
@@ -1331,7 +1331,7 @@
/* Return byte[]'s to the pool */
@Override
- void recycleByteBlocks(byte[][] blocks, int start, int end) {
+ public void recycleByteBlocks(byte[][] blocks, int start, int end) {
synchronized(DocumentsWriter.this) {
for(int i=start;i<end;i++)
freeByteBlocks.add(blocks[i]);

@Override
- void recycleByteBlocks(List<byte[]> blocks) {
+ public void recycleByteBlocks(List<byte[]> blocks) {
synchronized(DocumentsWriter.this) {
final int size = blocks.size();
for(int i=0;i<size;i++)
freeByteBlocks.add(blocks.get(i));
Index: src/java/org/apache/lucene/util/BytesRef.java
===================================================================
--- src/java/org/apache/lucene/util/BytesRef.java (revision 1000834)
+++ src/java/org/apache/lucene/util/BytesRef.java (working copy)
public final class BytesRef implements Comparable<BytesRef>, Externalizable {
+ static final int HASH_PRIME = 31;
public static final byte[] EMPTY_BYTES = new byte[0];
/** The contents of the BytesRef. Should never be {@code null}. */
@@ -182,11 +183,10 @@
*/
@Override
public int hashCode() {
- final int prime = 31;
int result = 0;
final int end = offset + length;
for(int i=offset;i<end;i++) {
- result = prime * result + bytes[i];
+ result = HASH_PRIME * result + bytes[i];
}
return result;
}
Index: src/java/org/apache/lucene/util/BytesRefHash.java
===================================================================
--- src/java/org/apache/lucene/util/BytesRefHash.java (revision 0)
+++ src/java/org/apache/lucene/util/BytesRefHash.java (revision 0)
+package org.apache.lucene.util;
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.index.ByteBlockPool;
+
+/**
+ * A special purpose hash map that maps {@link BytesRef} values to increasing
+ * ordinals and stores the hashed bytes in byte blocks obtained from a
+ * {@link ByteBlockPool.Allocator}.
+ * <p>
+ * Note: a {@link BytesRef} instance passed to {@link #add(BytesRef)} must not
+ * be longer than {@link #BYTES_BLOCK_SIZE}-2 bytes.
+ *
+ *
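+ * A minimal usage sketch (illustrative only, using the methods declared
+ * below):
+ *
+ * <pre>
+ * BytesRefHash hash = new BytesRefHash(new BytesRefHash.ByteBlockAllocator());
+ * BytesRef spare = new BytesRef();
+ * spare.copy("foo");
+ * int ord = hash.add(spare);        // a new value is assigned the next ord
+ * int same = hash.add(spare);       // adding equal bytes returns the same ord
+ * BytesRef resolved = hash.get(ord); // resolves an ord back to its bytes
+ * </pre>
+ *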
+ * @lucene.internal
+ */
+
+public final class BytesRefHash {
+
+ public final static int BYTES_BLOCK_SHIFT = 15;
+ public final static int BYTES_BLOCK_SIZE = 1 << BYTES_BLOCK_SHIFT;
+ public final static int BYTES_BLOCK_MASK = BYTES_BLOCK_SIZE - 1;
+
+ public static final class ByteBlockAllocator extends ByteBlockPool.Allocator {
+ private final LinkedList<byte[]> availableBlocks = new LinkedList<byte[]>();
+ private final int maxBufferedBlocks;
+ private int blockUsedCount;
+
+ public ByteBlockAllocator(int maxBufferedBytes) {
+ maxBufferedBlocks = maxBufferedBytes >> BYTES_BLOCK_SHIFT;
+ }
+
+ public ByteBlockAllocator() {
+ this(0);
+ }
+
+ @Override
+ public byte[] getByteBlock() {
+ final byte[] block;
+ if (!availableBlocks.isEmpty()) {
+ block = availableBlocks.removeFirst();
+ } else {
+ block = new byte[BYTES_BLOCK_SIZE];
+ }
+ blockUsedCount++;
+ return block;
+ }
+
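+ /*
+ * Keeps at most maxBufferedBlocks blocks for reuse; any surplus blocks
+ * passed in here are simply dropped and left to the garbage collector.
+ */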
+ @Override
+ public void recycleByteBlocks(byte[][] blocks, int start, int end) {
+ final int numBlocks = end - start;
+ final int stop = start
+ + Math.min(maxBufferedBlocks - availableBlocks.size(), numBlocks);
+ for (int i = start; i < stop; i++) {
+ availableBlocks.add(blocks[i]);
+ }
+ blockUsedCount -= numBlocks;
+ }
+
+ public long ramBytesUsed() {
+ return (blockUsedCount + availableBlocks.size()) * BYTES_BLOCK_SIZE;
+ }
+
+ @Override
+ public void recycleByteBlocks(List<byte[]> blocks) {
+ final int numBlocks = blocks.size();
+ final int stop = Math.min(maxBufferedBlocks - availableBlocks.size(),
+ numBlocks);
+ int i = 0;
+ for (byte[] bs : blocks) {
+ if (i++ >= stop)
+ break;
+ availableBlocks.add(bs);
+ }
+ blockUsedCount -= numBlocks;
+ }
+ }
+
+ private final ByteBlockPool pool;
+
+ private int hashSize;
+ private int hashHalfSize;
+ private int hashMask;
+ private int count;
+ private int lastCount = -1;
+ private final ByteBlockAllocator allocator;
+ private int[] hash;
+ private int[] ords;
+
+ public BytesRefHash(ByteBlockAllocator allocator) {
+ this(allocator, 16);
+ }
+
+ /**
+ * Creates a new {@link BytesRefHash}.
+ *
+ * @param allocator
+ * the {@link ByteBlockAllocator} that provides the byte blocks
+ * @param capacity
+ * the initial capacity of the hash; must be a power of two
+ */
+ public BytesRefHash(ByteBlockAllocator allocator, int capacity) {
+ this.allocator = allocator;
+ hashSize = capacity;
+ hashHalfSize = hashSize >> 1;
+ hashMask = hashSize - 1;
+ pool = new ByteBlockPool(allocator);
+ hash = new int[hashSize];
+ Arrays.fill(hash, -1);
+ ords = new int[ArrayUtil
+ .oversize(hashSize, RamUsageEstimator.NUM_BYTES_INT)];
+ }
+
+ /**
+ * Returns the number of {@link BytesRef} in this hash structure.
+ *
+ * @return the number of {@link BytesRef} in this hash structure.
+ */
+ public int size() {
+ return count;
+ }
+
+ /**
+ * Returns the {@link BytesRef} value for the given ord.
+ *
+ * Note: the given ord must be a non-negative integer less than the current
+ * size ({@link #size()}). The returned instance is a shared scratch
+ * {@link BytesRef} that is reused by subsequent calls.
+ *
+ *
+ * @param ord
+ * the entries ordinal
+ *
+ * @return a BytesRef instance for the given ordinal
+ */
+ public BytesRef get(int ord) {
+ return deref(ords[ord], scratch1);
+ }
+
+ /**
+ * Returns the ordinal array in arbitrary order. Valid ordinals occupy the
+ * slots 0 to {@link #size()} - 1 of the returned array.
+ *
+ * Note: This is a destructive operation. Subsequent usage of this
+ * {@link BytesRefHash} instance yields undefined behavior.
+ *
+ */
+ public int[] compact() {
+ int upto = 0;
+ for (int i = 0; i < hashSize; i++) {
+ if (hash[i] != -1) {
+ if (upto < i) {
+ hash[upto] = hash[i];
+ hash[i] = -1;
+ }
+ upto++;
+ }
+ }
+
+ assert upto == count;
+ lastCount = count;
+ return hash;
+ }
+
+ /**
+ * Returns the ordinal array sorted by the referenced byte values.
+ *
+ * Note: This is a destructive operation. Subsequent usage of this
+ * {@link BytesRefHash} instance yields undefined behavior.
+ *
+ * @param comp
+ * the {@link Comparator} used for sorting
+ */
+ public int[] sort(Comparator<BytesRef> comp) {
+ compact();
+ quickSort(comp, hash, 0, count - 1);
+ return hash;
+ }
+
+ public static class HashEntryIterator {
+ public int ord = 0;
+ public final BytesRef bytes;
+ private final int count;
+ private final int[] hashes;
+ private final int[] ords;
+ private final BytesRefHash hash;
+
+ private int pos = 0;
+
+ public HashEntryIterator(BytesRef bytes, BytesRefHash hash,
+ Comparator<BytesRef> comp) {
+ this.bytes = bytes;
+ this.count = hash.count;
+ this.hashes = hash.sort(comp);
+ this.ords = hash.ords;
+ this.hash = hash;
+ }
+
+ public boolean next() {
+ if (pos < count) {
+ ord = hashes[pos++];
+ hash.deref(ords[ord], bytes);
+ return true;
+ }
+ return false;
+ }
+ }
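+
+ /*
+ * Illustrative use of HashEntryIterator (names as declared above): the
+ * iterator sorts the hash on construction and then walks the entries in
+ * order, exposing the current entry through its public fields.
+ *
+ * HashEntryIterator it = hash.sortedEntries(new BytesRef(), comp);
+ * while (it.next()) {
+ *   int ord = it.ord;          // ord of the current entry
+ *   BytesRef bytes = it.bytes; // bytes of the current entry
+ * }
+ */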
+
+ /**
+ * Returns a {@link HashEntryIterator} iterating all entries in sorted order.
+ *
+ * Note: This is a destructive operation. Subsequent usage of this
+ * {@link BytesRefHash} instance yields undefined behavior.
+ *
+ * @param bytes
+ * the {@link BytesRef} used to hold the current bytes value in
+ * the iterator
+ * @param comp
+ * the {@link Comparator} used for sorting
+ */
+ public HashEntryIterator sortedEntries(final BytesRef bytes,
+ Comparator<BytesRef> comp) {
+ return new HashEntryIterator(bytes, this, comp);
+ }
+
+ private void quickSort(Comparator<BytesRef> comp, int[] entries, int lo,
+ int hi) {
+ if (lo >= hi)
+ return;
+ if (hi == 1 + lo) {
+ if (compare(comp, entries[lo], entries[hi]) > 0) {
+ final int tmp = entries[lo];
+ entries[lo] = entries[hi];
+ entries[hi] = tmp;
+ }
+ return;
+ }
+ final int mid = (lo + hi) >>> 1;
+ if (compare(comp, entries[lo], entries[mid]) > 0) {
+ int tmp = entries[lo];
+ entries[lo] = entries[mid];
+ entries[mid] = tmp;
+ }
+
+ if (compare(comp, entries[mid], entries[hi]) > 0) {
+ int tmp = entries[mid];
+ entries[mid] = entries[hi];
+ entries[hi] = tmp;
+
+ if (compare(comp, entries[lo], entries[mid]) > 0) {
+ int tmp2 = entries[lo];
+ entries[lo] = entries[mid];
+ entries[mid] = tmp2;
+ }
+ }
+ int left = lo + 1;
+ int right = hi - 1;
+
+ if (left >= right)
+ return;
+
+ final int partition = entries[mid];
+
+ for (;;) {
+ while (compare(comp, entries[right], partition) > 0)
+ --right;
+
+ while (left < right && compare(comp, entries[left], partition) <= 0)
+ ++left;
+
+ if (left < right) {
+ final int tmp = entries[left];
+ entries[left] = entries[right];
+ entries[right] = tmp;
+ --right;
+ } else {
+ break;
+ }
+ }
+
+ quickSort(comp, entries, lo, left);
+ quickSort(comp, entries, left + 1, hi);
+ }
+
+ private final BytesRef scratch1 = new BytesRef();
+ private final BytesRef scratch2 = new BytesRef();
+
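+ /*
+ * deref resolves a pooled offset back into the given BytesRef. Values are
+ * stored with a 1- or 2-byte length prefix (written in add(BytesRef, int)).
+ * For example (illustrative), a value of length 200 is stored as
+ * [0xC8, 0x01, ...bytes...]: 0xC8 == 0x80 | (200 & 0x7f) and 0x01 == 200 >> 7.
+ */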
+ private final BytesRef deref(int bytesStart, BytesRef b) {
+ b.bytes = pool.buffers[bytesStart >> BYTES_BLOCK_SHIFT];
+ int pos = bytesStart & BYTES_BLOCK_MASK;
+
+ if ((b.bytes[pos] & 0x80) == 0) {
+ // length is 1 byte
+ b.length = b.bytes[pos];
+ pos += 1;
+ } else {
+ // length is 2 bytes
+ b.length = (b.bytes[pos] & 0x7f) + ((b.bytes[pos + 1] & 0xff) << 7);
+ pos += 2;
+ }
+ b.offset = pos;
+ return b;
+ }
+
+ private boolean equals(int e, BytesRef b) {
+ return deref(ords[e], scratch1).bytesEquals(b);
+ }
+
+ private int compare(Comparator<BytesRef> comp, int e1, int e2) {
+ return comp.compare(deref(ords[e1], scratch1), deref(ords[e2], scratch2));
+ }
+
+ private boolean shrink(int targetSize) {
+
+ // Cannot use ArrayUtil.shrink because we require power
+ // of 2:
+ int newSize = hashSize;
+ while (newSize >= 8 && newSize / 4 > targetSize) {
+ newSize /= 2;
+ }
+ if (newSize != hashSize) {
+ hashSize = newSize;
+ hash = new int[hashSize];
+ Arrays.fill(hash, -1);
+ hashHalfSize = newSize / 2;
+ hashMask = newSize - 1;
+ ords = ArrayUtil.shrink(ords, newSize);
+ Arrays.fill(ords, -1);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Clears the {@link BytesRefHash}, removing all previously added
+ * {@link BytesRef} values and recycling the underlying byte blocks.
+ */
+ public void clear() {
+ lastCount = count;
+ count = 0;
+ pool.reset();
+ if (lastCount != -1 && shrink(lastCount)) {
+ // shrink clears the hash entries
+ return;
+ }
+ Arrays.fill(hash, -1);
+ Arrays.fill(ords, -1);
+ }
+
+ /**
+ * Adds a new {@link BytesRef}
+ *
+ * @param bytes
+ * the bytes to hash
+ * @return the ord of the hashed bytes
+ */
+ public int add(BytesRef bytes) {
+ return add(bytes, bytes.hashCode());
+ }
+
+ /**
+ * Adds a new {@link BytesRef} with a pre-calculated hash code.
+ *
+ * The hash code is defined as:
+ *
+ * <pre>
+ * int hash = 0;
+ * for (int i = offset; i < offset + length; i++) {
+ *   hash = 31 * hash + bytes[i];
+ * }
+ * </pre>
+ *
+ * @param bytes
+ * the bytes to hash
+ * @param code
+ * the bytes hash code
+ * @return the ord of the hashed bytes
+ */
+ public int add(BytesRef bytes, int code) {
+ final int length = bytes.length;
+ // final position
+ int hashPos = code & hashMask;
+ int e = hash[hashPos];
+ if (e != -1 && !equals(e, bytes)) {
+ // Conflict: keep searching different locations in
+ // the hash table.
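+ // The step below is forced odd via "| 1" so it is coprime with the
+ // power-of-two table size and the probe sequence can reach every slot.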
+ final int inc = ((code >> 8) + code) | 1;
+ do {
+ code += inc;
+ hashPos = code & hashMask;
+ e = hash[hashPos];
+ } while (e != -1 && !equals(e, bytes));
+ }
+
+ if (e == -1) {
+ // new entry
+ final int len2 = 2 + bytes.length;
+ if (len2 + pool.byteUpto > BYTES_BLOCK_SIZE) {
+ if (len2 > BYTES_BLOCK_SIZE) {
+ throw new IllegalArgumentException("bytes can be at most "
+ + (BYTES_BLOCK_SIZE - 2) + " in length; got " + bytes.length);
+ }
+ pool.nextBuffer();
+ }
+ final byte[] buffer = pool.buffer;
+ final int bufferUpto = pool.byteUpto;
+ e = count++;
+ ords[e] = bufferUpto + pool.byteOffset;
+
+ // We first encode the length, followed by the
+ // bytes. Length is encoded as vInt, but will consume
+ // 1 or 2 bytes at most (we reject too-long terms,
+ // above).
+ if (length < 128) {
+ // 1 byte to store length
+ buffer[bufferUpto] = (byte) length;
+ pool.byteUpto += length + 1;
+ System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1,
+ length);
+ } else {
+ // 2 byte to store length
+ buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f));
+ buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff);
+ pool.byteUpto += length + 2;
+ System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
+ length);
+ }
+ assert hash[hashPos] == -1;
+ hash[hashPos] = e;
+
+ if (count == hashHalfSize) {
+ rehash(2 * hashSize);
+ }
+ }
+ return e;
+ }
+
+ /**
+ * Resizes the hash table to {@code newSize} and re-inserts all existing
+ * entries, recomputing their hash codes from the stored bytes. Called from
+ * {@link #add(BytesRef, int)} once the table is 50% occupied.
+ */
+ private void rehash(final int newSize) {
+ final int newMask = newSize - 1;
+ final int[] newHash = new int[newSize];
+ ords = ArrayUtil.grow(ords, newSize);
+ Arrays.fill(newHash, -1);
+ for (int i = 0; i < hashSize; i++) {
+ final int e0 = hash[i];
+ if (e0 != -1) {
+ int code;
+ final int off = ords[e0];
+ final int start = off & BYTES_BLOCK_MASK;
+ final byte[] bytes = pool.buffers[off >> BYTES_BLOCK_SHIFT];
+ code = 0;
+ final int len;
+ int pos;
+ if ((bytes[start] & 0x80) == 0) {
+ // length is 1 byte
+ len = bytes[start];
+ pos = start + 1;
+ } else {
+ len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7);
+ pos = start + 2;
+ }
+
+ final int endPos = pos + len;
+ while (pos < endPos) {
+ code = BytesRef.HASH_PRIME * code + bytes[pos++];
+ }
+
+ int hashPos = code & newMask;
+ assert hashPos >= 0;
+ if (newHash[hashPos] != -1) {
+ final int inc = ((code >> 8) + code) | 1;
+ do {
+ code += inc;
+ hashPos = code & newMask;
+ } while (newHash[hashPos] != -1);
+ }
+ newHash[hashPos] = e0;
+ }
+ }
+
+ hashMask = newMask;
+ hash = newHash;
+ hashSize = newSize;
+ hashHalfSize = newSize / 2;
+ }
+
+ public long ramBytesUsed() {
+ return allocator.ramBytesUsed()
+ + RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ * hashSize
+ + count
+ * (RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT * 2);
+ }
+}
Property changes on: src/java/org/apache/lucene/util/BytesRefHash.java
___________________________________________________________________
Added: svn:eol-style
+ native
Added: svn:keywords
+ Date Author Id Revision HeadURL
Index: src/test/org/apache/lucene/index/TestByteSlices.java
===================================================================
--- src/test/org/apache/lucene/index/TestByteSlices.java (revision 1000834)
+++ src/test/org/apache/lucene/index/TestByteSlices.java (working copy)
@@ -25,7 +25,7 @@
/* Allocate another byte[] from the shared pool */
@Override
- synchronized byte[] getByteBlock() {
+ public synchronized byte[] getByteBlock() {
final int size = freeByteBlocks.size();
final byte[] b;
if (0 == size)
@@ -37,13 +37,13 @@
/* Return a byte[] to the pool */
@Override
- synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
+ public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
for(int i=start;i<end;i++)
freeByteBlocks.add(blocks[i]);

@Override
- synchronized void recycleByteBlocks(List<byte[]> blocks) {
+ public synchronized void recycleByteBlocks(List<byte[]> blocks) {
final int size = blocks.size();
for(int i=0;i<size;i++)
freeByteBlocks.add(blocks.get(i));
}
Index: src/test/org/apache/lucene/util/TestBytesRefHash.java
===================================================================
--- src/test/org/apache/lucene/util/TestBytesRefHash.java (revision 0)
+++ src/test/org/apache/lucene/util/TestBytesRefHash.java (revision 0)
+package org.apache.lucene.util;
+
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.lucene.util.BytesRefHash.HashEntryIterator;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestBytesRefHash extends LuceneTestCase {
+
+ BytesRefHash hash;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ hash = new BytesRefHash(new BytesRefHash.ByteBlockAllocator());
+ }
+
+ /**
+ * Test method for {@link org.apache.lucene.util.BytesRefHash#get(int)}.
+ */
+ @Test
+ public void testGet() {
+ BytesRef ref = new BytesRef();
+ for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) {
+ Map<String,Integer> strings = new HashMap<String,Integer>();
+ for (int i = 0; i < 797; i++) {
+ String str;
+ do {
+ str = _TestUtil.randomRealisticUnicodeString(random, 1000);
+ } while (str.length() == 0);
+ ref.copy(str);
+ int count = hash.size();
+ int ord = hash.add(ref);
+ if (strings.put(str, Integer.valueOf(ord)) == null) {
+ assertEquals(i, ord);
+ assertEquals(hash.size(), count + 1);
+ } else {
+ assertTrue(ord < count);
+ assertEquals(hash.size(), count);
+ }
+ }
+ for (Entry<String,Integer> entry : strings.entrySet()) {
+ ref.copy(entry.getKey());
+ assertEquals(ref, hash.get(entry.getValue().intValue()));
+ }
+ hash.clear();
+ assertEquals(0, hash.size());
+ }
+ }
+
+ /**
+ * Test method for {@link org.apache.lucene.util.BytesRefHash#compact()}.
+ */
+ @Test
+ public void testCompact() {
+ BytesRef ref = new BytesRef();
+ for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) {
+ final int size = 797;
+ BitSet bits = new BitSet(size);
+ for (int i = 0; i < size; i++) {
+ String str;
+ do {
+ str = _TestUtil.randomRealisticUnicodeString(random, 1000);
+ } while (str.length() == 0);
+ ref.copy(str);
+ bits.set(hash.add(ref));
+
+ }
+ assertEquals(hash.size(), bits.cardinality());
+ int[] compact = hash.compact();
+ assertTrue(size < compact.length);
+ for (int i = 0; i < size; i++) {
+ bits.set(compact[i], false);
+ }
+ assertEquals(0, bits.cardinality());
+ hash.clear();
+ assertEquals(0, hash.size());
+ }
+ }
+
+ /**
+ * Test method for
+ * {@link org.apache.lucene.util.BytesRefHash#sort(java.util.Comparator)}.
+ */
+ @Test
+ public void testSort() {
+ BytesRef ref = new BytesRef();
+ for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) {
+ SortedSet<String> strings = new TreeSet<String>();
+ for (int i = 0; i < 797; i++) {
+ String str;
+ do {
+ str = _TestUtil.randomRealisticUnicodeString(random, 1000);
+ } while (str.length() == 0);
+ ref.copy(str);
+ hash.add(ref);
+ strings.add(str);
+ }
+ int[] sort = hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
+ assertTrue(strings.size() < sort.length);
+ int i = 0;
+ for (String string : strings) {
+ ref.copy(string);
+ assertEquals(ref, hash.get(sort[i++]));
+ }
+ hash.clear();
+ assertEquals(0, hash.size());
+ }
+ }
+
+ /**
+ * Test method for
+ * {@link org.apache.lucene.util.BytesRefHash#add(org.apache.lucene.util.BytesRef)}
+ * .
+ */
+ @Test
+ public void testAdd() {
+ BytesRef ref = new BytesRef();
+ for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) {
+ Set<String> strings = new HashSet<String>();
+ for (int i = 0; i < 797; i++) {
+ String str;
+ do {
+ str = _TestUtil.randomRealisticUnicodeString(random, 1000);
+ } while (str.length() == 0);
+ ref.copy(str);
+ int count = hash.size();
+ int ord = hash.add(ref);
+
+ if (strings.add(str)) {
+ assertEquals(i, ord);
+ assertEquals(hash.size(), count + 1);
+ } else {
+ assertTrue(ord < count);
+ assertEquals(hash.size(), count);
+ }
+ }
+
+ int count = hash.size();
+ for (String string : strings) {
+ ref.copy(string);
+ int ord = hash.add(ref);
+ assertEquals(count, hash.size());
+ assertTrue("ord: " + ord + " count: " + count + " string: " + string,
+ ord < count);
+ }
+ hash.clear();
+ assertEquals(0, hash.size());
+ }
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testLargeValue() {
+ int[] sizes = new int[] { random.nextInt(5),
+ BytesRefHash.BYTES_BLOCK_SIZE - 33 + random.nextInt(31),
+ BytesRefHash.BYTES_BLOCK_SIZE - 1 + random.nextInt(37) };
+ BytesRef ref = new BytesRef();
+ for (int i = 0; i < sizes.length; i++) {
+ ref.bytes = new byte[sizes[i]];
+ ref.offset = 0;
+ ref.length = sizes[i];
+ try {
+ assertEquals(i, hash.add(ref));
+ } catch (IllegalArgumentException e) {
+ if (i < sizes.length - 1)
+ fail("unexpected exception at size: " + sizes[i]);
+ throw e;
+ }
+ }
+ }
+
+ @Test
+ public void testSortedIterator() {
+ BytesRef ref = new BytesRef();
+ for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) {
+ SortedSet<String> strings = new TreeSet<String>();
+ for (int i = 0; i < 797; i++) {
+ String str;
+ do {
+ str = _TestUtil.randomRealisticUnicodeString(random, 1000);
+ } while (str.length() == 0);
+ ref.copy(str);
+ hash.add(ref);
+ strings.add(str);
+ }
+ HashEntryIterator sortedEntries = hash.sortedEntries(new BytesRef(), BytesRef.getUTF8SortedAsUTF16Comparator());
+ for (String string : strings) {
+ ref.copy(string);
+ assertTrue(sortedEntries.next());
+ assertEquals(ref, sortedEntries.bytes);
+ }
+ assertFalse(sortedEntries.next());
+
+ hash.clear();
+ assertEquals(0, hash.size());
+ }
+ }
+
+}
Property changes on: src/test/org/apache/lucene/util/TestBytesRefHash.java
___________________________________________________________________
Added: svn:eol-style
+ native
Added: svn:keywords
+ Date Author Id Revision HeadURL