Index: src/java/org/apache/lucene/index/ByteBlockPool.java =================================================================== --- src/java/org/apache/lucene/index/ByteBlockPool.java (revision 1000834) +++ src/java/org/apache/lucene/index/ByteBlockPool.java (working copy) @@ -39,12 +39,12 @@ import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; import org.apache.lucene.util.ArrayUtil; -final class ByteBlockPool { +public final class ByteBlockPool { - abstract static class Allocator { - abstract void recycleByteBlocks(byte[][] blocks, int start, int end); - abstract void recycleByteBlocks(List blocks); - abstract byte[] getByteBlock(); + public abstract static class Allocator { + public abstract void recycleByteBlocks(byte[][] blocks, int start, int end); + public abstract void recycleByteBlocks(List blocks); + public abstract byte[] getByteBlock(); } public byte[][] buffers = new byte[10][]; Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 1000834) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -1315,7 +1315,7 @@ /* Allocate another byte[] from the shared pool */ @Override - byte[] getByteBlock() { + public byte[] getByteBlock() { synchronized(DocumentsWriter.this) { final int size = freeByteBlocks.size(); final byte[] b; @@ -1331,7 +1331,7 @@ /* Return byte[]'s to the pool */ @Override - void recycleByteBlocks(byte[][] blocks, int start, int end) { + public void recycleByteBlocks(byte[][] blocks, int start, int end) { synchronized(DocumentsWriter.this) { for(int i=start;i blocks) { + public void recycleByteBlocks(List blocks) { synchronized(DocumentsWriter.this) { final int size = blocks.size(); for(int i=0;i, Externalizable { + static final int HASH_PRIME = 31; public static final byte[] EMPTY_BYTES = new byte[0]; /** The contents of the BytesRef. 
Should never be {@code null}. */ @@ -182,11 +183,10 @@ */ @Override public int hashCode() { - final int prime = 31; int result = 0; final int end = offset + length; for(int i=offset;i + * Note: {@link BytesRef} instance passed to {@link #add(BytesRef)} must not be + * longer than {@link #BYTES_BLOCK_SIZE}-2 in length. + *

+ * + * @lucene.internal + */ + +public final class BytesRefHash { + + public final static int BYTES_BLOCK_SHIFT = 15; + public final static int BYTES_BLOCK_SIZE = 1 << BYTES_BLOCK_SHIFT; + public final static int BYTES_BLOCK_MASK = BYTES_BLOCK_SIZE - 1; + + public static final class ByteBlockAllocator extends ByteBlockPool.Allocator { + private final LinkedList availableBlocks = new LinkedList(); + private final int maxBufferedBlocks; + private int blockUsedCount; + + public ByteBlockAllocator(int maxBufferedBytes) { + maxBufferedBlocks = maxBufferedBytes >> BYTES_BLOCK_SHIFT; + } + + public ByteBlockAllocator() { + this(0); + } + + @Override + public byte[] getByteBlock() { + final byte[] block; + if (!availableBlocks.isEmpty()) { + block = availableBlocks.removeFirst(); + } else { + block = new byte[BYTES_BLOCK_SIZE]; + } + blockUsedCount++; + return block; + } + + @Override + public void recycleByteBlocks(byte[][] blocks, int start, int end) { + final int numBlocks = end - start; + final int stop = start + + Math.min(maxBufferedBlocks - availableBlocks.size(), numBlocks); + for (int i = start; i < stop; i++) { + availableBlocks.add(blocks[i]); + } + blockUsedCount -= numBlocks; + } + + public long ramBytesUsed() { + return (blockUsedCount + availableBlocks.size()) * BYTES_BLOCK_SIZE; + } + + @Override + public void recycleByteBlocks(List blocks) { + final int numBlocks = blocks.size(); + final int stop = Math.min(maxBufferedBlocks - availableBlocks.size(), + numBlocks); + int i = 0; + for (byte[] bs : blocks) { + availableBlocks.add(bs); + if (i++ >= stop) + break; + } + blockUsedCount -= numBlocks; + } + } + + private final ByteBlockPool pool; + + private int hashSize; + private int hashHalfSize; + private int hashMask; + private int count; + private int lastCount = -1; + private final ByteBlockAllocator allocator; + private int[] hash; + private int[] ords; + + public BytesRefHash(ByteBlockAllocator allocator) { + this(allocator, 16); + } + + /** + * Creates 
a new {@link BytesRefHash} + * + */ + public BytesRefHash(ByteBlockAllocator allocator, int capacity) { + this.allocator = allocator; + hashSize = capacity; + hashHalfSize = hashSize >> 1; + hashMask = hashSize - 1; + pool = new ByteBlockPool(allocator); + hash = new int[hashSize]; + Arrays.fill(hash, -1); + ords = new int[ArrayUtil + .oversize(hashSize, RamUsageEstimator.NUM_BYTES_INT)]; + } + + /** + * Returns the number of {@link BytesRef} in this hash structure. + * + * @return the number of {@link BytesRef} in this hash structure. + */ + public int size() { + return count; + } + + /** + * Returns the {@link BytesRef} value for the given ord. + *

+ * Note: the given ord must be a positive integer less than the current size ( + * {@link #size()}) + *

+ * + * @param ord + * the entry's ordinal + * + * @return a BytesRef instance for the given ordinal + */ + public BytesRef get(int ord) { + return deref(ords[ord], scratch1); + } + + /** + * Returns the ordinal array in arbitrary order. Valid ordinals start at offset + * of 0 and end at a limit of {@link #size()} - 1 + *

+ * Note: This is a destructive operation. Subsequent usage of this + * {@link BytesRefHash} instance yields undefined behavior + *

+ */ + public int[] compact() { + int upto = 0; + for (int i = 0; i < hashSize; i++) { + if (hash[i] != -1) { + if (upto < i) { + hash[upto] = hash[i]; + hash[i] = -1; + } + upto++; + } + } + + assert upto == count; + lastCount = count; + return hash; + } + + /** + * Returns the ordinal array sorted by the referenced byte values. + * + * @param comp + * the {@link Comparator} used for sorting + *

+ * Note: This is a destructive operation. Subsequent usage of this + * {@link BytesRefHash} instance yields undefined behavior + *

+ */ + public int[] sort(Comparator comp) { + compact(); + quickSort(comp, hash, 0, count - 1); + return hash; + } + + public static class HashEntryIterator { + public int ord = 0; + public final BytesRef bytes; + private final int count; + private final int[] hashes; + private final int[] ords; + private final BytesRefHash hash; + + private int pos = 0; + + public HashEntryIterator(BytesRef bytes, BytesRefHash hash, + Comparator comp) { + this.bytes = bytes; + this.count = hash.count; + this.hashes = hash.sort(comp); + this.ords = hash.ords; + this.hash = hash; + } + + public boolean next() { + if (pos < count) { + ord = hashes[pos++]; + hash.deref(ords[ord], bytes); + return true; + } + return false; + } + } + + /** + * Returns a {@link HashEntryIterator} iterating all entries in sorted order. + *

+ * Note: This is a destructive operation. Subsequent usage of this + * {@link BytesRefHash} instance yields undefined behavior + *

+ * * @param bytes the {@link BytesRef} used to hold the current bytes value in + * the iterator + * + * @param comp + * the {@link Comparator} used for sorting + */ + public HashEntryIterator sortedEntries(final BytesRef bytes, + Comparator comp) { + return new HashEntryIterator(bytes, this, comp); + } + + private void quickSort(Comparator comp, int[] entries, int lo, + int hi) { + if (lo >= hi) + return; + if (hi == 1 + lo) { + if (compare(comp, entries[lo], entries[hi]) > 0) { + final int tmp = entries[lo]; + entries[lo] = entries[hi]; + entries[hi] = tmp; + } + return; + } + final int mid = (lo + hi) >>> 1; + if (compare(comp, entries[lo], entries[mid]) > 0) { + int tmp = entries[lo]; + entries[lo] = entries[mid]; + entries[mid] = tmp; + } + + if (compare(comp, entries[mid], entries[hi]) > 0) { + int tmp = entries[mid]; + entries[mid] = entries[hi]; + entries[hi] = tmp; + + if (compare(comp, entries[lo], entries[mid]) > 0) { + int tmp2 = entries[lo]; + entries[lo] = entries[mid]; + entries[mid] = tmp2; + } + } + int left = lo + 1; + int right = hi - 1; + + if (left >= right) + return; + + final int partition = entries[mid]; + + for (;;) { + while (compare(comp, entries[right], partition) > 0) + --right; + + while (left < right && compare(comp, entries[left], partition) <= 0) + ++left; + + if (left < right) { + final int tmp = entries[left]; + entries[left] = entries[right]; + entries[right] = tmp; + --right; + } else { + break; + } + } + + quickSort(comp, entries, lo, left); + quickSort(comp, entries, left + 1, hi); + } + + private final BytesRef scratch1 = new BytesRef(); + private final BytesRef scratch2 = new BytesRef(); + + private final BytesRef deref(int bytesStart, BytesRef b) { + b.bytes = pool.buffers[bytesStart >> BYTES_BLOCK_SHIFT]; + int pos = bytesStart & BYTES_BLOCK_MASK; + + if ((b.bytes[pos] & 0x80) == 0) { + // length is 1 byte + b.length = b.bytes[pos]; + pos += 1; + } else { + // length is 2 bytes + b.length = (b.bytes[pos] & 0x7f) + 
((b.bytes[pos + 1] & 0xff) << 7); + pos += 2; + } + b.offset = pos; + return b; + } + + private boolean equals(int e, BytesRef b) { + return deref(ords[e], scratch1).bytesEquals(b); + } + + private int compare(Comparator comp, int e1, int e2) { + return comp.compare(deref(ords[e1], scratch1), deref(ords[e2], scratch2)); + } + + private boolean shrink(int targetSize) { + + // Cannot use ArrayUtil.shrink because we require power + // of 2: + int newSize = hashSize; + while (newSize >= 8 && newSize / 4 > targetSize) { + newSize /= 2; + } + if (newSize != hashSize) { + hashSize = newSize; + hash = new int[hashSize]; + Arrays.fill(hash, -1); + hashHalfSize = newSize / 2; + hashMask = newSize - 1; + ArrayUtil.shrink(ords, newSize); + Arrays.fill(ords, -1); + return true; + } else { + return false; + } + } + + /** + * Clears the {@link BytesRef} and returns an {@link Entry} which maps to the + * given {@link BytesRef} + */ + public void clear() { + lastCount = count; + count = 0; + pool.reset(); + if (lastCount != -1 && shrink(lastCount)) { + // shrink clears the hash entries + return; + } + Arrays.fill(hash, -1); + Arrays.fill(ords, -1); + } + + /** + * Adds a new {@link BytesRef} + * + * @param bytes + * the bytes to hash + * @return the ord of the hashed bytes + */ + public int add(BytesRef bytes) { + return add(bytes, bytes.hashCode()); + } + + /** + * Adds a new {@link BytesRef} with a pre-calculated hash code. + * + * @param bytes + * the bytes to hash + * @param code + * the bytes hash code + * + *

+ * Hashcode is defined as: + * + *

+   * int hash = 0;
+   * for (int i = offset; i < offset + length; i++) {
+   *   hash = 31 * hash + bytes[i];
+   * }
+   * 
+ * + * @return the ord of the hashed bytes + */ + public int add(BytesRef bytes, int code) { + final int length = bytes.length; + // final position + int hashPos = code & hashMask; + int e = hash[hashPos]; + if (e != -1 && !equals(e, bytes)) { + // Conflict: keep searching different locations in + // the hash table. + final int inc = ((code >> 8) + code) | 1; + do { + code += inc; + hashPos = code & hashMask; + e = hash[hashPos]; + } while (e != -1 && !equals(e, bytes)); + } + + if (e == -1) { + // new entry + final int len2 = 2 + bytes.length; + if (len2 + pool.byteUpto > BYTES_BLOCK_SIZE) { + if (len2 > BYTES_BLOCK_SIZE) { + throw new IllegalArgumentException("bytes can be at most " + + (BYTES_BLOCK_SIZE - 2) + " in length; got " + bytes.length); + } + pool.nextBuffer(); + } + final byte[] buffer = pool.buffer; + final int bufferUpto = pool.byteUpto; + e = count++; + ords[e] = bufferUpto + pool.byteOffset; + + // We first encode the length, followed by the + // bytes. Length is encoded as vInt, but will consume + // 1 or 2 bytes at most (we reject too-long terms, + // above). + if (length < 128) { + // 1 byte to store length + buffer[bufferUpto] = (byte) length; + pool.byteUpto += length + 1; + System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1, + length); + } else { + // 2 byte to store length + buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f)); + buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff); + pool.byteUpto += length + 2; + System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2, + length); + } + assert hash[hashPos] == -1; + hash[hashPos] = e; + + if (count == hashHalfSize) { + rehash(2 * hashSize); + } + } + return e; + } + + /** + * Called when postings hash is too small (> 50% occupied) or too large (< 20% + * occupied). 
+ */ + private void rehash(final int newSize) { + final int newMask = newSize - 1; + final int[] newHash = new int[newSize]; + ords = ArrayUtil.grow(ords, newSize); + Arrays.fill(newHash, -1); + for (int i = 0; i < hashSize; i++) { + final int e0 = hash[i]; + if (e0 != -1) { + int code; + final int off = ords[e0]; + final int start = off & BYTES_BLOCK_MASK; + final byte[] bytes = pool.buffers[off >> BYTES_BLOCK_SHIFT]; + code = 0; + final int len; + int pos; + if ((bytes[start] & 0x80) == 0) { + // length is 1 byte + len = bytes[start]; + pos = start + 1; + } else { + len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7); + pos = start + 2; + } + + final int endPos = pos + len; + while (pos < endPos) { + code = BytesRef.HASH_PRIME * code + bytes[pos++]; + } + + int hashPos = code & newMask; + assert hashPos >= 0; + if (newHash[hashPos] != -1) { + final int inc = ((code >> 8) + code) | 1; + do { + code += inc; + hashPos = code & newMask; + } while (newHash[hashPos] != -1); + } + newHash[hashPos] = e0; + } + } + + hashMask = newMask; + hash = newHash; + hashSize = newSize; + hashHalfSize = newSize / 2; + } + + public long ramBytesUsed() { + return allocator.ramBytesUsed() + + RamUsageEstimator.NUM_BYTES_OBJ_REF + * hashSize + + count + * (RamUsageEstimator.NUM_BYTES_OBJ_HEADER + RamUsageEstimator.NUM_BYTES_INT * 2); + } +} Property changes on: src/java/org/apache/lucene/util/BytesRefHash.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/test/org/apache/lucene/index/TestByteSlices.java =================================================================== --- src/test/org/apache/lucene/index/TestByteSlices.java (revision 1000834) +++ src/test/org/apache/lucene/index/TestByteSlices.java (working copy) @@ -25,7 +25,7 @@ /* Allocate another byte[] from the shared pool */ @Override - synchronized byte[] getByteBlock() { + public synchronized byte[] 
getByteBlock() { final int size = freeByteBlocks.size(); final byte[] b; if (0 == size) @@ -37,13 +37,13 @@ /* Return a byte[] to the pool */ @Override - synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) { + public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) { for(int i=start;i blocks) { + public synchronized void recycleByteBlocks(List blocks) { final int size = blocks.size(); for(int i=0;i strings = new HashMap(); + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + int count = hash.size(); + int ord = hash.add(ref); + if (strings.put(str, Integer.valueOf(ord)) == null) { + assertEquals(i, ord); + assertEquals(hash.size(), count + 1); + } else { + assertTrue(ord < count); + assertEquals(hash.size(), count); + } + } + for (Entry entry : strings.entrySet()) { + ref.copy(entry.getKey()); + assertEquals(ref, hash.get(entry.getValue().intValue())); + } + hash.clear(); + assertEquals(0, hash.size()); + } + } + + /** + * Test method for {@link org.apache.lucene.util.BytesRefHash#compact()}. + */ + @Test + public void testCompact() { + BytesRef ref = new BytesRef(); + for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) { + final int size = 797; + BitSet bits = new BitSet(size); + for (int i = 0; i < size; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + bits.set(hash.add(ref)); + + } + assertEquals(hash.size(), bits.cardinality()); + int[] compact = hash.compact(); + assertTrue(size < compact.length); + for (int i = 0; i < size; i++) { + bits.set(compact[i], false); + } + assertEquals(0, bits.cardinality()); + hash.clear(); + assertEquals(0, hash.size()); + } + } + + /** + * Test method for + * {@link org.apache.lucene.util.BytesRefHash#sort(java.util.Comparator)}. 
+ */ + @Test + public void testSort() { + BytesRef ref = new BytesRef(); + for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) { + SortedSet strings = new TreeSet(); + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + hash.add(ref); + strings.add(str); + } + int[] sort = hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator()); + assertTrue(strings.size() < sort.length); + int i = 0; + for (String string : strings) { + ref.copy(string); + assertEquals(ref, hash.get(sort[i++])); + } + hash.clear(); + assertEquals(0, hash.size()); + } + } + + /** + * Test method for + * {@link org.apache.lucene.util.BytesRefHash#add(org.apache.lucene.util.BytesRef)} + * . + */ + @Test + public void testAdd() { + BytesRef ref = new BytesRef(); + for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) { + Set strings = new HashSet(); + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + int count = hash.size(); + int ord = hash.add(ref); + + if (strings.add(str)) { + assertEquals(i, ord); + assertEquals(hash.size(), count + 1); + } else { + assertTrue(ord < count); + assertEquals(hash.size(), count); + } + } + + int count = hash.size(); + for (String string : strings) { + ref.copy(string); + int ord = hash.add(ref); + assertEquals(count, hash.size()); + assertTrue("ord: " + ord + " count: " + count + " string: " + string, + ord < count); + } + hash.clear(); + assertEquals(0, hash.size()); + } + } + + @Test(expected = IllegalArgumentException.class) + public void testLargeValue() { + int[] sizes = new int[] { random.nextInt(5), + BytesRefHash.BYTES_BLOCK_SIZE - 33 + random.nextInt(31), + BytesRefHash.BYTES_BLOCK_SIZE - 1 + random.nextInt(37) }; + BytesRef ref = new BytesRef(); + for (int i = 0; i < sizes.length; i++) { + ref.bytes = new byte[sizes[i]]; + ref.offset = 0; + 
ref.length = sizes[i]; + try { + assertEquals(i, hash.add(ref)); + } catch (IllegalArgumentException e) { + if (i < sizes.length - 1) + fail("unexpected exception at size: " + sizes[i]); + throw e; + } + } + } + + @Test + public void testSortedIterator() { + BytesRef ref = new BytesRef(); + for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) { + SortedSet strings = new TreeSet(); + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + hash.add(ref); + strings.add(str); + } + HashEntryIterator sortedEntries = hash.sortedEntries(new BytesRef(), BytesRef.getUTF8SortedAsUTF16Comparator()); + for (String string : strings) { + ref.copy(string); + assertTrue(sortedEntries.next()); + assertEquals(ref, sortedEntries.bytes); + } + assertFalse(sortedEntries.next()); + + hash.clear(); + assertEquals(0, hash.size()); + } + } + +} Property changes on: src/test/org/apache/lucene/util/TestBytesRefHash.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL