Index: lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java =================================================================== --- lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java (revision 1457395) +++ lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java (working copy) @@ -30,16 +30,11 @@ import org.junit.Before; import org.junit.Test; -/** - * - */ public class TestBytesRefHash extends LuceneTestCase { BytesRefHash hash; ByteBlockPool pool; - /** - */ @Override @Before public void setUp() throws Exception { @@ -248,6 +243,43 @@ hash.reinit(); } } + + @Test + public void testFind() throws Exception { + BytesRef ref = new BytesRef(); + BytesRef scratch = new BytesRef(); + int num = atLeast(2); + for (int j = 0; j < num; j++) { + Set strings = new HashSet(); + int uniqueCount = 0; + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random(), 1000); + } while (str.length() == 0); + ref.copyChars(str); + int count = hash.size(); + int key = hash.find(ref); //hash.add(ref); + if (key >= 0) { // string found in hash + assertFalse(strings.add(str)); + assertTrue(key < count); + assertEquals(str, hash.get(key, scratch).utf8ToString()); + assertEquals(count, hash.size()); + } else { + key = hash.add(ref); + assertTrue(strings.add(str)); + assertEquals(uniqueCount, key); + assertEquals(hash.size(), count + 1); + uniqueCount++; + } + } + + assertAllIn(strings, hash); + hash.clear(); + assertEquals(0, hash.size()); + hash.reinit(); + } + } @Test(expected = MaxBytesLengthExceededException.class) public void testLargeValue() { Index: lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java (revision 1457395) +++ lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java (working copy) @@ -30,8 +30,8 @@ /** * {@link BytesRefHash} is a special purpose hash-map like data-structure * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of - * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes - * efficiently in continuous storage. The mapping to the ordinal is + * byte arrays to ids (Map<BytesRef,int>) storing the hashed bytes + * efficiently in continuous storage. The mapping to the id is * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased * for each added {@link BytesRef}. * @@ -58,7 +58,7 @@ private int hashMask; private int count; private int lastCount = -1; - private int[] ords; + private int[] ids; private final BytesStartArray bytesStartArray; private Counter bytesUsed; @@ -86,8 +86,8 @@ hashHalfSize = hashSize >> 1; hashMask = hashSize - 1; this.pool = pool; - ords = new int[hashSize]; - Arrays.fill(ords, -1); + ids = new int[hashSize]; + Arrays.fill(ids, -1); this.bytesStartArray = bytesStartArray; bytesStart = bytesStartArray.init(); bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed(); @@ -104,26 +104,29 @@ } /** - * Populates and returns a {@link BytesRef} with the bytes for the given ord. + * Populates and returns a {@link BytesRef} with the bytes for the given + * bytesID. *

- * Note: the given ord must be a positive integer less that the current size ( - * {@link #size()}) - *

- * - * @param ord the ord - * @param ref the {@link BytesRef} to populate + * Note: the given bytesID must be a positive integer less that the current + * size ({@link #size()}) * - * @return the given BytesRef instance populated with the bytes for the given ord + * @param bytesID + * the id + * @param ref + * the {@link BytesRef} to populate + * + * @return the given BytesRef instance populated with the bytes for the given + * bytesID */ - public BytesRef get(int ord, BytesRef ref) { + public BytesRef get(int bytesID, BytesRef ref) { assert bytesStart != null : "bytesStart is null - not initialized"; - assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length; - pool.setBytesRef(ref, bytesStart[ord]); + assert bytesID < bytesStart.length: "bytesID exceeds byteStart len: " + bytesStart.length; + pool.setBytesRef(ref, bytesStart[bytesID]); return ref; } /** - * Returns the ords array in arbitrary order. Valid ords start at offset of 0 + * Returns the ids array in arbitrary order. Valid ids start at offset of 0 * and end at a limit of {@link #size()} - 1 *

* Note: This is a destructive operation. {@link #clear()} must be called in @@ -131,13 +134,13 @@ *

*/ int[] compact() { - assert bytesStart != null : "Bytesstart is null - not initialized"; + assert bytesStart != null : "bytesStart is null - not initialized"; int upto = 0; for (int i = 0; i < hashSize; i++) { - if (ords[i] != -1) { + if (ids[i] != -1) { if (upto < i) { - ords[upto] = ords[i]; - ords[i] = -1; + ids[upto] = ids[i]; + ids[i] = -1; } upto++; } @@ -145,7 +148,7 @@ assert upto == count; lastCount = count; - return ords; + return ids; } /** @@ -170,25 +173,25 @@ @Override protected int compare(int i, int j) { - final int ord1 = compact[i], ord2 = compact[j]; - assert bytesStart.length > ord1 && bytesStart.length > ord2; - pool.setBytesRef(scratch1, bytesStart[ord1]); - pool.setBytesRef(scratch2, bytesStart[ord2]); + final int id1 = compact[i], id2 = compact[j]; + assert bytesStart.length > id1 && bytesStart.length > id2; + pool.setBytesRef(scratch1, bytesStart[id1]); + pool.setBytesRef(scratch2, bytesStart[id2]); return comp.compare(scratch1, scratch2); } @Override protected void setPivot(int i) { - final int ord = compact[i]; - assert bytesStart.length > ord; - pool.setBytesRef(pivot, bytesStart[ord]); + final int id = compact[i]; + assert bytesStart.length > id; + pool.setBytesRef(pivot, bytesStart[id]); } @Override protected int comparePivot(int j) { - final int ord = compact[j]; - assert bytesStart.length > ord; - pool.setBytesRef(scratch2, bytesStart[ord]); + final int id = compact[j]; + assert bytesStart.length > id; + pool.setBytesRef(scratch2, bytesStart[id]); return comp.compare(pivot, scratch2); } @@ -198,8 +201,8 @@ return compact; } - private boolean equals(int ord, BytesRef b) { - pool.setBytesRef(scratch1, bytesStart[ord]); + private boolean equals(int id, BytesRef b) { + pool.setBytesRef(scratch1, bytesStart[id]); return scratch1.bytesEquals(b); } @@ -213,8 +216,8 @@ if (newSize != hashSize) { bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize)); hashSize = newSize; - ords = new int[hashSize]; - Arrays.fill(ords, -1); + ids = new int[hashSize]; + Arrays.fill(ids, -1); hashHalfSize = newSize / 2; hashMask = newSize - 1; return true; @@ -237,7 +240,7 @@ // shrink clears the hash entries return; } - Arrays.fill(ords, -1); + Arrays.fill(ids, -1); } public void clear() { @@ -249,7 +252,7 @@ */ public void close() { clear(true); - ords = null; + ids = null; bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize); } @@ -258,8 +261,8 @@ * * @param bytes * the bytes to hash - * @return the ord the given bytes are hashed if there was no mapping for the - * given bytes, otherwise (-(ord)-1). This guarantees + * @return the id the given bytes are hashed if there was no mapping for the + * given bytes, otherwise (-(id)-1). This guarantees * that the return value will always be >= 0 if the given bytes * haven't been hashed before. * @@ -289,8 +292,8 @@ * } * * - * @return the ord the given bytes are hashed if there was no mapping for the - * given bytes, otherwise (-(ord)-1). This guarantees + * @return the id the given bytes are hashed if there was no mapping for the + * given bytes, otherwise (-(id)-1). This guarantees * that the return value will always be >= 0 if the given bytes * haven't been hashed before. * @@ -302,19 +305,9 @@ assert bytesStart != null : "Bytesstart is null - not initialized"; final int length = bytes.length; // final position - int hashPos = code & hashMask; - int e = ords[hashPos]; - if (e != -1 && !equals(e, bytes)) { - // Conflict: keep searching different locations in - // the hash table. - final int inc = ((code >> 8) + code) | 1; - do { - code += inc; - hashPos = code & hashMask; - e = ords[hashPos]; - } while (e != -1 && !equals(e, bytes)); - } - + final int hashPos = findHash(bytes, code); + int e = ids[hashPos]; + if (e == -1) { // new entry final int len2 = 2 + bytes.length; @@ -355,8 +348,8 @@ System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2, length); } - assert ords[hashPos] == -1; - ords[hashPos] = e; + assert ids[hashPos] == -1; + ids[hashPos] = e; if (count == hashHalfSize) { rehash(2 * hashSize, true); @@ -365,13 +358,56 @@ } return -(e + 1); } + + /** + * Returns the id of the given {@link BytesRef}. + * + * @see #find(BytesRef, int) + */ + public int find(BytesRef bytes) { + return find(bytes, bytes.hashCode()); + } + /** + * Returns the id of the given {@link BytesRef} with a pre-calculated hash code. + * + * @param bytes + * the bytes to look for + * @param code + * the bytes hash code + * + * @return the id of the given bytes, or {@code -1} if there is no mapping for the + * given bytes. + */ + public int find(BytesRef bytes, int code) { + return ids[findHash(bytes, code)]; + } + + private final int findHash(BytesRef bytes, int code) { + assert bytesStart != null : "Bytesstart is null - not initialized"; + // final position + int hashPos = code & hashMask; + int e = ids[hashPos]; + if (e != -1 && !equals(e, bytes)) { + // Conflict: keep searching different locations in + // the hash table. + final int inc = ((code >> 8) + code) | 1; + do { + code += inc; + hashPos = code & hashMask; + e = ids[hashPos]; + } while (e != -1 && !equals(e, bytes)); + } + + return hashPos; + } + public int addByPoolOffset(int offset) { assert bytesStart != null : "Bytesstart is null - not initialized"; // final position int code = offset; int hashPos = offset & hashMask; - int e = ords[hashPos]; + int e = ids[hashPos]; if (e != -1 && bytesStart[e] != offset) { // Conflict: keep searching different locations in // the hash table. @@ -379,7 +415,7 @@ do { code += inc; hashPos = code & hashMask; - e = ords[hashPos]; + e = ids[hashPos]; } while (e != -1 && bytesStart[e] != offset); } if (e == -1) { @@ -391,8 +427,8 @@ } e = count++; bytesStart[e] = offset; - assert ords[hashPos] == -1; - ords[hashPos] = e; + assert ids[hashPos] == -1; + ids[hashPos] = e; if (count == hashHalfSize) { rehash(2 * hashSize, false); @@ -412,7 +448,7 @@ final int[] newHash = new int[newSize]; Arrays.fill(newHash, -1); for (int i = 0; i < hashSize; i++) { - final int e0 = ords[i]; + final int e0 = ids[i]; if (e0 != -1) { int code; if (hashOnData) { @@ -453,8 +489,8 @@ } hashMask = newMask; - bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length)); - ords = newHash; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ids.length)); + ids = newHash; hashSize = newSize; hashHalfSize = newSize / 2; } @@ -469,25 +505,25 @@ bytesStart = bytesStartArray.init(); } - if (ords == null) { - ords = new int[hashSize]; + if (ids == null) { + ids = new int[hashSize]; bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize); } } /** * Returns the bytesStart offset into the internally used - * {@link ByteBlockPool} for the given ord + * {@link ByteBlockPool} for the given bytesID * - * @param ord - * the ord to look up + * @param bytesID + * the id to look up * @return the bytesStart offset into the internally used - * {@link ByteBlockPool} for the given ord + * {@link ByteBlockPool} for the given id */ - public int byteStart(int ord) { - assert bytesStart != null : "Bytesstart is null - not initialized"; - assert ord >= 0 && ord < count : ord; - return bytesStart[ord]; + public int byteStart(int bytesID) { + assert bytesStart != null : "bytesStart is null - not initialized"; + assert bytesID >= 0 && bytesID < count : bytesID; + return bytesStart[bytesID]; } /**