Index: lucene/core/src/java/org/apache/lucene/util/SentinelIntSet.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.util;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.util.Arrays;\n\n/**\n * A native int set where one value is reserved to mean \"EMPTY\"\n *\n * @lucene.internal\n */\npublic class SentinelIntSet {\n public int[] keys;\n public int count;\n public final int emptyVal;\n public int rehashCount; // the count at which a rehash should be done\n\n /**\n *\n * @param size The minimum number of elements this set should be able to hold without re-hashing (i.e. 
the slots are guaranteed not to change)\n * @param emptyVal The integer value to use for EMPTY\n */\n public SentinelIntSet(int size, int emptyVal) {\n this.emptyVal = emptyVal;\n int tsize = Math.max(org.apache.lucene.util.BitUtil.nextHighestPowerOfTwo(size), 1);\n rehashCount = tsize - (tsize>>2);\n if (size >= rehashCount) { // should be able to hold \"size\" w/o rehashing\n tsize <<= 1;\n rehashCount = tsize - (tsize>>2);\n }\n keys = new int[tsize];\n if (emptyVal != 0)\n clear();\n }\n\n public void clear() {\n Arrays.fill(keys, emptyVal);\n count = 0;\n }\n\n public int hash(int key) {\n return key;\n }\n\n public int size() { return count; }\n\n /** returns the slot for this key */\n public int getSlot(int key) {\n assert key != emptyVal;\n int h = hash(key);\n int s = h & (keys.length-1);\n if (keys[s] == key || keys[s]== emptyVal) return s;\n\n int increment = (h>>7)|1;\n do {\n s = (s + increment) & (keys.length-1);\n } while (keys[s] != key && keys[s] != emptyVal);\n return s;\n }\n\n /** returns the slot for this key, or -slot-1 if not found */\n public int find(int key) {\n assert key != emptyVal;\n int h = hash(key);\n int s = h & (keys.length-1);\n if (keys[s] == key) return s;\n if (keys[s] == emptyVal) return -s-1;\n\n int increment = (h>>7)|1;\n for(;;) {\n s = (s + increment) & (keys.length-1);\n if (keys[s] == key) return s;\n if (keys[s] == emptyVal) return -s-1;\n }\n }\n\n public boolean exists(int key) {\n return find(key) >= 0;\n }\n\n public int put(int key) {\n int s = find(key);\n if (s < 0) {\n count++;\n if (count >= rehashCount) {\n rehash();\n s = getSlot(key);\n } else {\n s = -s-1;\n }\n keys[s] = key;\n }\n return s;\n }\n\n public void rehash() {\n int newSize = keys.length << 1;\n int[] oldKeys = keys;\n keys = new int[newSize];\n if (emptyVal != 0) Arrays.fill(keys, emptyVal);\n\n for (int i=0; i>2);\n }\n}\n =================================================================== --- 
lucene/core/src/java/org/apache/lucene/util/SentinelIntSet.java (revision 0f77e93b5c952ddafa7215db92d6b995526005e0) +++ lucene/core/src/java/org/apache/lucene/util/SentinelIntSet.java (revision ) @@ -20,7 +20,20 @@ import java.util.Arrays; /** - * A native int set where one value is reserved to mean "EMPTY" + * A native int hash-based set where one value is reserved to mean "EMPTY" internally. The space overhead is fairly low + * as there is only one power-of-two sized int[] to hold the values. The set is re-hashed when adding a value that + * would make it >= 75% full. Consider extending and overriding {@link #hash(int)} if the values might be poor + * hash keys; Lucene docids should be fine. + * The internal fields are exposed publicly to enable more efficient use at the expense of better O-O principles. + * <p>

+ * To iterate over the integers held in this set, simply use code like this: + * <pre>

+ * SentinelIntSet set = ...
+ * for (int v : set.keys) {
+ *   if (v == set.emptyVal)
+ *     continue;
+ *   //use v...
+ * }</pre>
* * @lucene.internal */ @@ -32,14 +45,14 @@ /** * - * @param size The minimum number of elements this set should be able to hold without re-hashing (i.e. the slots are guaranteed not to change) + * @param size The minimum number of elements this set should be able to hold without rehashing (i.e. the slots are guaranteed not to change) * @param emptyVal The integer value to use for EMPTY */ public SentinelIntSet(int size, int emptyVal) { this.emptyVal = emptyVal; int tsize = Math.max(org.apache.lucene.util.BitUtil.nextHighestPowerOfTwo(size), 1); rehashCount = tsize - (tsize>>2); - if (size >= rehashCount) { // should be able to hold "size" w/o rehashing + if (size >= rehashCount) { // should be able to hold "size" w/o re-hashing tsize <<= 1; rehashCount = tsize - (tsize>>2); } @@ -53,13 +66,16 @@ count = 0; } + /** (internal) Return the hash for the key. The default implementation just returns the key, which is not appropriate + * for general purpose use. + */ public int hash(int key) { return key; } public int size() { return count; } - /** returns the slot for this key */ + /** (internal) Returns the slot for this key */ public int getSlot(int key) { assert key != emptyVal; int h = hash(key); @@ -73,7 +89,7 @@ return s; } - /** returns the slot for this key, or -slot-1 if not found */ + /** (internal) Returns the slot for this key, or -slot-1 if not found */ public int find(int key) { assert key != emptyVal; int h = hash(key); @@ -93,6 +109,8 @@ return find(key) >= 0; } + /** Puts this key in the set, and returns the slot index it was added to. It rehashes if adding it would make the set + * at least 75% full. */ public int put(int key) { int s = find(key); if (s < 0) { @@ -108,14 +126,14 @@ return s; } + /** (internal) Rehashes by doubling {@code int[] keys} and filling with the old values. */ public void rehash() { int newSize = keys.length << 1; int[] oldKeys = keys; keys = new int[newSize]; if (emptyVal != 0) Arrays.fill(keys, emptyVal); - for (int i=0; i