Index: src/java/org/apache/lucene/index/ByteBlockPool.java =================================================================== --- src/java/org/apache/lucene/index/ByteBlockPool.java (revision 1001293) +++ src/java/org/apache/lucene/index/ByteBlockPool.java (working copy) @@ -1,169 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Class that Posting and PostingVector use to write byte - * streams into shared fixed-size byte[] arrays. The idea - * is to allocate slices of increasing lengths For - * example, the first slice is 5 bytes, the next slice is - * 14, etc. We start by writing our bytes into the first - * 5 bytes. When we hit the end of the slice, we allocate - * the next slice and then write the address of the new - * slice into the last 4 bytes of the previous slice (the - * "forwarding address"). - * - * Each slice is filled with 0's initially, and we mark - * the end with a non-zero byte. This way the methods - * that are writing into the slice don't need to record - * its length and instead allocate a new slice once they - * hit a non-zero byte. 
*/ - -import java.util.Arrays; -import org.apache.lucene.util.BytesRef; -import java.util.List; -import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; -import org.apache.lucene.util.ArrayUtil; - -final class ByteBlockPool { - - abstract static class Allocator { - abstract void recycleByteBlocks(byte[][] blocks, int start, int end); - abstract void recycleByteBlocks(List blocks); - abstract byte[] getByteBlock(); - } - - public byte[][] buffers = new byte[10][]; - - int bufferUpto = -1; // Which buffer we are upto - public int byteUpto = DocumentsWriter.BYTE_BLOCK_SIZE; // Where we are in head buffer - - public byte[] buffer; // Current head buffer - public int byteOffset = -DocumentsWriter.BYTE_BLOCK_SIZE; // Current head offset - - private final Allocator allocator; - - public ByteBlockPool(Allocator allocator) { - this.allocator = allocator; - } - - public void reset() { - if (bufferUpto != -1) { - // We allocated at least one buffer - - for(int i=0;i 0) - // Recycle all but the first buffer - allocator.recycleByteBlocks(buffers, 1, 1+bufferUpto); - - // Re-use the first buffer - bufferUpto = 0; - byteUpto = 0; - byteOffset = 0; - buffer = buffers[0]; - } - } - - public void nextBuffer() { - if (1+bufferUpto == buffers.length) { - byte[][] newBuffers = new byte[ArrayUtil.oversize(buffers.length+1, - NUM_BYTES_OBJECT_REF)][]; - System.arraycopy(buffers, 0, newBuffers, 0, buffers.length); - buffers = newBuffers; - } - buffer = buffers[1+bufferUpto] = allocator.getByteBlock(); - bufferUpto++; - - byteUpto = 0; - byteOffset += DocumentsWriter.BYTE_BLOCK_SIZE; - } - - public int newSlice(final int size) { - if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-size) - nextBuffer(); - final int upto = byteUpto; - byteUpto += size; - buffer[byteUpto-1] = 16; - return upto; - } - - // Size of each slice. These arrays should be at most 16 - // elements (index is encoded with 4 bits). First array - // is just a compact way to encode X+1 with a max. Second - // array is the length of each slice, ie first slice is 5 - // bytes, next slice is 14 bytes, etc. 
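For reference, a tiny stand-alone sketch of the slice-size schedule the comment above describes; the level arrays are copied from this class, while the demo class name and the walk itself are illustrative only and not part of the patch:

// Illustrative only: walks the level arrays used by ByteBlockPool and prints
// the size of each successive slice plus the cumulative bytes a single stream
// would occupy after that many slices.
public class SliceLevelDemo {
  static final int[] NEXT_LEVEL = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
  static final int[] LEVEL_SIZE = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};

  public static void main(String[] args) {
    int level = 0;
    int total = 0;
    for (int slice = 0; slice < 10; slice++) {
      total += LEVEL_SIZE[level];
      System.out.println("slice " + slice + ": level=" + level
          + " size=" + LEVEL_SIZE[level] + " cumulative=" + total);
      level = NEXT_LEVEL[level]; // levels saturate at 9, i.e. 200-byte slices
    }
  }
}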
- final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9}; - final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200}; - final static int FIRST_LEVEL_SIZE = levelSizeArray[0]; - - public int allocSlice(final byte[] slice, final int upto) { - - final int level = slice[upto] & 15; - final int newLevel = nextLevelArray[level]; - final int newSize = levelSizeArray[newLevel]; - - // Maybe allocate another block - if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-newSize) - nextBuffer(); - - final int newUpto = byteUpto; - final int offset = newUpto + byteOffset; - byteUpto += newSize; - - // Copy forward the past 3 bytes (which we are about - // to overwrite with the forwarding address): - buffer[newUpto] = slice[upto-3]; - buffer[newUpto+1] = slice[upto-2]; - buffer[newUpto+2] = slice[upto-1]; - - // Write forwarding address at end of last slice: - slice[upto-3] = (byte) (offset >>> 24); - slice[upto-2] = (byte) (offset >>> 16); - slice[upto-1] = (byte) (offset >>> 8); - slice[upto] = (byte) offset; - - // Write new level: - buffer[byteUpto-1] = (byte) (16|newLevel); - - return newUpto+3; - } - - // Fill in a BytesRef from term's length & bytes encoded in - // byte block - final BytesRef setBytesRef(BytesRef term, int textStart) { - final byte[] bytes = term.bytes = buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT]; - int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK; - if ((bytes[pos] & 0x80) == 0) { - // length is 1 byte - term.length = bytes[pos]; - term.offset = pos+1; - } else { - // length is 2 bytes - term.length = (bytes[pos]&0x7f) + ((bytes[pos+1]&0xff)<<7); - term.offset = pos+2; - } - assert term.length >= 0; - return term; - } -} - Index: src/java/org/apache/lucene/index/ByteSliceReader.java =================================================================== --- src/java/org/apache/lucene/index/ByteSliceReader.java (revision 1001293) +++ src/java/org/apache/lucene/index/ByteSliceReader.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.ByteBlockPool; /* IndexInput that knows how to read the byte slices written * by Posting and PostingVector. 
We read the bytes in @@ -48,16 +49,16 @@ this.endIndex = endIndex; level = 0; - bufferUpto = startIndex / DocumentsWriter.BYTE_BLOCK_SIZE; - bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE; + bufferUpto = startIndex / ByteBlockPool.BYTE_BLOCK_SIZE; + bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE; buffer = pool.buffers[bufferUpto]; - upto = startIndex & DocumentsWriter.BYTE_BLOCK_MASK; + upto = startIndex & ByteBlockPool.BYTE_BLOCK_MASK; final int firstSize = ByteBlockPool.levelSizeArray[0]; if (startIndex+firstSize >= endIndex) { // There is only this one slice to read - limit = endIndex & DocumentsWriter.BYTE_BLOCK_MASK; + limit = endIndex & ByteBlockPool.BYTE_BLOCK_MASK; } else limit = upto+firstSize-4; } @@ -102,11 +103,11 @@ level = ByteBlockPool.nextLevelArray[level]; final int newSize = ByteBlockPool.levelSizeArray[level]; - bufferUpto = nextIndex / DocumentsWriter.BYTE_BLOCK_SIZE; - bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE; + bufferUpto = nextIndex / ByteBlockPool.BYTE_BLOCK_SIZE; + bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE; buffer = pool.buffers[bufferUpto]; - upto = nextIndex & DocumentsWriter.BYTE_BLOCK_MASK; + upto = nextIndex & ByteBlockPool.BYTE_BLOCK_MASK; if (nextIndex + newSize >= endIndex) { // We are advancing to the final slice Index: src/java/org/apache/lucene/index/ByteSliceWriter.java =================================================================== --- src/java/org/apache/lucene/index/ByteSliceWriter.java (revision 1001293) +++ src/java/org/apache/lucene/index/ByteSliceWriter.java (working copy) @@ -1,6 +1,7 @@ package org.apache.lucene.index; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.ByteBlockPool; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -42,9 +43,9 @@ * Set up the writer to write at address. 
*/ public void init(int address) { - slice = pool.buffers[address >> DocumentsWriter.BYTE_BLOCK_SHIFT]; + slice = pool.buffers[address >> ByteBlockPool.BYTE_BLOCK_SHIFT]; assert slice != null; - upto = address & DocumentsWriter.BYTE_BLOCK_MASK; + upto = address & ByteBlockPool.BYTE_BLOCK_MASK; offset0 = address; assert upto < slice.length; } Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 1001293) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -27,6 +27,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map.Entry; +import java.util.concurrent.locks.ReentrantLock; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; @@ -41,8 +42,11 @@ import org.apache.lucene.store.RAMFile; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Constants; +import org.apache.lucene.util.RecyclingByteBlockAllocator; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.RamUsageEstimator; +import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK; +import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE; /** * This class accepts multiple added documents and directly @@ -637,7 +641,7 @@ for(int i=0;i= ramBufferSize) || + (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + bytesUsed()) >= ramBufferSize) || (maxBufferedDeleteTerms != IndexWriterConfig.DISABLE_AUTO_FLUSH && ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms)); } @@ -1256,7 +1260,7 @@ final SkipDocWriter skipDocWriter = new SkipDocWriter(); long getRAMUsed() { - return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed; + return bytesUsed() + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed; } long numBytesUsed; @@ -1295,63 +1299,12 @@ /* Initial chunks size of the shared byte[] blocks used to store postings data */ - final static int BYTE_BLOCK_SHIFT = 15; - final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT; - final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1; final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK; /* if you increase this, you must fix field cache impl for * getTerms/getTermsIndex requires <= 32768 */ final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2; - private class ByteBlockAllocator extends ByteBlockPool.Allocator { - final int blockSize; - - ByteBlockAllocator(int blockSize) { - this.blockSize = blockSize; - } - - ArrayList freeByteBlocks = new ArrayList(); - - /* Allocate another byte[] from the shared pool */ - @Override - byte[] getByteBlock() { - synchronized(DocumentsWriter.this) { - final int size = freeByteBlocks.size(); - final byte[] b; - if (0 == size) { - b = new byte[blockSize]; - numBytesUsed += blockSize; - } else - b = freeByteBlocks.remove(size-1); - return b; - } - } - - /* Return byte[]'s to the pool */ - - @Override - void recycleByteBlocks(byte[][] blocks, int start, int end) { - synchronized(DocumentsWriter.this) { - for(int i=start;i blocks) { - synchronized(DocumentsWriter.this) { - final int size = blocks.size(); - for(int i=0;i= ramBufferSize; + doBalance = bytesUsed() +deletesRAMUsed >= ramBufferSize; } if (doBalance) { if (infoStream != null) - message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) + + message(" RAM: now balance allocations: usedMB=" + toMB(bytesUsed()) + " vs trigger=" + toMB(ramBufferSize) + " deletesMB=" + 
toMB(deletesRAMUsed) + - " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) + - " perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE)); + " byteBlockFree=" + toMB(byteBlockAllocator.bytesUsed()) + + " perDocFree=" + toMB(perDocAllocator.bytesUsed())); - final long startBytesUsed = numBytesUsed + deletesRAMUsed; + final long startBytesUsed = bytesUsed() + deletesRAMUsed; int iter = 0; @@ -1437,16 +1394,16 @@ boolean any = true; - while(numBytesUsed+deletesRAMUsed > freeLevel) { + while(bytesUsed()+deletesRAMUsed > freeLevel) { synchronized(this) { - if (0 == perDocAllocator.freeByteBlocks.size() && - 0 == byteBlockAllocator.freeByteBlocks.size() && + if (0 == perDocAllocator.numBufferedBlocks() && + 0 == byteBlockAllocator.numBufferedBlocks() && 0 == freeIntBlocks.size() && !any) { // Nothing else to free -- must flush now. - bufferIsFull = numBytesUsed+deletesRAMUsed > ramBufferSize; + bufferIsFull = bytesUsed()+deletesRAMUsed > ramBufferSize; if (infoStream != null) { - if (numBytesUsed+deletesRAMUsed > ramBufferSize) + if (bytesUsed()+deletesRAMUsed > ramBufferSize) message(" nothing to free; now set bufferIsFull"); else message(" nothing to free"); @@ -1454,8 +1411,8 @@ break; } - if ((0 == iter % 4) && byteBlockAllocator.freeByteBlocks.size() > 0) { - byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1); + if ((0 == iter % 4) && byteBlockAllocator.numBufferedBlocks() > 0) { + byteBlockAllocator.freeBlocks(1); numBytesUsed -= BYTE_BLOCK_SIZE; } @@ -1464,15 +1421,8 @@ numBytesUsed -= INT_BLOCK_SIZE * INT_NUM_BYTE; } - if ((2 == iter % 4) && perDocAllocator.freeByteBlocks.size() > 0) { - // Remove upwards of 32 blocks (each block is 1K) - for (int i = 0; i < 32; ++i) { - perDocAllocator.freeByteBlocks.remove(perDocAllocator.freeByteBlocks.size() - 1); - numBytesUsed -= PER_DOC_BLOCK_SIZE; - if (perDocAllocator.freeByteBlocks.size() == 0) { - break; - } - } + if ((2 == iter % 4) && perDocAllocator.numBufferedBlocks() > 0) { + perDocAllocator.freeBlocks(32); // Remove upwards of 32 blocks (each block is 1K) } } @@ -1484,7 +1434,7 @@ } if (infoStream != null) - message(" after free: freedMB=" + nf.format((startBytesUsed-numBytesUsed-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.)); + message(" after free: freedMB=" + nf.format((startBytesUsed-bytesUsed()-deletesRAMUsed)/1024./1024.) 
+ " usedMB=" + nf.format((bytesUsed()+deletesRAMUsed)/1024./1024.)); } } Index: src/java/org/apache/lucene/index/FreqProxFieldMergeState.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (revision 1001293) +++ src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (working copy) @@ -19,6 +19,8 @@ import java.io.IOException; import java.util.Comparator; + +import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray; @@ -50,7 +52,7 @@ public FreqProxFieldMergeState(FreqProxTermsWriterPerField field, Comparator termComp) { this.field = field; - this.numPostings = field.termsHashPerField.numPostings; + this.numPostings = field.termsHashPerField.bytesHash.size(); this.bytePool = field.perThread.termsHashPerThread.bytePool; this.termIDs = field.termsHashPerField.sortPostings(termComp); this.postings = (FreqProxPostingsArray) field.termsHashPerField.postingsArray; Index: src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 1001293) +++ src/java/org/apache/lucene/index/FreqProxTermsWriter.java (working copy) @@ -67,7 +67,7 @@ for (final TermsHashConsumerPerField i : fields) { final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i; - if (perField.termsHashPerField.numPostings > 0) + if (perField.termsHashPerField.bytesHash.size() > 0) allFields.add(perField); } } @@ -116,7 +116,7 @@ for(int i=0;i postings.lastDocIDs[termID]; + assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID; // Term not yet seen in the current doc but previously // seen in other doc(s) since the last flush Index: src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java =================================================================== --- src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 1001293) +++ src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Fieldable; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { @@ -80,7 +81,7 @@ assert perThread.doc.docID == docState.docID; - if (termsHashPerField.numPostings != 0) { + if (termsHashPerField.bytesHash.size() != 0) { // Only necessary if previous doc hit a // non-aborting exception while writing vectors in // this field: @@ -106,7 +107,7 @@ assert docState.testPoint("TermVectorsTermsWriterPerField.finish start"); - final int numPostings = termsHashPerField.numPostings; + final int numPostings = termsHashPerField.bytesHash.size(); final BytesRef flushTerm = perThread.flushTerm; Index: src/java/org/apache/lucene/index/TermsHashPerField.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashPerField.java (revision 1001293) +++ src/java/org/apache/lucene/index/TermsHashPerField.java (working copy) @@ -18,15 +18,17 @@ */ import java.io.IOException; -import java.util.Arrays; import java.util.Comparator; import 
org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.BytesRefHash.MaxKeyLengthExceededException; final class TermsHashPerField extends InvertedDocConsumerPerField { + private static final int HASH_INIT_SIZE = 4; final TermsHashConsumerPerField consumer; @@ -46,28 +48,20 @@ final FieldInfo fieldInfo; - boolean postingsCompacted; - int numPostings; - private int postingsHashSize = 4; - private int postingsHashHalfSize = postingsHashSize/2; - private int postingsHashMask = postingsHashSize-1; - private int[] postingsHash; + // nocommit - how to communicate byte usage to DocumentsWriter? + final BytesRefHash bytesHash; ParallelPostingsArray postingsArray; private final BytesRef utf8; - private Comparator termComp; public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) { this.perThread = perThread; intPool = perThread.intPool; bytePool = perThread.bytePool; termBytePool = perThread.termBytePool; + bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE); // TODO maybe 16 is a better default? docState = perThread.docState; - postingsHash = new int[postingsHashSize]; - Arrays.fill(postingsHash, -1); - bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT); - fieldState = docInverterPerField.fieldState; this.consumer = perThread.consumer.addField(this, fieldInfo); initPostingsArray(); @@ -95,19 +89,7 @@ } void shrinkHash(int targetSize) { - assert postingsCompacted || numPostings == 0; - - final int newSize = 4; - if (newSize != postingsHash.length) { - final long previousSize = postingsHash.length; - postingsHash = new int[newSize]; - bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT); - Arrays.fill(postingsHash, -1); - postingsHashSize = newSize; - postingsHashHalfSize = newSize/2; - postingsHashMask = newSize-1; - } - + bytesHash.clear(false); // Fully free the postings array on each flush: if (postingsArray != null) { bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size); @@ -116,14 +98,7 @@ } public void reset() { - if (!postingsCompacted) - compactPostings(); - assert numPostings <= postingsHash.length; - if (numPostings > 0) { - Arrays.fill(postingsHash, 0, numPostings, -1); - numPostings = 0; - } - postingsCompacted = false; + bytesHash.clear(false); if (nextPerField != null) nextPerField.reset(); } @@ -151,139 +126,12 @@ ints[upto+stream]); } - private synchronized void compactPostings() { - int upto = 0; - for(int i=0;i termComp) { - this.termComp = termComp; - compactPostings(); - quickSort(postingsHash, 0, numPostings-1); - return postingsHash; - } - - void quickSort(int[] termIDs, int lo, int hi) { - if (lo >= hi) - return; - else if (hi == 1+lo) { - if (comparePostings(termIDs[lo], termIDs[hi]) > 0) { - final int tmp = termIDs[lo]; - termIDs[lo] = termIDs[hi]; - termIDs[hi] = tmp; - } - return; - } - - int mid = (lo + hi) >>> 1; - - if (comparePostings(termIDs[lo], termIDs[mid]) > 0) { - int tmp = termIDs[lo]; - termIDs[lo] = termIDs[mid]; - termIDs[mid] = tmp; - } - - if (comparePostings(termIDs[mid], termIDs[hi]) > 0) { - int tmp = termIDs[mid]; - termIDs[mid] = termIDs[hi]; - termIDs[hi] = tmp; - - if (comparePostings(termIDs[lo], termIDs[mid]) > 0) { - int tmp2 = 
termIDs[lo]; - termIDs[lo] = termIDs[mid]; - termIDs[mid] = tmp2; - } - } - - int left = lo + 1; - int right = hi - 1; - - if (left >= right) - return; - - int partition = termIDs[mid]; - - for (; ;) { - while (comparePostings(termIDs[right], partition) > 0) - --right; - - while (left < right && comparePostings(termIDs[left], partition) <= 0) - ++left; - - if (left < right) { - int tmp = termIDs[left]; - termIDs[left] = termIDs[right]; - termIDs[right] = tmp; - --right; - } else { - break; - } - } - - quickSort(termIDs, lo, left); - quickSort(termIDs, left + 1, hi); - } - - /** Compares term text for two Posting instance and - * returns -1 if p1 < p2; 1 if p1 > p2; else 0. */ - int comparePostings(int term1, int term2) { - - if (term1 == term2) { - // Our quicksort does this, eg during partition - return 0; - } - - termBytePool.setBytesRef(perThread.tr1, postingsArray.textStarts[term1]); - termBytePool.setBytesRef(perThread.tr2, postingsArray.textStarts[term2]); - - return termComp.compare(perThread.tr1, perThread.tr2); + return bytesHash.sort(termComp); } - /** Test whether the text for current RawPostingList p equals - * current tokenText in utf8. */ - private boolean postingEquals(final int termID) { - final int textStart = postingsArray.textStarts[termID]; - final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT]; - assert text != null; - - int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK; - - final int len; - if ((text[pos] & 0x80) == 0) { - // length is 1 byte - len = text[pos]; - pos += 1; - } else { - // length is 2 bytes - len = (text[pos]&0x7f) + ((text[pos+1]&0xff)<<7); - pos += 2; - } - - if (len == utf8.length) { - final byte[] utf8Bytes = utf8.bytes; - for(int tokenPos=0;tokenPos>8)+code)|1; - do { - code += inc; - hashPos = code & postingsHashMask; - termID = postingsHash[hashPos]; - } while (termID != -1 && postingsArray.textStarts[termID] != textStart); - } - - if (termID == -1) { + int termID = bytesHash.addByOffset(textStart); + if (termID >= 0) { // First time we are seeing this token since we last // flushed the hash. // New posting - termID = numPostings++; if (termID >= postingsArray.size) { growParallelPostingsArray(); } @@ -347,17 +175,11 @@ postingsArray.textStarts[termID] = textStart; - assert postingsHash[hashPos] == -1; - postingsHash[hashPos] = termID; - - if (numPostings == postingsHashHalfSize) - rehashPostings(2*postingsHashSize); - // Init stream slices if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) intPool.nextBuffer(); - if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) + if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) bytePool.nextBuffer(); intUptos = intPool.buffer; @@ -375,6 +197,7 @@ consumer.newTerm(termID); } else { + termID = (-termID)-1; int intStart = postingsArray.intStarts[termID]; intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT]; intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK; @@ -386,105 +209,43 @@ @Override void add() throws IOException { - assert !postingsCompacted; - // We are first in the chain so we must "intern" the // term text into textStart address - // Get the text & hash of this term. 
- int code = termAtt.toBytesRef(utf8); - - int hashPos = code & postingsHashMask; - - // Locate RawPostingList in hash - int termID = postingsHash[hashPos]; - - if (termID != -1 && !postingEquals(termID)) { - // Conflict: keep searching different locations in - // the hash table. - final int inc = ((code>>8)+code)|1; - do { - code += inc; - hashPos = code & postingsHashMask; - termID = postingsHash[hashPos]; - } while (termID != -1 && !postingEquals(termID)); - } - - if (termID == -1) { - - // First time we are seeing this token since we last - // flushed the hash. - final int textLen2 = 2+utf8.length; - if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) { - // Not enough room in current block - - if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) { - // Just skip this term, to remain as robust as - // possible during indexing. A TokenFilter - // can be inserted into the analyzer chain if - // other behavior is wanted (pruning the term - // to a prefix, throwing an exception, etc). - if (docState.maxTermPrefix == null) { - final int saved = utf8.length; - try { - utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8); - docState.maxTermPrefix = utf8.toString(); - } finally { - utf8.length = saved; - } - } - - consumer.skippingLongTerm(); - return; + int termID; + try{ + termID = bytesHash.add(utf8, termAtt.toBytesRef(utf8)); + }catch (MaxKeyLengthExceededException e) { + // Not enough room in current block + // Just skip this term, to remain as robust as + // possible during indexing. A TokenFilter + // can be inserted into the analyzer chain if + // other behavior is wanted (pruning the term + // to a prefix, throwing an exception, etc). + if (docState.maxTermPrefix == null) { + final int saved = utf8.length; + try { + utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8); + docState.maxTermPrefix = utf8.toString(); + } finally { + utf8.length = saved; } - bytePool.nextBuffer(); } - - // New posting - termID = numPostings++; + consumer.skippingLongTerm(); + return; + } + if (termID >= 0) {// New posting if (termID >= postingsArray.size) { growParallelPostingsArray(); } - - assert termID != -1; - assert postingsHash[hashPos] == -1; - - postingsHash[hashPos] = termID; - - final byte[] text = bytePool.buffer; - final int textUpto = bytePool.byteUpto; - postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset; - - // We first encode the length, followed by the UTF8 - // bytes. Length is encoded as vInt, but will consume - // 1 or 2 bytes at most (we reject too-long terms, - // above). - - // encode length @ start of bytes - if (utf8.length < 128) { - // 1 byte to store length - text[textUpto] = (byte) utf8.length; - bytePool.byteUpto += utf8.length + 1; - System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length); - } else { - // 2 byte to store length - text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f)); - text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff); - bytePool.byteUpto += utf8.length + 2; - System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length); - } - - if (numPostings == postingsHashHalfSize) { - rehashPostings(2*postingsHashSize); - bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT); - } - + // nocommit factor this out! 
+ postingsArray.textStarts[termID] = bytesHash.byteStart(termID); // Init stream slices if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) { intPool.nextBuffer(); } - if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) { + if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) { bytePool.nextBuffer(); } @@ -503,6 +264,7 @@ consumer.newTerm(termID); } else { + termID = (-termID)-1; final int intStart = postingsArray.intStarts[termID]; intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT]; intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK; @@ -518,9 +280,9 @@ void writeByte(int stream, byte b) { int upto = intUptos[intUptoStart+stream]; - byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT]; + byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT]; assert bytes != null; - int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK; + int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK; if (bytes[offset] != 0) { // End of slice; allocate a new one offset = bytePool.allocSlice(bytes, offset); @@ -554,60 +316,4 @@ nextPerField.finish(); } - /** Called when postings hash is too small (> 50% - * occupied) or too large (< 20% occupied). */ - void rehashPostings(final int newSize) { - - final int newMask = newSize-1; - - int[] newHash = new int[newSize]; - Arrays.fill(newHash, -1); - for(int i=0;i> DocumentsWriter.BYTE_BLOCK_SHIFT]; - code = 0; - - final int len; - int pos; - if ((text[start] & 0x80) == 0) { - // length is 1 byte - len = text[start]; - pos = start+1; - } else { - len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7); - pos = start+2; - } - - final int endPos = pos+len; - while(pos < endPos) { - code = (code*31) + text[pos++]; - } - } else { - code = postingsArray.textStarts[termID]; - } - - int hashPos = code & newMask; - assert hashPos >= 0; - if (newHash[hashPos] != -1) { - final int inc = ((code>>8)+code)|1; - do { - code += inc; - hashPos = code & newMask; - } while (newHash[hashPos] != -1); - } - newHash[hashPos] = termID; - } - } - - postingsHashMask = newMask; - postingsHash = newHash; - - postingsHashSize = newSize; - postingsHashHalfSize = newSize >> 1; - } } Index: src/java/org/apache/lucene/index/TermsHashPerThread.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashPerThread.java (revision 1001293) +++ src/java/org/apache/lucene/index/TermsHashPerThread.java (working copy) @@ -17,6 +17,7 @@ * limitations under the License. */ +import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; Index: src/java/org/apache/lucene/util/ByteBlockPool.java =================================================================== --- src/java/org/apache/lucene/util/ByteBlockPool.java (revision 1000834) +++ src/java/org/apache/lucene/util/ByteBlockPool.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.lucene.index; +package org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -34,29 +34,56 @@ * hit a non-zero byte. 
*/ import java.util.Arrays; -import org.apache.lucene.util.BytesRef; + import java.util.List; import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; -import org.apache.lucene.util.ArrayUtil; -final class ByteBlockPool { +/** + * + * @lucene.internal + */ +public final class ByteBlockPool { // TODO we need a dedicated test for this class + public final static int BYTE_BLOCK_SHIFT = 15; + public final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT; + public final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1; - abstract static class Allocator { - abstract void recycleByteBlocks(byte[][] blocks, int start, int end); - abstract void recycleByteBlocks(List blocks); - abstract byte[] getByteBlock(); + public abstract static class Allocator { + protected final int blockSize; + + public Allocator(int blockSize){ + this.blockSize = blockSize; + } + public abstract void recycleByteBlocks(byte[][] blocks, int start, int end); + + public void recycleByteBlocks(List blocks) { + final byte[][] b = blocks.toArray(new byte[blocks.size()][]); + recycleByteBlocks(b, 0, b.length); + } + + public byte[] getByteBlock() { + return new byte[blockSize]; + } + + } public byte[][] buffers = new byte[10][]; int bufferUpto = -1; // Which buffer we are upto - public int byteUpto = DocumentsWriter.BYTE_BLOCK_SIZE; // Where we are in head buffer + public int byteUpto = BYTE_BLOCK_SIZE; // Where we are in head buffer public byte[] buffer; // Current head buffer - public int byteOffset = -DocumentsWriter.BYTE_BLOCK_SIZE; // Current head offset + public int byteOffset = -BYTE_BLOCK_SIZE; // Current head offset private final Allocator allocator; + /** + * Creates a new {@link ByteBlockPool} + * + * @param allocator + * - an allocator allocating byte blocks of the size + * {@link #BYTE_BLOCK_SIZE} ({@value #BYTE_BLOCK_SIZE}) + */ public ByteBlockPool(Allocator allocator) { this.allocator = allocator; } @@ -95,11 +122,11 @@ bufferUpto++; byteUpto = 0; - byteOffset += DocumentsWriter.BYTE_BLOCK_SIZE; + byteOffset += BYTE_BLOCK_SIZE; } public int newSlice(final int size) { - if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-size) + if (byteUpto > BYTE_BLOCK_SIZE-size) nextBuffer(); final int upto = byteUpto; byteUpto += size; @@ -112,9 +139,11 @@ // is just a compact way to encode X+1 with a max. Second // array is the length of each slice, ie first slice is 5 // bytes, next slice is 14 bytes, etc. - final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9}; - final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200}; - final static int FIRST_LEVEL_SIZE = levelSizeArray[0]; + + // nocommit - public arrays are not nice! 
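For illustration, a minimal hypothetical writer built on the now-public pool API (newSlice, allocSlice, buffer, FIRST_LEVEL_SIZE); it shows the zero-filled-slice / non-zero-sentinel / forwarding-address protocol described in the comment above and mirrors what ByteSliceWriter and TermsHashPerField.writeByte do. The class name SliceWriterSketch is made up and the code is not part of the patch; a pool could be obtained, for example, via new ByteBlockPool(new RecyclingByteBlockAllocator()).

import org.apache.lucene.util.ByteBlockPool;

// Hypothetical sketch, not part of the patch: append bytes to a single slice
// stream, growing into larger slices when the non-zero end marker is hit.
final class SliceWriterSketch {
  private final ByteBlockPool pool;
  private byte[] slice;
  private int upto;

  SliceWriterSketch(ByteBlockPool pool) {
    this.pool = pool;
    // newSlice allocates the head buffer on demand and returns the offset of a
    // fresh, zero-filled 5-byte slice whose last byte is the non-zero marker.
    upto = pool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
    slice = pool.buffer;
  }

  void writeByte(byte b) {
    if (slice[upto] != 0) {
      // End of the current slice: allocSlice copies the last 3 bytes forward,
      // writes a 4-byte forwarding address into the old slice and returns the
      // position to continue writing at inside the new (larger) slice.
      upto = pool.allocSlice(slice, upto);
      slice = pool.buffer;
    }
    slice[upto++] = b;
  }
}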
+ public final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9}; + public final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200}; + public final static int FIRST_LEVEL_SIZE = levelSizeArray[0]; public int allocSlice(final byte[] slice, final int upto) { @@ -123,7 +152,7 @@ final int newSize = levelSizeArray[newLevel]; // Maybe allocate another block - if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-newSize) + if (byteUpto > BYTE_BLOCK_SIZE-newSize) nextBuffer(); final int newUpto = byteUpto; @@ -150,9 +179,9 @@ // Fill in a BytesRef from term's length & bytes encoded in // byte block - final BytesRef setBytesRef(BytesRef term, int textStart) { - final byte[] bytes = term.bytes = buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT]; - int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK; + public final BytesRef setBytesRef(BytesRef term, int textStart) { + final byte[] bytes = term.bytes = buffers[textStart >> BYTE_BLOCK_SHIFT]; + int pos = textStart & BYTE_BLOCK_MASK; if ((bytes[pos] & 0x80) == 0) { // length is 1 byte term.length = bytes[pos]; Index: src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- src/java/org/apache/lucene/util/BytesRef.java (revision 1001293) +++ src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -30,6 +30,7 @@ * @lucene.experimental */ public final class BytesRef implements Comparable, Externalizable { + static final int HASH_PRIME = 31; public static final byte[] EMPTY_BYTES = new byte[0]; /** The contents of the BytesRef. Should never be {@code null}. */ @@ -182,11 +183,10 @@ */ @Override public int hashCode() { - final int prime = 31; int result = 0; final int end = offset + length; for(int i=offset;i) storing the hashed bytes + * efficiently in continuous storage. The mapping to an int value is + * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased + * for each added {@link BytesRef}. + * + *

+ * Note: The maximum {@link BytesRef} instance passed to + * {@link #add(BytesRef)} must not be longer than {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2 ( + * {@value ByteBlockPool#BYTE_BLOCK_SIZE}-2). The internal storage is limited to 2GB total + * byte storage. + *

+ * + * @lucene.internal + */ +public final class BytesRefHash { + + private final ByteBlockPool pool; + private int hashSize; + private int hashHalfSize; + private int hashMask; + private int count; + private int lastCount = -1; + private int[] keys; + private int[] values; + public static final int DEFAULT_CAPACITY = 16; + + /** + * Creates a new {@link BytesRefHash} + */ + public BytesRefHash(ByteBlockPool pool) { + this(pool, DEFAULT_CAPACITY); + } + + /** + * Creates a new {@link BytesRefHash} + */ + public BytesRefHash(ByteBlockPool pool, int capacity) { + hashSize = capacity; + hashHalfSize = hashSize >> 1; + hashMask = hashSize - 1; + this.pool = pool; + keys = new int[hashSize]; + Arrays.fill(keys, -1); + values = new int[ArrayUtil + .oversize(hashSize, RamUsageEstimator.NUM_BYTES_INT)]; + } + + /** + * Returns the number of {@link BytesRef} values in this {@link BytesRefHash}. + * + * @return the number of {@link BytesRef} values in this {@link BytesRefHash}. + */ + public int size() { + return count; + } + + /** + * Returns the {@link BytesRef} value for the given key. + *

+ * Note: the given key must be a non-negative integer less than the current size ( + * {@link #size()}) + *

+ * + * @param key + * the key + * + * @return a BytesRef instance for the given key + */ + public BytesRef get(int key) { + return pool.setBytesRef(scratch1, values[key]); + } + + /** + * Returns the keys array in arbitrary order. Valid keys start at offset of 0 + * and end at a limit of {@link #size()} - 1 + *

+ * Note: This is a destructive operation. {@link #clear()} must be called in + * order to reuse this {@link BytesRefHash} instance. + *

+ */ + public int[] compact() { + //nocommit unused internally - do still need it? + int upto = 0; + for (int i = 0; i < hashSize; i++) { + if (keys[i] != -1) { + if (upto < i) { + keys[upto] = keys[i]; + keys[i] = -1; + } + upto++; + } + } + + assert upto == count; + lastCount = count; + return keys; + } + + /** + * Returns the values array sorted by the referenced byte values. + *

+ * Note: This is a destructive operation. {@link #clear()} must be called in + * order to reuse this {@link BytesRefHash} instance. + *

+ * + * @param comp + * the {@link Comparator} used for sorting + */ + public int[] sort(Comparator comp) { + final int[] compact = compact(); + quickSort(comp, compact, 0, count - 1); + return compact; + } + + private void quickSort(Comparator comp, int[] entries, int lo, + int hi) { + if (lo >= hi) + return; + if (hi == 1 + lo) { + if (compare(comp, entries[lo], entries[hi]) > 0) { + final int tmp = entries[lo]; + entries[lo] = entries[hi]; + entries[hi] = tmp; + } + return; + } + final int mid = (lo + hi) >>> 1; + if (compare(comp, entries[lo], entries[mid]) > 0) { + int tmp = entries[lo]; + entries[lo] = entries[mid]; + entries[mid] = tmp; + } + + if (compare(comp, entries[mid], entries[hi]) > 0) { + int tmp = entries[mid]; + entries[mid] = entries[hi]; + entries[hi] = tmp; + + if (compare(comp, entries[lo], entries[mid]) > 0) { + int tmp2 = entries[lo]; + entries[lo] = entries[mid]; + entries[mid] = tmp2; + } + } + int left = lo + 1; + int right = hi - 1; + + if (left >= right) + return; + + final int partition = entries[mid]; + + for (;;) { + while (compare(comp, entries[right], partition) > 0) + --right; + + while (left < right && compare(comp, entries[left], partition) <= 0) + ++left; + + if (left < right) { + final int tmp = entries[left]; + entries[left] = entries[right]; + entries[right] = tmp; + --right; + } else { + break; + } + } + + quickSort(comp, entries, lo, left); + quickSort(comp, entries, left + 1, hi); + } + + private final BytesRef scratch1 = new BytesRef(); + private final BytesRef scratch2 = new BytesRef(); + + private boolean equals(int key, BytesRef b) { + return pool.setBytesRef(scratch1, values[key]).bytesEquals(b); + } + + private int compare(Comparator comp, int key1, int key2) { + return comp.compare(pool.setBytesRef(scratch1, values[key1]), pool.setBytesRef(scratch2, values[key2])); + } + + private boolean shrink(int targetSize) { + + // Cannot use ArrayUtil.shrink because we require power + // of 2: + int newSize = hashSize; + while (newSize >= 8 && newSize / 4 > targetSize) { + newSize /= 2; + } + if (newSize != hashSize) { + hashSize = newSize; + keys = new int[hashSize]; + Arrays.fill(keys, -1); + hashHalfSize = newSize / 2; + hashMask = newSize - 1; + ArrayUtil.shrink(values, newSize); + Arrays.fill(values, -1); + return true; + } else { + return false; + } + } + + /** + * Clears the {@link BytesRef} and returns an {@link Entry} which maps to the + * given {@link BytesRef} + */ + public void clear(boolean resetPool) { + lastCount = count; + count = 0; + if(resetPool) + pool.reset(); + if (lastCount != -1 && shrink(lastCount)) { + // shrink clears the hash entries + return; + } + Arrays.fill(keys, -1); + Arrays.fill(values, -1); + } + + public void clear() { + clear(true); + } + + /** + * Adds a new {@link BytesRef} + * + * @param bytes + * the bytes to hash + * @return the key the given bytes are hashed if there was no mapping for the + * given bytes, otherwise (-(key)-1). This guarantees + * that the return value will always be >= 0 if the given bytes + * haven't been hashed before. + * + * @throws MaxKeyLengthExceededException + * if the given bytes are > 2 + + * {@link ByteBlockPool#BYTE_BLOCK_SIZE} + */ + public int add(BytesRef bytes) { + return add(bytes, bytes.hashCode()); + } + + /** + * Adds a new {@link BytesRef} with a pre-calculated hash code. + * + * @param bytes + * the bytes to hash + * @param code + * the bytes hash code + * + *

+ * Hashcode is defined as: + * + * <pre>
+   * int hash = 0;
+   * for (int i = offset; i < offset + length; i++) {
+   *   hash = 31 * hash + bytes[i];
+   * }
+   * </pre>
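As a small sanity check of the convention above (assuming the patched BytesRef; the class name HashCodeConventionSketch is made up and not part of the patch), the documented loop produces the same value as BytesRef#hashCode(), so add(bytes) and add(bytes, bytes.hashCode()) select the same key:

import org.apache.lucene.util.BytesRef;

// Hypothetical sketch, not part of the patch: the documented formula matches
// BytesRef#hashCode() (31-based hash over the referenced bytes).
public class HashCodeConventionSketch {
  public static void main(String[] args) {
    BytesRef ref = new BytesRef();
    ref.copy("some term"); // UTF-8 encode the sample text into the BytesRef

    int hash = 0;
    for (int i = ref.offset; i < ref.offset + ref.length; i++) {
      hash = 31 * hash + ref.bytes[i];
    }
    System.out.println(hash == ref.hashCode()); // expected: true
  }
}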
+ * + * @return the key the given bytes are hashed if there was no mapping for the + * given bytes, otherwise (-(key)-1). This guarantees + * that the return value will always be >= 0 if the given bytes + * haven't been hashed before. + * + * @throws MaxKeyLengthExceededException + * if the given bytes are > 2 + + * {@link ByteBlockPool#BYTE_BLOCK_SIZE} + */ + public int add(BytesRef bytes, int code) { + final int length = bytes.length; + // final position + int hashPos = code & hashMask; + int e = keys[hashPos]; + if (e != -1 && !equals(e, bytes)) { + // Conflict: keep searching different locations in + // the hash table. + final int inc = ((code >> 8) + code) | 1; + do { + code += inc; + hashPos = code & hashMask; + e = keys[hashPos]; + } while (e != -1 && !equals(e, bytes)); + } + + if (e == -1) { + // new entry + final int len2 = 2 + bytes.length; + if (len2 + pool.byteUpto > BYTE_BLOCK_SIZE) { + if (len2 > BYTE_BLOCK_SIZE) { + throw new MaxKeyLengthExceededException("bytes can be at most " + + (BYTE_BLOCK_SIZE - 2) + " in length; got " + bytes.length); + } + pool.nextBuffer(); + } + final byte[] buffer = pool.buffer; + final int bufferUpto = pool.byteUpto; + e = count++; + values[e] = bufferUpto + pool.byteOffset; + + // We first encode the length, followed by the + // bytes. Length is encoded as vInt, but will consume + // 1 or 2 bytes at most (we reject too-long terms, + // above). + if (length < 128) { + // 1 byte to store length + buffer[bufferUpto] = (byte) length; + pool.byteUpto += length + 1; + System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1, + length); + } else { + // 2 byte to store length + buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f)); + buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff); + pool.byteUpto += length + 2; + System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2, + length); + } + assert keys[hashPos] == -1; + keys[hashPos] = e; + + if (count == hashHalfSize) { + rehash(2 * hashSize, true); + } + return e; + } + return -(e + 1); + } + + public int addByOffset(int offset){ + // final position + int code = offset; + int hashPos = offset & hashMask; + int e = keys[hashPos]; + if (e != -1 && values[e] != offset) { + // Conflict: keep searching different locations in + // the hash table. + final int inc = ((code >> 8) + code) | 1; + do { + code += inc; + hashPos = code & hashMask; + e = keys[hashPos]; + } while (e != -1 && values[e] != offset); + } + if (e == -1) { + // new entry + e = count++; + values[e] = offset; + assert keys[hashPos] == -1; + keys[hashPos] = e; + + if (count == hashHalfSize) { + rehash(2 * hashSize, false); + } + return e; + } + return -(e + 1); + } + + /** + * Called when hash is too small (> 50% occupied) or too large (< 20% + * occupied). 
+ */ + private void rehash(final int newSize, boolean hashOnData) { + final int newMask = newSize - 1; + final int[] newHash = new int[newSize]; + values = ArrayUtil.grow(values, newSize); + Arrays.fill(newHash, -1); + for (int i = 0; i < hashSize; i++) { + final int e0 = keys[i]; + if (e0 != -1) { + int code; + if(hashOnData) { + final int off = values[e0]; + final int start = off & BYTE_BLOCK_MASK; + final byte[] bytes = pool.buffers[off >> BYTE_BLOCK_SHIFT]; + code = 0; + final int len; + int pos; + if ((bytes[start] & 0x80) == 0) { + // length is 1 byte + len = bytes[start]; + pos = start + 1; + } else { + len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7); + pos = start + 2; + } + + final int endPos = pos + len; + while (pos < endPos) { + code = BytesRef.HASH_PRIME * code + bytes[pos++]; + } + } else { + code = values[e0]; + } + + int hashPos = code & newMask; + assert hashPos >= 0; + if (newHash[hashPos] != -1) { + final int inc = ((code >> 8) + code) | 1; + do { + code += inc; + hashPos = code & newMask; + } while (newHash[hashPos] != -1); + } + newHash[hashPos] = e0; + } + } + + hashMask = newMask; + keys = newHash; + hashSize = newSize; + hashHalfSize = newSize / 2; + } + + public int byteStart(int key) { + assert key >= 0 && key < count : key; + return values[key]; + } + + @SuppressWarnings("serial") + public static class MaxKeyLengthExceededException extends RuntimeException { + MaxKeyLengthExceededException(String message) { + super(message); + } + } +} Property changes on: src/java/org/apache/lucene/util/BytesRefHash.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java =================================================================== --- src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java (revision 0) +++ src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java (revision 0) @@ -0,0 +1,178 @@ +package org.apache.lucene.util; + +import java.util.concurrent.locks.Lock; + +import org.apache.lucene.util.ByteBlockPool.Allocator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link ByteBlockPool.Allocator} implementation that recycles unused byte + * blocks in a buffer and reuses them in subsequent calls to + * {@link #getByteBlock()}. 
+ * + * @lucene.internal + */ +public final class RecyclingByteBlockAllocator extends ByteBlockPool.Allocator { + private byte[][] freeByteBlocks; + private final int maxBufferedBlocks; + private int blocksUsed; + private int pos = 0; + private final Lock lock; + public static final int DEFAULT_BUFFERED_BLOCKS = 64; + + + /** + * Creates a new {@link RecyclingByteBlockAllocator} + * @param blockSize the block size in bytes + * @param maxBufferedBlocks maximum number of buffered byte block + * @param lock a lock to synchronize allocator access + * + * @see DummyConcurrentLock + */ + public RecyclingByteBlockAllocator(int blockSize, int maxBufferedBlocks, + Lock lock) { + super(blockSize); + freeByteBlocks = new byte[Math.min(10, maxBufferedBlocks)][]; + this.maxBufferedBlocks = maxBufferedBlocks; + this.lock = lock; + } + + /** + * Creates a new {@link RecyclingByteBlockAllocator} with a + * {@link DummyConcurrentLock} instance. + * + * @param blockSize the block size in bytes + * @param maxBufferedBlocks maximum number of buffered byte block + */ + public RecyclingByteBlockAllocator(int blockSize, int maxBufferedBlocks) { + this(blockSize, maxBufferedBlocks, DummyConcurrentLock.INSTANCE); + } + + /** + * Creates a new {@link RecyclingByteBlockAllocator} with a block size of + * {@link ByteBlockPool#BYTE_BLOCK_SIZE} ( + * {@value ByteBlockPool#BYTE_BLOCK_SIZE}, upper buffered docs limit of + * {@link #DEFAULT_BUFFERED_BLOCKS} ({@value #DEFAULT_BUFFERED_BLOCKS}) and a + * {@link DummyConcurrentLock} instance. + * + */ + public RecyclingByteBlockAllocator() { + this(ByteBlockPool.BYTE_BLOCK_SIZE, 64, DummyConcurrentLock.INSTANCE); + } + + @Override + public byte[] getByteBlock() { + try { + lock.lock(); + if (pos == 0) { + blocksUsed++; + return new byte[blockSize]; + } + final byte[] b = freeByteBlocks[--pos]; + freeByteBlocks[pos] = null; + return b; + } finally { + lock.unlock(); + } + } + + @Override + public void recycleByteBlocks(byte[][] blocks, int start, int end) { + try { + lock.lock(); + final int numBlocks = Math.min(maxBufferedBlocks - pos, end - start); + final int size = pos + numBlocks; + if (size >= freeByteBlocks.length) { + final byte[][] newBlocks = new byte[ArrayUtil.oversize(size, + RamUsageEstimator.NUM_BYTES_OBJ_REF)][]; + System.arraycopy(freeByteBlocks, 0, newBlocks, 0, pos); + freeByteBlocks = newBlocks; + } + final int stop = start + numBlocks; + for (int i = start; i < stop; i++) { + freeByteBlocks[pos++] = blocks[i]; + blocks[i] = null; + } + blocksUsed -= (end - stop); + assert blocksUsed >= 0; + } finally { + lock.unlock(); + } + } + + /** + * @return the number of currently buffered blocks + */ + public int numBufferedBlocks() { + try{ + lock.lock(); + return pos; + }finally{ + lock.unlock(); + } + } + + /** + * @return the number of bytes currently allocated by this {@link Allocator} + */ + public long bytesUsed() { + try{ + lock.lock(); + return blocksUsed * (long)blockSize; + }finally{ + lock.unlock(); + } + } + + /** + * @return the maximum number of buffered byte blocks + */ + public int maxBufferedBlocks() { + return maxBufferedBlocks; + } + + /** + * Removes the given number of byte blocks from the buffer if possible. 
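For illustration, a small hypothetical sketch (not part of the patch) of the allocate / recycle / free cycle this allocator provides, using only methods declared in this file; the class name RecyclingAllocatorSketch is made up and the counts in the comments assume the default 64-block buffer:

import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.RecyclingByteBlockAllocator;

// Hypothetical sketch, not part of the patch.
public class RecyclingAllocatorSketch {
  public static void main(String[] args) {
    RecyclingByteBlockAllocator allocator = new RecyclingByteBlockAllocator();

    byte[][] blocks = new byte[4][];
    for (int i = 0; i < blocks.length; i++) {
      blocks[i] = allocator.getByteBlock();   // freshly allocated blocks
    }

    // Hand the blocks back; the allocator nulls the slots and buffers them.
    allocator.recycleByteBlocks(blocks, 0, blocks.length);
    System.out.println(allocator.numBufferedBlocks()); // 4

    byte[] reused = allocator.getByteBlock();  // served from the buffer
    System.out.println(allocator.numBufferedBlocks()); // 3
    System.out.println(reused.length == ByteBlockPool.BYTE_BLOCK_SIZE); // true

    allocator.freeBlocks(allocator.numBufferedBlocks()); // drop the remaining buffered blocks
    System.out.println(allocator.bytesUsed()); // one outstanding block's worth of bytes
  }
}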
+ * @param num the number of byte blocks to remove + * @return the number of actually removed buffers + */ + public int freeBlocks(int num) { + assert num >= 0; + try { + lock.lock(); + final int stop; + final int count; + if (num > pos) { + stop = 0; + count = pos; + } else { + stop = pos - num; + count = num; + } + while (pos > stop) { + freeByteBlocks[--pos] = null; + } + blocksUsed -= count; + assert blocksUsed >= 0; + return count; + } finally { + lock.unlock(); + } + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/test/org/apache/lucene/index/TestByteSlices.java =================================================================== --- src/test/org/apache/lucene/index/TestByteSlices.java (revision 1001293) +++ src/test/org/apache/lucene/index/TestByteSlices.java (working copy) @@ -14,44 +14,16 @@ * limitations under the License. */ -import java.util.ArrayList; -import java.util.List; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.RecyclingByteBlockAllocator; public class TestByteSlices extends LuceneTestCase { - private static class ByteBlockAllocator extends ByteBlockPool.Allocator { - ArrayList freeByteBlocks = new ArrayList(); - - /* Allocate another byte[] from the shared pool */ - @Override - synchronized byte[] getByteBlock() { - final int size = freeByteBlocks.size(); - final byte[] b; - if (0 == size) - b = new byte[DocumentsWriter.BYTE_BLOCK_SIZE]; - else - b = freeByteBlocks.remove(size-1); - return b; - } - - /* Return a byte[] to the pool */ - @Override - synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) { - for(int i=start;i blocks) { - final int size = blocks.size(); - for(int i=0;i strings = new HashMap(); + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + int count = hash.size(); + int key = hash.add(ref); + if (key >= 0) { + assertNull(strings.put(str, Integer.valueOf(key))); + assertEquals(i, key); + assertEquals(hash.size(), count + 1); + } else { + assertTrue((-key)-1 < count); + assertEquals(hash.size(), count); + } + } + for (Entry entry : strings.entrySet()) { + ref.copy(entry.getKey()); + assertEquals(ref, hash.get(entry.getValue().intValue())); + } + hash.clear(); + assertEquals(0, hash.size()); + } + } + + /** + * Test method for {@link org.apache.lucene.util.BytesRefHash#compact()}. + */ + @Test + public void testCompact() { + BytesRef ref = new BytesRef(); + for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) { + final int size = 797; + BitSet bits = new BitSet(size); + for (int i = 0; i < size; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + bits.set(hash.add(ref)); + + } + assertEquals(hash.size(), bits.cardinality()); + int[] compact = hash.compact(); + assertTrue(size < compact.length); + for (int i = 0; i < size; i++) { + bits.set(compact[i], false); + } + assertEquals(0, bits.cardinality()); + hash.clear(); + assertEquals(0, hash.size()); + } + } + + /** + * Test method for + * {@link org.apache.lucene.util.BytesRefHash#sort(java.util.Comparator)}. 
+ */ + @Test + public void testSort() { + BytesRef ref = new BytesRef(); + for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) { + SortedSet strings = new TreeSet(); + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + hash.add(ref); + strings.add(str); + } + int[] sort = hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator()); + assertTrue(strings.size() < sort.length); + int i = 0; + for (String string : strings) { + ref.copy(string); + assertEquals(ref, hash.get(sort[i++])); + } + hash.clear(); + assertEquals(0, hash.size()); + } + } + + /** + * Test method for + * {@link org.apache.lucene.util.BytesRefHash#add(org.apache.lucene.util.BytesRef)} + * . + */ + @Test + public void testAdd() { + BytesRef ref = new BytesRef(); + for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) { + Set strings = new HashSet(); + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + int count = hash.size(); + int key = hash.add(ref); + + if (key >=0) { + assertTrue(strings.add(str)); + assertEquals(i, key); + assertEquals(hash.size(), count + 1); + } else { + assertFalse(strings.add(str)); + assertTrue((-key)-1 < count); + assertEquals(str, hash.get((-key)-1).utf8ToString()); + assertEquals(count, hash.size()); + } + } + + assertAllIn(strings, hash); + hash.clear(); + assertEquals(0, hash.size()); + } + } + + @Test(expected = MaxKeyLengthExceededException.class) + public void testLargeValue() { + int[] sizes = new int[] { random.nextInt(5), + ByteBlockPool.BYTE_BLOCK_SIZE - 33 + random.nextInt(31), + ByteBlockPool.BYTE_BLOCK_SIZE - 1 + random.nextInt(37) }; + BytesRef ref = new BytesRef(); + for (int i = 0; i < sizes.length; i++) { + ref.bytes = new byte[sizes[i]]; + ref.offset = 0; + ref.length = sizes[i]; + try { + assertEquals(i, hash.add(ref)); + } catch (MaxKeyLengthExceededException e) { + if (i < sizes.length - 1) + fail("unexpected exception at size: " + sizes[i]); + throw e; + } + } + } + + /** + * Test method for + * {@link org.apache.lucene.util.BytesRefHash#addByOffset(int)} + * . 
+ */ + @Test + public void testAddByOffset() { + BytesRef ref = new BytesRef(); + BytesRefHash offsetHash = newHash(pool); + for (int j = 0; j < 2 * RANDOM_MULTIPLIER; j++) { + Set strings = new HashSet(); + for (int i = 0; i < 797; i++) { + String str; + do { + str = _TestUtil.randomRealisticUnicodeString(random, 1000); + } while (str.length() == 0); + ref.copy(str); + int count = hash.size(); + int key = hash.add(ref); + + if (key >= 0) { + assertTrue(strings.add(str)); + assertEquals(i, key); + assertEquals(hash.size(), count + 1); + int offsetKey = offsetHash.addByOffset(hash.byteStart(key)); + assertEquals(i, offsetKey); + assertEquals(offsetHash.size(), count + 1); + } else { + assertFalse(strings.add(str)); + assertTrue((-key)-1 < count); + assertEquals(str, hash.get((-key)-1).utf8ToString()); + assertEquals(count, hash.size()); + int offsetKey = offsetHash.addByOffset(hash.byteStart((-key)-1)); + assertTrue((-offsetKey)-1 < count); + assertEquals(str, hash.get((-offsetKey)-1).utf8ToString()); + assertEquals(count, hash.size()); + } + } + + assertAllIn(strings, hash); + for (String string : strings) { + ref.copy(string); + int key = hash.add(ref); + BytesRef bytesRef = offsetHash.get((-key)-1); + assertEquals(ref, bytesRef); + } + + hash.clear(); + assertEquals(0, hash.size()); + offsetHash.clear(); + assertEquals(0, offsetHash.size()); + + } + } + + private void assertAllIn(Set strings, BytesRefHash hash) { + BytesRef ref = new BytesRef(); + int count = hash.size(); + for (String string : strings) { + ref.copy(string); + int key = hash.add(ref); // add again to check duplicates + assertEquals(string, hash.get((-key)-1).utf8ToString()); + assertEquals(count, hash.size()); + assertTrue("key: " + key + " count: " + count + " string: " + string, + key < count); + } + } + + +} Property changes on: src/test/org/apache/lucene/util/TestBytesRefHash.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/test/org/apache/lucene/util/TestRecyclingByteBlockAllocator.java =================================================================== --- src/test/org/apache/lucene/util/TestRecyclingByteBlockAllocator.java (revision 0) +++ src/test/org/apache/lucene/util/TestRecyclingByteBlockAllocator.java (revision 0) @@ -0,0 +1,142 @@ +package org.apache.lucene.util; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.locks.ReentrantLock; + +import org.junit.Before; +import org.junit.Test; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Testcase for {@link RecyclingByteBlockAllocator} + */ +public class TestRecyclingByteBlockAllocator extends LuceneTestCase { + + /** + */ + @Before + public void setUp() throws Exception { + super.setUp(); + } + + private RecyclingByteBlockAllocator newAllocator() { + return new RecyclingByteBlockAllocator(1 << (2 + random.nextInt(15)), + random.nextInt(97), random.nextBoolean()? new ReentrantLock():DummyConcurrentLock.INSTANCE); + } + + @Test + public void testAllocate() { + RecyclingByteBlockAllocator allocator = newAllocator(); + HashSet set = new HashSet(); + byte[] block = allocator.getByteBlock(); + set.add(block); + assertNotNull(block); + final int size = block.length; + + for (int i = 0; i < 97 * RANDOM_MULTIPLIER; i++) { + block = allocator.getByteBlock(); + assertNotNull(block); + assertEquals(size, block.length); + assertTrue("block is returned twice", set.add(block)); + assertEquals(size * (i + 2), allocator.bytesUsed()); // zero based + 1 + assertEquals(0, allocator.numBufferedBlocks()); + } + } + + @Test + public void testAllocateAndRecycle() { + RecyclingByteBlockAllocator allocator = newAllocator(); + HashSet allocated = new HashSet(); + HashSet free = new HashSet(); + + byte[] block = allocator.getByteBlock(); + allocated.add(block); + assertNotNull(block); + final int size = block.length; + + for (int i = 0; i < 97 * RANDOM_MULTIPLIER; i++) { + int num = 1 + random.nextInt(39); + for (int j = 0; j < num; j++) { + block = allocator.getByteBlock(); + free.remove(block); + assertNotNull(block); + assertEquals(size, block.length); + assertTrue("block is returned twice", allocated.add(block)); + assertEquals(size * (allocated.size() + free.size()), allocator + .bytesUsed()); + } + byte[][] array = allocated.toArray(new byte[0][]); + int begin = random.nextInt(array.length); + int end = begin + random.nextInt(array.length - begin); + List selected = new ArrayList(); + for (int j = begin; j < end; j++) { + selected.add(array[j]); + } + allocator.recycleByteBlocks(array, begin, end); + for (int j = begin; j < end; j++) { + byte[] b = selected.remove(0); + assertTrue(allocated.remove(b)); + if (array[j] == null) { // recycled blocks are nulled out + assertTrue(free.add(b)); + } + } + } + } + + @Test + public void testAllocateAndFree() { + RecyclingByteBlockAllocator allocator = newAllocator(); + HashSet allocated = new HashSet(); + int freeButAllocated = 0; + byte[] block = allocator.getByteBlock(); + allocated.add(block); + assertNotNull(block); + final int size = block.length; + + for (int i = 0; i < 97 * RANDOM_MULTIPLIER; i++) { + int num = 1 + random.nextInt(39); + for (int j = 0; j < num; j++) { + block = allocator.getByteBlock(); + freeButAllocated = Math.max(0, freeButAllocated - 1); + assertNotNull(block); + assertEquals(size, block.length); + assertTrue("block is returned twice", allocated.add(block)); + assertEquals(size * (allocated.size() + allocator.numBufferedBlocks()), + allocator.bytesUsed()); + } + + byte[][] array = allocated.toArray(new byte[0][]); + int begin = random.nextInt(array.length); + int end = begin + random.nextInt(array.length - begin); + for (int j = begin; j < end; j++) { + byte[] b = array[j]; + assertTrue(allocated.remove(b)); + } + allocator.recycleByteBlocks(array, begin, end); + // randomly free blocks + int numFreeBlocks = allocator.numBufferedBlocks(); + int freeBlocks = allocator.freeBlocks(random.nextInt(7 + allocator + .maxBufferedBlocks())); + assertEquals(allocator.numBufferedBlocks(), numFreeBlocks - freeBlocks); + } 
+ } +} \ No newline at end of file Property changes on: src/test/org/apache/lucene/util/TestRecyclingByteBlockAllocator.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL
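Finally, a hypothetical end-to-end usage sketch tying the new utility classes together (ByteBlockPool, BytesRefHash, RecyclingByteBlockAllocator), mirroring what the tests above exercise; the class name BytesRefHashUsageSketch and the sample terms are illustrative only and not part of the patch:

import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RecyclingByteBlockAllocator;

// Hypothetical sketch, not part of the patch.
public class BytesRefHashUsageSketch {
  public static void main(String[] args) {
    ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator());
    BytesRefHash hash = new BytesRefHash(pool);

    BytesRef scratch = new BytesRef();
    for (String term : new String[] {"foo", "bar", "foo"}) {
      scratch.copy(term);
      int key = hash.add(scratch);
      if (key >= 0) {
        System.out.println("new term '" + term + "' -> key " + key);
      } else {
        // Already present: add() returns -(existingKey)-1 for duplicates.
        System.out.println("seen term '" + term + "' -> key " + ((-key) - 1));
      }
    }

    // Destructively sort the keys by their referenced bytes and read them back.
    int[] sorted = hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
    for (int i = 0; i < hash.size(); i++) {
      System.out.println(i + ": " + hash.get(sorted[i]).utf8ToString());
    }

    hash.clear(); // also resets the shared pool
  }
}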