Index: src/test/org/apache/lucene/TestSnapshotDeletionPolicy.java =================================================================== --- src/test/org/apache/lucene/TestSnapshotDeletionPolicy.java (revision 664673) +++ src/test/org/apache/lucene/TestSnapshotDeletionPolicy.java (working copy) @@ -121,9 +121,8 @@ try { writer.addDocument(doc); } catch (Throwable t) { - RuntimeException re = new RuntimeException("addDocument failed"); - re.initCause(t); - throw re; + t.printStackTrace(System.out); + fail("addDocument failed"); } } try { Index: src/test/org/apache/lucene/TestDemo.java =================================================================== --- src/test/org/apache/lucene/TestDemo.java (revision 664673) +++ src/test/org/apache/lucene/TestDemo.java (working copy) @@ -32,6 +32,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; /** * A very simple demo used in the API documentation (src/java/overview.html). @@ -58,6 +59,8 @@ Field.Index.TOKENIZED)); iwriter.addDocument(doc); iwriter.close(); + + _TestUtil.checkIndex(directory); // Now search the index: IndexSearcher isearcher = new IndexSearcher(directory); Index: src/test/org/apache/lucene/search/TestSort.java =================================================================== --- src/test/org/apache/lucene/search/TestSort.java (revision 664673) +++ src/test/org/apache/lucene/search/TestSort.java (working copy) @@ -648,34 +648,35 @@ // make sure the documents returned by the search match the expected list private void assertMatches (Searcher searcher, Query query, Sort sort, String expectedResult) throws IOException { - ScoreDoc[] result = searcher.search (query, null, 1000, sort).scoreDocs; - StringBuffer buff = new StringBuffer(10); - int n = result.length; - for (int i=0; i 0) + state.position += state.analyzer.getPositionIncrementGap(fieldInfo.name); + + if (!field.isTokenized()) { // un-tokenized field + String stringValue = field.stringValue(); + final int valueLength = stringValue.length(); + Token token = perThread.localToken; + token.clear(); + char[] termBuffer = token.termBuffer(); + if (termBuffer.length < valueLength) + termBuffer = token.resizeTermBuffer(valueLength); + stringValue.getChars(0, valueLength, termBuffer, 0); + token.setTermLength(valueLength); + token.setStartOffset(state.offset); + token.setEndOffset(state.offset + stringValue.length()); + consumer.add(token); + state.offset += stringValue.length(); + state.length++; + state.position++; + } else { // tokenized field + final TokenStream stream; + final TokenStream streamValue = field.tokenStreamValue(); + + if (streamValue != null) + stream = streamValue; + else { + // the field does not have a TokenStream, + // so we have to obtain one from the analyzer + final Reader reader; // find or make Reader + final Reader readerValue = field.readerValue(); + + if (readerValue != null) + reader = readerValue; + else { + String stringValue = field.stringValue(); + if (stringValue == null) + throw new IllegalArgumentException("field must have either TokenStream, String or Reader value"); + perThread.stringReader.init(stringValue); + reader = perThread.stringReader; + } + + // Tokenize field and add to postingTable + stream = state.analyzer.reusableTokenStream(fieldInfo.name, reader); + } + + // reset the TokenStream to the first token + stream.reset(); + + try { + int offsetEnd = state.offset-1; + final Token localToken = 
perThread.localToken; + for(;;) { + Token token = stream.next(localToken); + if (token == null) break; + state.position += (token.getPositionIncrement() - 1); + consumer.add(token); + state.position++; + offsetEnd = state.offset + token.endOffset(); + if (++state.length >= maxFieldLength) { + if (state.infoStream != null) + state.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens"); + break; + } + } + state.offset = offsetEnd+1; + } finally { + stream.close(); + } + } + + state.boost *= field.getBoost(); + consumer.end(field, state); + } + } + } +} Property changes on: src/java/org/apache/lucene/index/DocInverterPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/NormsWriterPerField.java =================================================================== --- src/java/org/apache/lucene/index/NormsWriterPerField.java (revision 0) +++ src/java/org/apache/lucene/index/NormsWriterPerField.java (revision 0) @@ -0,0 +1,80 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.search.Similarity; + +/** Taps into DocInverter, as an InvertedDocEndConsumer, + * which is called at the end of inverting each field. We + * just look at the length for the field (docState.length) + * and record the norm. 
*/ + +class NormsWriterPerField extends InvertedDocEndConsumerPerField implements Comparable { + + final NormsWriterPerThread perThread; + final FieldInfo fieldInfo; + + // Holds all docID/norm pairs we've seen + int[] docIDs = new int[1]; + byte[] norms = new byte[1]; + int upto; + + public void reset() { + + // Shrink back if we are overallocated now: + // nocommit -- verify this doesn't run hot + docIDs = ArrayUtil.shrink(docIDs, upto); + norms = ArrayUtil.shrink(norms, upto); + + upto = 0; + } + + public NormsWriterPerField(final NormsWriterPerThread perThread, final FieldInfo fieldInfo) { + this.perThread = perThread; + this.fieldInfo = fieldInfo; + } + + void start(Fieldable[] fields, int count, DocumentsWriter.DocState docState) { + } + + void abort() { + upto = 0; + } + + public int compareTo(Object o) { + NormsWriterPerField other = (NormsWriterPerField) o; + return fieldInfo.name.compareTo(other.fieldInfo.name); + } + + void finish(DocumentsWriter.DocState docState) { + if (fieldInfo.isIndexed && !fieldInfo.omitNorms) { + if (docIDs.length <= upto) { + docIDs = ArrayUtil.grow(docIDs, 1+upto); + norms = ArrayUtil.grow(norms, 1+upto); + } + docIDs[upto] = docState.docID; + + final float norm = docState.boost * docState.similarity.lengthNorm(fieldInfo.name, docState.length); + norms[upto] = Similarity.encodeNorm(norm); + + upto++; + } + } +} Property changes on: src/java/org/apache/lucene/index/NormsWriterPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java (revision 0) +++ src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java (revision 0) @@ -0,0 +1,26 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +abstract class TermsHashConsumerPerThread { + abstract void startDocument(DocumentsWriter.DocState docState) throws IOException; + abstract DocumentsWriter.DocWriter finishDocument(DocumentsWriter.DocState docState) throws IOException; + abstract public TermsHashConsumerPerField addField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo); +} Property changes on: src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocConsumerPerThread.java =================================================================== --- src/java/org/apache/lucene/index/DocConsumerPerThread.java (revision 0) +++ src/java/org/apache/lucene/index/DocConsumerPerThread.java (revision 0) @@ -0,0 +1,33 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +abstract class DocConsumerPerThread { + + // NOTE: if this throws a non-aborting exception (for + // example, analyzer produced some number of tokens but + // then hit an exception), then finishDocument is still + // called + abstract void processDocument(DocumentsWriter.DocState state) throws IOException; + + abstract DocumentsWriter.DocWriter finishDocument(DocumentsWriter.DocState state) throws IOException; + + abstract void abort(); +} Property changes on: src/java/org/apache/lucene/index/DocConsumerPerThread.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java (revision 664673) +++ src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java (working copy) @@ -1,89 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -/** Used by DocumentsWriter to merge the postings from - * multiple ThreadStates when creating a segment */ -final class DocumentsWriterFieldMergeState { - - DocumentsWriterFieldData field; - - Posting[] postings; - - private Posting p; - char[] text; - int textOffset; - - private int postingUpto = -1; - - ByteSliceReader freq = new ByteSliceReader(); - ByteSliceReader prox = new ByteSliceReader(); - - int docID; - int termFreq; - - boolean nextTerm() throws IOException { - postingUpto++; - if (postingUpto == field.numPostings) - return false; - - p = postings[postingUpto]; - docID = 0; - - text = field.threadState.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK; - - if (p.freqUpto > p.freqStart) - freq.init(field.threadState.postingsPool, p.freqStart, p.freqUpto); - else - freq.bufferOffset = freq.upto = freq.endIndex = 0; - - prox.init(field.threadState.postingsPool, p.proxStart, p.proxUpto); - - // Should always be true - boolean result = nextDoc(); - assert result; - - return true; - } - - public boolean nextDoc() throws IOException { - if (freq.bufferOffset + freq.upto == freq.endIndex) { - if (p.lastDocCode != -1) { - // Return last doc - docID = p.lastDocID; - termFreq = p.docFreq; - p.lastDocCode = -1; - return true; - } else - // EOF - return false; - } - - final int code = freq.readVInt(); - docID += code >>> 1; - if ((code & 1) != 0) - termFreq = 1; - else - termFreq = freq.readVInt(); - - return true; - } -} Index: src/java/org/apache/lucene/index/FreqProxFieldMergeState.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (revision 0) +++ src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (revision 0) @@ -0,0 +1,97 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +// TODO FI: some of this is "generic" to TermsHash* so we +// should factor it out so other consumers don't have to +// duplicate this code + +/** Used by DocumentsWriter to merge the postings from + * multiple ThreadStates when creating a segment */ +final class FreqProxFieldMergeState { + + final FreqProxTermsWriterPerField field; + final int numPostings; + final CharBlockPool charPool; + final RawPostingList[] postings; + + private FreqProxTermsWriter.PostingList p; + char[] text; + int textOffset; + + private int postingUpto = -1; + + final ByteSliceReader freq = new ByteSliceReader(); + final ByteSliceReader prox = new ByteSliceReader(); + + int docID; + int termFreq; + + public FreqProxFieldMergeState(FreqProxTermsWriterPerField field) { + this.field = field; + this.charPool = field.perThread.termsHashPerThread.charPool; + this.numPostings = field.termsHashPerField.numPostings; + this.postings = field.termsHashPerField.sortPostings(); + } + + boolean nextTerm() throws IOException { + postingUpto++; + if (postingUpto == numPostings) + return false; + + p = (FreqProxTermsWriter.PostingList) postings[postingUpto]; + docID = 0; + + text = charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; + textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK; + + field.termsHashPerField.initReader(freq, p, 0); + field.termsHashPerField.initReader(prox, p, 1); + + // Should always be true + boolean result = nextDoc(); + assert result; + + return true; + } + + public boolean nextDoc() throws IOException { + if (freq.eof()) { + if (p.lastDocCode != -1) { + // Return last doc + docID = p.lastDocID; + termFreq = p.docFreq; + p.lastDocCode = -1; + return true; + } else + // EOF + return false; + } + + final int code = freq.readVInt(); + docID += code >>> 1; + if ((code & 1) != 0) + termFreq = 1; + else + termFreq = freq.readVInt(); + + return true; + } +} Property changes on: src/java/org/apache/lucene/index/FreqProxFieldMergeState.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermsHash.java =================================================================== --- src/java/org/apache/lucene/index/TermsHash.java (revision 0) +++ src/java/org/apache/lucene/index/TermsHash.java (revision 0) @@ -0,0 +1,262 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Collection; +import java.util.Map; +import java.util.HashMap; +import java.util.Iterator; +import java.util.HashSet; +import java.util.Arrays; +import java.io.IOException; + +import org.apache.lucene.util.ArrayUtil; + +// nocommit -- make sure we pare back PostingsFreeList if +// that array is too large + +/** This class implements {@link InvertedDocConsumer}, which + * is passed each token produced by the analyzer on each + * field. It stores these tokens in a hash table, and + * allocates separate byte streams per token. Consumers of + * this class, eg {@link FreqProxTermsWriter} and {@link + * TermVectorsTermsWriter}, write their own byte streams + * under each term. + */ + +class TermsHash extends InvertedDocConsumer { + + final TermsHashConsumer consumer; + final TermsHash nextTermsHash; + final int bytesPerPosting; + final int postingsFreeChunk; + final int streamCount; + final DocumentsWriter docWriter; + TermsHash primaryTermsHash; + + RawPostingList[] postingsFreeList = new RawPostingList[1]; + int postingsFreeCount; + int postingsAllocCount; + boolean trackAllocations; + + public TermsHash(final DocumentsWriter docWriter, boolean trackAllocations, final int streamCount, final TermsHashConsumer consumer, final TermsHash nextTermsHash) { + this.docWriter = docWriter; + this.streamCount = streamCount; + this.consumer = consumer; + this.nextTermsHash = nextTermsHash; + this.trackAllocations = trackAllocations; + + // Why + 4*POINTER_NUM_BYTE below? + // +1: Posting is referenced by postingsFreeList array + // +3: Posting is referenced by hash, which + // targets 25-50% fill factor; approximate this + // as 3X # pointers + bytesPerPosting = consumer.bytesPerPosting() + 4*DocumentsWriter.POINTER_NUM_BYTE + streamCount*DocumentsWriter.INT_NUM_BYTE; + postingsFreeChunk = (int) (DocumentsWriter.BYTE_BLOCK_SIZE / bytesPerPosting); + } + + InvertedDocConsumerPerThread addThread() { + return new TermsHashPerThread(this, nextTermsHash, null); + } + + TermsHashPerThread addThread(TermsHashPerThread primaryPerThread) { + return new TermsHashPerThread(this, nextTermsHash, primaryPerThread); + } + + public void abort() { + consumer.abort(); + reset(); + if (nextTermsHash != null) + nextTermsHash.abort(); + } + + void reset() { + + // nocommit -- should this happen here? because now we + // do this "as we go" + /* + final Iterator threads = parent.getAllThreads().iterator(); + while(threads.hasNext()) { + TermsHashPerThread perThread = (TermsHashPerThread) threads.next(); + perThread.reset(); + } + */ + + // nocommit -- too harsh? 
+ assert postingsFreeCount == postingsAllocCount; + + final int newSize = ArrayUtil.getNextSize(postingsAllocCount); + if (newSize < postingsFreeList.length) { + // nocommit -- make sure not running hot + RawPostingList[] newArray = new RawPostingList[newSize]; + System.arraycopy(postingsFreeList, 0, newArray, 0, postingsFreeCount); + postingsFreeList = newArray; + } + } + + void closeDocStore(DocumentsWriter.FlushState state) throws IOException { + consumer.closeDocStore(state); + if (nextTermsHash != null) + nextTermsHash.closeDocStore(state); + } + + void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException { + Map childThreadsAndFields = new HashMap(); + Map nextThreadsAndFields; + + if (nextTermsHash != null) + nextThreadsAndFields = new HashMap(); + else + nextThreadsAndFields = null; + + Iterator it = threadsAndFields.entrySet().iterator(); + while(it.hasNext()) { + + Map.Entry entry = (Map.Entry) it.next(); + + TermsHashPerThread perThread = (TermsHashPerThread) entry.getKey(); + + Collection fields = (Collection) entry.getValue(); + + Iterator fieldsIt = fields.iterator(); + Collection childFields = new HashSet(); + Collection nextChildFields; + + if (nextTermsHash != null) + nextChildFields = new HashSet(); + else + nextChildFields = null; + + while(fieldsIt.hasNext()) { + TermsHashPerField perField = (TermsHashPerField) fieldsIt.next(); + childFields.add(perField.consumer); + if (nextTermsHash != null) + nextChildFields.add(perField.nextPerField); + } + + childThreadsAndFields.put(perThread.consumer, childFields); + if (nextTermsHash != null) + nextThreadsAndFields.put(perThread.nextPerThread, nextChildFields); + } + + consumer.flush(childThreadsAndFields, state); + + if (nextTermsHash != null) + nextTermsHash.flush(nextThreadsAndFields, state); + + reset(); + } + + synchronized public boolean freeRAM() { + + if (!trackAllocations) + return false; + + boolean any; + final int numToFree; + if (postingsFreeCount >= postingsFreeChunk) + numToFree = postingsFreeChunk; + else + numToFree = postingsFreeCount; + any = numToFree > 0; + if (any) { + Arrays.fill(postingsFreeList, postingsFreeCount-numToFree, postingsFreeCount, null); + postingsFreeCount -= numToFree; + postingsAllocCount -= numToFree; + docWriter.bytesAllocated(-numToFree * bytesPerPosting); + any = true; + } + + if (nextTermsHash != null) + any |= nextTermsHash.freeRAM(); + + return any; + } + + // USE ONLY FOR DEBUGGING! + /* + public String getPostingText() { + char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT]; + int upto = p.textStart & CHAR_BLOCK_MASK; + while(text[upto] != 0xffff) + upto++; + return new String(text, p.textStart, upto-(p.textStart & BYTE_BLOCK_MASK)); + } + */ + + synchronized public void recyclePostings(final RawPostingList[] postings, final int numPostings) { + + assert postings.length >= numPostings; + + // Move all Postings from this ThreadState back to our + // free list. 
We pre-allocated this array while we were + // creating Postings to make sure it's large enough + + // nocommit -- remove this + for(int i=0;i= 0; + assert start + numToCopy <= postingsFreeList.length; + assert numToCopy <= postings.length; + System.arraycopy(postingsFreeList, start, + postings, 0, numToCopy); + postingsFreeCount -= numToCopy; + + // Directly allocate the remainder if any + if (numToCopy < postings.length) { + final int extra = postings.length - numToCopy; + final int newPostingsAllocCount = postingsAllocCount + extra; + + if (newPostingsAllocCount > postingsFreeList.length) + postingsFreeList = new RawPostingList[ArrayUtil.getNextSize(newPostingsAllocCount)]; + postingsAllocCount += extra; + + // nocommit -- understand what OOM here will do + consumer.createPostings(postings, numToCopy, extra); + + // nocommit -- we used to do this one at a time in the + // loop below + if (trackAllocations) + docWriter.bytesAllocated(extra * bytesPerPosting); + } + + // nocommit -- instead of calling sync method every time + // we grab bytes, just gather up the totals per-thread + // non-sync, then in one place that's already sync'd on + // DW we can sum in our totals + if (trackAllocations) + docWriter.bytesUsed(postings.length * bytesPerPosting); + } +} Property changes on: src/java/org/apache/lucene/index/TermsHash.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FieldsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FieldsWriter.java (revision 664673) +++ src/java/org/apache/lucene/index/FieldsWriter.java (working copy) @@ -112,6 +112,10 @@ doClose = false; } + void setFieldsStream(IndexOutput stream) { + this.fieldsStream = stream; + } + // Writes the contents of buffer into the fields stream // and adds a new entry for this document into the index // stream. 
This assumes the buffer was already written @@ -129,10 +133,37 @@ final void close() throws IOException { if (doClose) { - if (fieldsStream != null) - fieldsStream.close(); - if (indexStream != null) - indexStream.close(); + + try { + if (fieldsStream != null) { + try { + fieldsStream.close(); + } finally { + fieldsStream = null; + } + } + } catch (IOException ioe) { + try { + if (indexStream != null) { + try { + indexStream.close(); + } finally { + indexStream = null; + } + } + } catch (IOException ioe2) { + // Ignore so we throw only first IOException hit + } + throw ioe; + } finally { + if (indexStream != null) { + try { + indexStream.close(); + } finally { + indexStream = null; + } + } + } } } Index: src/java/org/apache/lucene/index/ByteSliceReader.java =================================================================== --- src/java/org/apache/lucene/index/ByteSliceReader.java (revision 664673) +++ src/java/org/apache/lucene/index/ByteSliceReader.java (working copy) @@ -39,7 +39,9 @@ public void init(ByteBlockPool pool, int startIndex, int endIndex) { - assert endIndex-startIndex > 0; + assert endIndex-startIndex >= 0; + assert startIndex >= 0; + assert endIndex >= 0; this.pool = pool; this.endIndex = endIndex; @@ -59,9 +61,14 @@ limit = upto+firstSize-4; } + public boolean eof() { + assert upto + bufferOffset <= endIndex; + return upto + bufferOffset == endIndex; + } + public byte readByte() { - // Assert that we are not @ EOF - assert upto + bufferOffset < endIndex; + assert !eof(); + assert upto <= limit; if (upto == limit) nextSlice(); return buffer[upto++]; Index: src/java/org/apache/lucene/index/TermsHashPerField.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashPerField.java (revision 0) +++ src/java/org/apache/lucene/index/TermsHashPerField.java (revision 0) @@ -0,0 +1,498 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.util.UnicodeUtil; + +class TermsHashPerField extends InvertedDocConsumerPerField { + + final TermsHashConsumerPerField consumer; + final TermsHashPerField nextPerField; + final TermsHashPerThread perThread; + + final int streamCount; + final int numPostingInt; + + final FieldInfo fieldInfo; + + boolean postingsCompacted; + int position; + int numPostings; + int postingsHashSize = 4; + int postingsHashHalfSize = postingsHashSize/2; + int postingsHashMask = postingsHashSize-1; + RawPostingList[] postingsHash = new RawPostingList[postingsHashSize]; + RawPostingList p; + + public TermsHashPerField(final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) { + this.perThread = perThread; + streamCount = perThread.termsHash.streamCount; + numPostingInt = 2*streamCount; + this.consumer = perThread.consumer.addField(this, fieldInfo); + this.fieldInfo = fieldInfo; + if (nextPerThread != null) + nextPerField = (TermsHashPerField) nextPerThread.addField(fieldInfo); + else + nextPerField = null; + } + + public void reset() { + if (!postingsCompacted) + compactPostings(); + assert numPostings <= postingsHash.length; + perThread.termsHash.recyclePostings(postingsHash, numPostings); + Arrays.fill(postingsHash, 0, postingsHash.length, null); + postingsCompacted = false; + numPostings = 0; + if (nextPerField != null) + nextPerField.reset(); + } + + public void abort() { + reset(); + if (nextPerField != null) + nextPerField.abort(); + } + + public void initReader(ByteSliceReader reader, RawPostingList p, int stream) { + assert stream < streamCount; + final int[] ints = perThread.intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT]; + final int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK; + reader.init(perThread.bytePool, + p.byteStart+stream*ByteBlockPool.FIRST_LEVEL_SIZE, + ints[upto+stream]); + } + + private void compactPostings() { + int upto = 0; + for(int i=0;i= hi) + return; + else if (hi == 1+lo) { + if (comparePostings(postings[lo], postings[hi]) > 0) { + final RawPostingList tmp = postings[lo]; + postings[lo] = postings[hi]; + postings[hi] = tmp; + } + return; + } + + int mid = (lo + hi) >>> 1; + + if (comparePostings(postings[lo], postings[mid]) > 0) { + RawPostingList tmp = postings[lo]; + postings[lo] = postings[mid]; + postings[mid] = tmp; + } + + if (comparePostings(postings[mid], postings[hi]) > 0) { + RawPostingList tmp = postings[mid]; + postings[mid] = postings[hi]; + postings[hi] = tmp; + + if (comparePostings(postings[lo], postings[mid]) > 0) { + RawPostingList tmp2 = postings[lo]; + postings[lo] = postings[mid]; + postings[mid] = tmp2; + } + } + + int left = lo + 1; + int right = hi - 1; + + if (left >= right) + return; + + RawPostingList partition = postings[mid]; + + for (; ;) { + while (comparePostings(postings[right], partition) > 0) + --right; + + while (left < right && comparePostings(postings[left], partition) <= 0) + ++left; + + if (left < right) { + RawPostingList tmp = postings[left]; + postings[left] = postings[right]; + postings[right] = tmp; + --right; + } else { + break; + } + } + + quickSort(postings, lo, left); + quickSort(postings, left + 1, hi); + } + + /** Compares term text for two Posting instance and + * returns -1 if p1 < p2; 1 if p1 > p2; else 0. 
*/ + int comparePostings(RawPostingList p1, RawPostingList p2) { + + // nocommit -- understand why we ever see p1 == p1 + if (p1 == p2) + return 0; + + final char[] text1 = perThread.charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; + int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK; + final char[] text2 = perThread.charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; + int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK; + + assert text1 != text2 || pos1 != pos2; + + while(true) { + final char c1 = text1[pos1++]; + final char c2 = text2[pos2++]; + if (c1 != c2) { + if (0xffff == c2) + return 1; + else if (0xffff == c1) + return -1; + else + return c1-c2; + } else + // This method should never compare equal postings + // unless p1==p2 + assert c1 != 0xffff; + } + } + + /** Test whether the text for current RawPostingList p equals + * current tokenText. */ + private boolean postingEquals(final char[] tokenText, final int tokenTextLen) { + + final char[] text = perThread.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; + assert text != null; + int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK; + + int tokenPos = 0; + for(;tokenPos>8)+code)|1; + do { + code += inc; + hashPos = code & postingsHashMask; + p = postingsHash[hashPos]; + } while (p != null && p.textStart != textStart); + } + + addToken(t, p, hashPos, textStart); + } + + // Primary entry point (for first TermsHash) + void add(Token token) throws IOException { + + assert !postingsCompacted; + + // We are first in the chain so we must "intern" the + // term text into textStart address + + // Get the text of this term. + final char[] tokenText = token.termBuffer(); + final int tokenTextLen = token.termLength(); + + // Compute hashcode & replace any invalid UTF16 sequences + int downto = tokenTextLen; + int code = 0; + while (downto > 0) { + char ch = tokenText[--downto]; + + if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) { + if (0 == downto) { + // Unpaired + ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; + } else { + final char ch2 = tokenText[downto-1]; + if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) { + // OK: high followed by low. This is a valid + // surrogate pair. + code = ((code*31) + ch)*31+ch2; + downto--; + continue; + } else { + // Unpaired + ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; + } + } + } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END) + // Unpaired + ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; + + code = (code*31) + ch; + } + + int hashPos = code & postingsHashMask; + + // Locate RawPostingList in hash + p = postingsHash[hashPos]; + + if (p != null && !postingEquals(tokenText, tokenTextLen)) { + // Conflict: keep searching different locations in + // the hash table. + final int inc = ((code>>8)+code)|1; + do { + code += inc; + hashPos = code & postingsHashMask; + p = postingsHash[hashPos]; + } while (p != null && !postingEquals(tokenText, tokenTextLen)); + } + + addToken(token, p, hashPos, -1); + } + + void addToken(Token token, RawPostingList p, int hashPos, int textStart) throws IOException { + + final CharBlockPool charPool = perThread.charPool; + + if (p == null) { + + // Get the text of this term. + final char[] tokenText = token.termBuffer(); + final int tokenTextLen = token.termLength(); + + // First time we are seeing this token since we last + // flushed the hash. 
+ final int textLen1 = 1+tokenTextLen; + if (perThread.primary) { + if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) { + if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) { + // Just skip this term, to remain as robust as + // possible during indexing. A TokenFilter + // can be inserted into the analyzer chain if + // other behavior is wanted (pruning the term + // to a prefix, throwing an exception, etc). + + if (perThread.maxTermPrefix == null) + perThread.maxTermPrefix = new String(tokenText, 0, 30); + + consumer.skippingLongTerm(token); + return; + } + charPool.nextBuffer(); + } + } + + // Refill? + if (0 == perThread.freePostingsCount) + perThread.morePostings(); + + // Pull next free RawPostingList from free list + p = perThread.freePostings[--perThread.freePostingsCount]; + assert p != null; + + if (perThread.primary) { + final char[] text = charPool.buffer; + final int textUpto = charPool.charUpto; + p.textStart = textUpto + charPool.charOffset; + charPool.charUpto += textLen1; + System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen); + text[textUpto+tokenTextLen] = 0xffff; + } else + p.textStart = textStart; + + assert postingsHash[hashPos] == null; + postingsHash[hashPos] = p; + numPostings++; + + if (numPostings == postingsHashHalfSize) + rehashPostings(2*postingsHashSize); + + // Init stream slices + if (numPostingInt + perThread.intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) + perThread.intPool.nextBuffer(); + + if (DocumentsWriter.BYTE_BLOCK_SIZE - perThread.bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) + perThread.bytePool.nextBuffer(); + + intUptos = perThread.intPool.buffer; + intUptoStart = perThread.intPool.intUpto; + perThread.intPool.intUpto += streamCount; + + p.intStart = intUptoStart + perThread.intPool.intOffset; + + for(int i=0;i> DocumentsWriter.INT_BLOCK_SHIFT]; + intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK; + try { + consumer.addTerm(token, p); + } catch (Throwable t) { + throw new AbortException(t, perThread.termsHash.docWriter); + } + } + + if (nextPerField != null && doNextCall) + nextPerField.add(token, p.textStart); + + position++; + } + + int[] intUptos; + int intUptoStart; + + void writeByte(int stream, byte b) { + int upto = intUptos[intUptoStart+stream]; + byte[] bytes = perThread.bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT]; + assert bytes != null; + int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK; + if (bytes[offset] != 0) { + // End of slice; allocate a new one + offset = perThread.bytePool.allocSlice(bytes, offset); + bytes = perThread.bytePool.buffer; + intUptos[intUptoStart+stream] = offset + perThread.bytePool.byteOffset; + } + bytes[offset] = b; + (intUptos[intUptoStart+stream])++; + } + + public void writeBytes(int stream, byte[] b, int offset, int len) { + // TODO: optimize + final int end = offset + len; + for(int i=offset;i>>= 7; + } + writeByte(stream, (byte) i); + } + + void finish(DocumentsWriter.DocState docState) throws IOException { + consumer.finish(docState); + if (nextPerField != null) + nextPerField.finish(docState); + } + + /** Called when postings hash is too small (> 50% + * occupied) or too large (< 20% occupied). 
*/ + void rehashPostings(final int newSize) { + + final int newMask = newSize-1; + + RawPostingList[] newHash = new RawPostingList[newSize]; + for(int i=0;i> DocumentsWriter.CHAR_BLOCK_SHIFT]; + int pos = start; + while(text[pos] != 0xffff) + pos++; + code = 0; + while (pos > start) + code = (code*31) + text[--pos]; + } else + code = p0.textStart; + + int hashPos = code & newMask; + assert hashPos >= 0; + if (newHash[hashPos] != null) { + final int inc = ((code>>8)+code)|1; + do { + code += inc; + hashPos = code & newMask; + } while (newHash[hashPos] != null); + } + newHash[hashPos] = p0; + } + } + + postingsHashMask = newMask; + postingsHash = newHash; + postingsHashSize = newSize; + postingsHashHalfSize = newSize >> 1; + } +} Property changes on: src/java/org/apache/lucene/index/TermsHashPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocFieldProcessor.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldProcessor.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldProcessor.java (revision 0) @@ -0,0 +1,77 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; +import java.util.Map; +import java.util.HashMap; +import java.util.Iterator; + +/** + * This is a DocConsumer that gathers all fields under the + * same name, and calls per-field consumers to process field + * by field. This class doesn't doesn't do any "real" work + * of its own: it just forwards the fields to a + * DocFieldConsumer. 
+ */ + +class DocFieldProcessor extends DocConsumer { + + final DocumentsWriter docWriter; + final FieldInfos fieldInfos; + final DocFieldConsumer consumer; + + public DocFieldProcessor(DocumentsWriter docWriter, DocFieldConsumer consumer) { + this.docWriter = docWriter; + docWriter.fieldInfos = this.fieldInfos = new FieldInfos(); + this.consumer = consumer; + } + + public void closeDocStore(DocumentsWriter.FlushState state) throws IOException { + consumer.closeDocStore(state); + } + + public void flush(Collection threads, DocumentsWriter.FlushState state) throws IOException { + fieldInfos.write(state.directory, state.segmentName + ".fnm"); + // nocommit -- not clean + state.fieldInfos = fieldInfos; + + Map childThreadsAndFields = new HashMap(); + Iterator it = threads.iterator(); + while(it.hasNext()) { + DocFieldProcessorPerThread perThread = (DocFieldProcessorPerThread) it.next(); + childThreadsAndFields.put(perThread.consumer, perThread.fields()); + perThread.trimFields(state); + } + + consumer.flush(childThreadsAndFields, state); + } + + public void abort() { + consumer.abort(); + } + + public boolean freeRAM() { + return consumer.freeRAM(); + } + + public DocConsumerPerThread addThread() throws IOException { + return new DocFieldProcessorPerThread(this); + } +} Property changes on: src/java/org/apache/lucene/index/DocFieldProcessor.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 0) @@ -0,0 +1,353 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.UnicodeUtil; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Map; +import java.util.ArrayList; +import java.util.List; +import java.util.Iterator; + +// TODO: break into separate freq and prox writers +class FreqProxTermsWriter extends TermsHashConsumer { + + public FreqProxTermsWriter() { + } + + public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) { + return new FreqProxTermsWriterPerThread(perThread); + } + + void createPostings(RawPostingList[] postings, int start, int count) { + final int end = start + count; + for(int i=start;i 0) + allFields.add(perField); + } + } + + // Sort by field name + Collections.sort(allFields); + final int numAllFields = allFields.size(); + + final TermInfosWriter termsOut = new TermInfosWriter(state.directory, + state.segmentName, + state.fieldInfos, + state.docWriter.writer.getTermIndexInterval()); + final IndexOutput freqOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.FREQ_EXTENSION)); + final IndexOutput proxOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.PROX_EXTENSION)); + + final DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, + termsOut.maxSkipLevels, + state.numDocsInRAM, freqOut, proxOut); + + int start = 0; + while(start < numAllFields) { + final String fieldName = ((FreqProxTermsWriterPerField) allFields.get(start)).fieldInfo.name; + + int end = start+1; + while(end < numAllFields && ((FreqProxTermsWriterPerField) allFields.get(end)).fieldInfo.name.equals(fieldName)) + end++; + + FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start]; + for(int i=start;i + // IndexOutput + while(numBytes > 0) { + final int chunk; + if (numBytes > 4096) + chunk = 4096; + else + chunk = (int) numBytes; + srcIn.readBytes(copyByteBuffer, 0, chunk); + destIn.writeBytes(copyByteBuffer, 0, chunk); + numBytes -= chunk; + } + } + + /* Walk through all unique text tokens (Posting + * instances) found in this field and serialize them + * into a single RAM segment. */ + void appendPostings(final DocumentsWriter.FlushState flushState, + FreqProxTermsWriterPerField[] fields, + TermInfosWriter termsOut, + IndexOutput freqOut, + IndexOutput proxOut, + DefaultSkipListWriter skipListWriter) + throws CorruptIndexException, IOException { + + final int fieldNumber = fields[0].fieldInfo.number; + int numFields = fields.length; + + final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields]; + + for(int i=0;i 0) { + + // Get the next term to merge + termStates[0] = mergeStates[0]; + int numToMerge = 1; + + for(int i=1;i 0) { + + if ((++df % skipInterval) == 0) { + skipListWriter.setSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength); + skipListWriter.bufferSkip(df); + } + + FreqProxFieldMergeState minState = termStates[0]; + for(int i=1;i lastDoc || df == 1; + + final int newDocCode = (doc-lastDoc)<<1; + + lastDoc = doc; + + final ByteSliceReader prox = minState.prox; + + // Carefully copy over the prox + payload info, + // changing the format to match Lucene's segment + // format. 
+ for(int j=0;j 0) + copyBytes(prox, proxOut, payloadLength); + } else { + assert 0 == (code & 1); + proxOut.writeVInt(code>>1); + } + } + + if (1 == termDocFreq) { + freqOut.writeVInt(newDocCode|1); + } else { + freqOut.writeVInt(newDocCode); + freqOut.writeVInt(termDocFreq); + } + + if (!minState.nextDoc()) { + + // Remove from termStates + int upto = 0; + for(int i=0;i 0; + + // Done merging this term + + long skipPointer = skipListWriter.writeSkip(freqOut); + + // Write term + termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer)); + + // TODO: we could do this incrementally + UnicodeUtil.UTF16toUTF8(text, start, termsUTF8); + + // TODO: we could save O(n) re-scan of the term by + // computing the shared prefix with the last term + // while during the UTF8 encoding + termsOut.add(fieldNumber, + termsUTF8.result, + termsUTF8.length, + termInfo); + } + } + + private final TermInfo termInfo = new TermInfo(); // minimize consing + + final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result(); + + void files(Collection files) {} + + static final class PostingList extends RawPostingList { + int docFreq; // # times this term occurs in the current doc + int lastDocID; // Last docID where this term occurred + int lastDocCode; // Code for prior doc + int lastPosition; // Last position where this term occurred + } + + int bytesPerPosting() { + return RawPostingList.BYTES_SIZE + 4 * DocumentsWriter.INT_NUM_BYTE; + } +} Property changes on: src/java/org/apache/lucene/index/FreqProxTermsWriter.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/InvertedDocConsumer.java =================================================================== --- src/java/org/apache/lucene/index/InvertedDocConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/InvertedDocConsumer.java (revision 0) @@ -0,0 +1,29 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Map; +import java.io.IOException; + +abstract class InvertedDocConsumer { + abstract InvertedDocConsumerPerThread addThread(); + abstract void abort(); + abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException; + abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException; + abstract boolean freeRAM(); +} Property changes on: src/java/org/apache/lucene/index/InvertedDocConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/CharBlockPool.java =================================================================== --- src/java/org/apache/lucene/index/CharBlockPool.java (revision 664673) +++ src/java/org/apache/lucene/index/CharBlockPool.java (working copy) @@ -23,11 +23,11 @@ int numBuffer; int bufferUpto = -1; // Which buffer we are upto - public int byteUpto = DocumentsWriter.CHAR_BLOCK_SIZE; // Where we are in head buffer + public int charUpto = DocumentsWriter.CHAR_BLOCK_SIZE; // Where we are in head buffer public char[] buffer; // Current head buffer - public int byteOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; // Current head offset - private DocumentsWriter docWriter; + public int charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; // Current head offset + final private DocumentsWriter docWriter; public CharBlockPool(DocumentsWriter docWriter) { this.docWriter = docWriter; @@ -36,8 +36,8 @@ public void reset() { docWriter.recycleCharBlocks(buffers, 1+bufferUpto); bufferUpto = -1; - byteUpto = DocumentsWriter.CHAR_BLOCK_SIZE; - byteOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; + charUpto = DocumentsWriter.CHAR_BLOCK_SIZE; + charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; } public void nextBuffer() { @@ -49,8 +49,8 @@ buffer = buffers[1+bufferUpto] = docWriter.getCharBlock(); bufferUpto++; - byteUpto = 0; - byteOffset += DocumentsWriter.CHAR_BLOCK_SIZE; + charUpto = 0; + charOffset += DocumentsWriter.CHAR_BLOCK_SIZE; } } Index: src/java/org/apache/lucene/index/DocFieldProcessorPerField.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldProcessorPerField.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldProcessorPerField.java (revision 0) @@ -0,0 +1,45 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.Fieldable; + +/** + * Holds all per thread, per field state. 
+ */ + +class DocFieldProcessorPerField { + + final DocFieldConsumerPerField consumer; + final FieldInfo fieldInfo; + + DocFieldProcessorPerField next; + int lastGen = -1; + + int fieldCount; + Fieldable[] fields = new Fieldable[1]; + + public DocFieldProcessorPerField(final DocFieldProcessorPerThread perThread, final FieldInfo fieldInfo) { + this.consumer = perThread.consumer.addField(fieldInfo); + this.fieldInfo = fieldInfo; + } + + public void abort() { + consumer.abort(); + } +} Property changes on: src/java/org/apache/lucene/index/DocFieldProcessorPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocFieldConsumer.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldConsumer.java (revision 0) @@ -0,0 +1,29 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Map; + +abstract class DocFieldConsumer { + abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException; + abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException; + abstract void abort(); + abstract DocFieldConsumerPerThread addThread() throws IOException; + abstract boolean freeRAM(); +} Property changes on: src/java/org/apache/lucene/index/DocFieldConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 0) +++ src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 0) @@ -0,0 +1,117 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.analysis.Token; + +// TODO: break into separate freq and prox writers as +// codecs; make separate container (tii/tis/skip/*) that can +// be configured as any number of files 1..N +class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implements Comparable { + + final FreqProxTermsWriterPerThread perThread; + final TermsHashPerField termsHashPerField; + final FieldInfo fieldInfo; + DocumentsWriter.DocState docState; + + public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) { + this.termsHashPerField = termsHashPerField; + this.perThread = perThread; + this.fieldInfo = fieldInfo; + } + + void finish(DocumentsWriter.DocState docState) {} + + boolean doNext; + + void skippingLongTerm(Token t) throws IOException {} + + public int compareTo(Object other0) { + FreqProxTermsWriterPerField other = (FreqProxTermsWriterPerField) other0; + return fieldInfo.name.compareTo(other.fieldInfo.name); + } + + boolean start(Fieldable[] fields, int count, DocumentsWriter.DocState docState) { + this.docState = docState; + for(int i=0;i 0) { + termsHashPerField.writeVInt(1, (proxCode<<1)|1); + termsHashPerField.writeVInt(1, payload.length); + termsHashPerField.writeBytes(1, payload.data, payload.offset, payload.length); + // nocommit -- not thread safe (JMM)? + fieldInfo.storePayloads = true; + } else + termsHashPerField.writeVInt(1, proxCode<<1); + p.lastPosition = docState.position; + } + + final void newTerm(Token t, RawPostingList p0) { + // First time we're seeing this term since the last + // flush + FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0; + p.lastDocCode = docState.docID << 1; + p.lastDocID = docState.docID; + p.docFreq = 1; + writeProx(t, p, docState.position); + } + + final void addTerm(Token t, RawPostingList p0) { + + FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0; + + assert p.docFreq > 0; + + if (docState.docID != p.lastDocID) { + // Term not yet seen in the current doc but previously + // seen in other doc(s) since the last flush + + // Now that we know doc freq for previous doc, + // write it & lastDocCode + if (1 == p.docFreq) + termsHashPerField.writeVInt(0, p.lastDocCode|1); + else { + termsHashPerField.writeVInt(0, p.lastDocCode); + termsHashPerField.writeVInt(0, p.docFreq); + } + p.docFreq = 1; + p.lastDocCode = (docState.docID - p.lastDocID) << 1; + p.lastDocID = docState.docID; + writeProx(t, p, docState.position); + } else { + p.docFreq++; + writeProx(t, p, docState.position-p.lastPosition); + } + } + + public void abort() {} +} + Property changes on: src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocumentsWriterFieldData.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriterFieldData.java (revision 664673) +++ src/java/org/apache/lucene/index/DocumentsWriterFieldData.java (working copy) @@ -1,682 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.document.Fieldable; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.UnicodeUtil; -import java.io.IOException; -import java.io.Reader; -import java.util.Arrays; - -/** Used by DocumentsWriter to hold data associated with a - * single field in a single ThreadState, including the - * Postings hash. A document may have many occurrences for - * a given field name; we gather all such occurrences here - * (in docFields) so that we can process the entire field - * at once. */ - -final class DocumentsWriterFieldData implements Comparable { - - final DocumentsWriterThreadState threadState; - FieldInfo fieldInfo; - - int fieldCount; - Fieldable[] docFields = new Fieldable[1]; - - int lastGen = -1; - DocumentsWriterFieldData next; - - boolean doNorms; - boolean doVectors; - boolean doVectorPositions; - boolean doVectorOffsets; - boolean postingsCompacted; - - int numPostings; - - Posting[] postingsHash; - int postingsHashSize; - int postingsHashHalfSize; - int postingsHashMask; - - int position; - int length; - int offset; - float boost; - int postingsVectorsUpto; - - final ByteSliceWriter sliceWriter; - final ByteSliceWriter vectorsSliceWriter; - - public DocumentsWriterFieldData(DocumentsWriterThreadState threadState, FieldInfo fieldInfo) { - this.fieldInfo = fieldInfo; - this.threadState = threadState; - sliceWriter = new ByteSliceWriter(threadState.postingsPool); - vectorsSliceWriter = new ByteSliceWriter(threadState.vectorsPool); - } - - void resetPostingArrays() { - if (!postingsCompacted) - compactPostings(); - threadState.docWriter.recyclePostings(this.postingsHash, numPostings); - Arrays.fill(postingsHash, 0, postingsHash.length, null); - postingsCompacted = false; - numPostings = 0; - } - - void initPostingArrays() { - // Target hash fill factor of <= 50% - // NOTE: must be a power of two for hash collision - // strategy to work correctly - postingsHashSize = 4; - postingsHashHalfSize = 2; - postingsHashMask = postingsHashSize-1; - postingsHash = new Posting[postingsHashSize]; - } - - public int compareTo(Object o) { - return fieldInfo.name.compareTo(((DocumentsWriterFieldData) o).fieldInfo.name); - } - - private void compactPostings() { - int upto = 0; - for(int i=0;i 0) { - try { - if (doWriteVectors) { - // Add term vectors for this field - boolean success = false; - try { - writeVectors(fieldInfo); - success = true; - } finally { - if (!success) { - // If we hit an exception inside - // writeVectors, the contents of tvfLocal - // can be corrupt, so we must discard all - // term vectors for this document: - threadState.numVectorFields = 0; - threadState.tvfLocal.reset(); - } - } - } - } finally { - if (postingsVectorsUpto > threadState.maxPostingsVectors) - threadState.maxPostingsVectors = postingsVectorsUpto; - 
postingsVectorsUpto = 0; - threadState.vectorsPool.reset(); - } - } - } - } - - int offsetEnd; - Token localToken = new Token(); - - /* Invert one occurrence of one field in the document */ - public void invertField(Fieldable field, Analyzer analyzer, final int maxFieldLength) throws IOException, AbortException { - - if (length>0) - position += analyzer.getPositionIncrementGap(fieldInfo.name); - - if (!field.isTokenized()) { // un-tokenized field - String stringValue = field.stringValue(); - final int valueLength = stringValue.length(); - Token token = localToken; - token.clear(); - char[] termBuffer = token.termBuffer(); - if (termBuffer.length < valueLength) - termBuffer = token.resizeTermBuffer(valueLength); - stringValue.getChars(0, valueLength, termBuffer, 0); - token.setTermLength(valueLength); - token.setStartOffset(offset); - token.setEndOffset(offset + stringValue.length()); - addPosition(token); - offset += stringValue.length(); - length++; - } else { // tokenized field - final TokenStream stream; - final TokenStream streamValue = field.tokenStreamValue(); - - if (streamValue != null) - stream = streamValue; - else { - // the field does not have a TokenStream, - // so we have to obtain one from the analyzer - final Reader reader; // find or make Reader - final Reader readerValue = field.readerValue(); - - if (readerValue != null) - reader = readerValue; - else { - String stringValue = field.stringValue(); - if (stringValue == null) - throw new IllegalArgumentException("field must have either TokenStream, String or Reader value"); - threadState.stringReader.init(stringValue); - reader = threadState.stringReader; - } - - // Tokenize field and add to postingTable - stream = analyzer.reusableTokenStream(fieldInfo.name, reader); - } - - // reset the TokenStream to the first token - stream.reset(); - - try { - offsetEnd = offset-1; - for(;;) { - Token token = stream.next(localToken); - if (token == null) break; - position += (token.getPositionIncrement() - 1); - addPosition(token); - if (++length >= maxFieldLength) { - if (threadState.docWriter.infoStream != null) - threadState.docWriter.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens"); - break; - } - } - offset = offsetEnd+1; - } finally { - stream.close(); - } - } - - boost *= field.getBoost(); - } - - /** Only called when term vectors are enabled. This - * is called the first time we see a given term for - * each document, to allocate a PostingVector - * instance that is used to record data needed to - * write the posting vectors. 
*/ - private PostingVector addNewVector() { - - if (postingsVectorsUpto == threadState.postingsVectors.length) { - final int newSize; - if (threadState.postingsVectors.length < 2) - newSize = 2; - else - newSize = (int) (1.5*threadState.postingsVectors.length); - PostingVector[] newArray = new PostingVector[newSize]; - System.arraycopy(threadState.postingsVectors, 0, newArray, 0, threadState.postingsVectors.length); - threadState.postingsVectors = newArray; - } - - p.vector = threadState.postingsVectors[postingsVectorsUpto]; - if (p.vector == null) - p.vector = threadState.postingsVectors[postingsVectorsUpto] = new PostingVector(); - - postingsVectorsUpto++; - - final PostingVector v = p.vector; - v.p = p; - - if (doVectorPositions) { - final int upto = threadState.vectorsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE); - v.posStart = v.posUpto = threadState.vectorsPool.byteOffset + upto; - } - - if (doVectorOffsets) { - final int upto = threadState.vectorsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE); - v.offsetStart = v.offsetUpto = threadState.vectorsPool.byteOffset + upto; - } - - return v; - } - - int offsetStartCode; - int offsetStart; - - // Current posting we are working on - Posting p; - PostingVector vector; - - /** Test whether the text for current Posting p equals - * current tokenText. */ - boolean postingEquals(final char[] tokenText, final int tokenTextLen) { - - final char[] text = threadState.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - assert text != null; - int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK; - - int tokenPos = 0; - for(;tokenPos 0) { - char ch = tokenText[--downto]; - - if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) { - if (0 == downto) { - // Unpaired - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - } else { - final char ch2 = tokenText[downto-1]; - if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) { - // OK: high followed by low. This is a valid - // surrogate pair. - code = ((code*31) + ch)*31+ch2; - downto--; - continue; - } else { - // Unpaired - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - } - } - } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END) - // Unpaired - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - - code = (code*31) + ch; - } - - // System.out.println(" addPosition: field=" + fieldInfo.name + " buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets); - - int hashPos = code & postingsHashMask; - - assert !postingsCompacted; - - // Locate Posting in hash - p = postingsHash[hashPos]; - - if (p != null && !postingEquals(tokenText, tokenTextLen)) { - // Conflict: keep searching different locations in - // the hash table. 
- final int inc = ((code>>8)+code)|1; - do { - code += inc; - hashPos = code & postingsHashMask; - p = postingsHash[hashPos]; - } while (p != null && !postingEquals(tokenText, tokenTextLen)); - } - - final int proxCode; - - // If we hit an exception below, it's possible the - // posting list or term vectors data will be - // partially written and thus inconsistent if - // flushed, so we have to abort all documents - // since the last flush: - - try { - - if (p != null) { // term seen since last flush - - if (threadState.docID != p.lastDocID) { // term not yet seen in this doc - - // System.out.println(" seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto); - - assert p.docFreq > 0; - - // Now that we know doc freq for previous doc, - // write it & lastDocCode - sliceWriter.init(p.freqUpto); - - if (1 == p.docFreq) - sliceWriter.writeVInt(p.lastDocCode|1); - else { - sliceWriter.writeVInt(p.lastDocCode); - sliceWriter.writeVInt(p.docFreq); - } - p.freqUpto = sliceWriter.getAddress(); - - if (doVectors) { - vector = addNewVector(); - if (doVectorOffsets) { - offsetStartCode = offsetStart = offset + token.startOffset(); - offsetEnd = offset + token.endOffset(); - } - } - - proxCode = position; - - p.docFreq = 1; - - // Store code so we can write this after we're - // done with this new doc - p.lastDocCode = (threadState.docID-p.lastDocID) << 1; - p.lastDocID = threadState.docID; - - } else { // term already seen in this doc - // System.out.println(" seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto); - p.docFreq++; - - proxCode = position-p.lastPosition; - - if (doVectors) { - vector = p.vector; - if (vector == null) - vector = addNewVector(); - if (doVectorOffsets) { - offsetStart = offset + token.startOffset(); - offsetEnd = offset + token.endOffset(); - offsetStartCode = offsetStart-vector.lastOffset; - } - } - } - } else { // term not seen before - // System.out.println(" never seen docID=" + docID); - - // Refill? - if (0 == threadState.postingsFreeCount) { - threadState.docWriter.getPostings(threadState.postingsFreeList); - threadState.postingsFreeCount = threadState.postingsFreeList.length; - } - - final int textLen1 = 1+tokenTextLen; - if (textLen1 + threadState.charPool.byteUpto > DocumentsWriter.CHAR_BLOCK_SIZE) { - if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) { - // Just skip this term, to remain as robust as - // possible during indexing. A TokenFilter - // can be inserted into the analyzer chain if - // other behavior is wanted (pruning the term - // to a prefix, throwing an exception, etc). 
- if (threadState.maxTermPrefix == null) - threadState.maxTermPrefix = new String(tokenText, 0, 30); - - // Still increment position: - position++; - return; - } - threadState.charPool.nextBuffer(); - } - - final char[] text = threadState.charPool.buffer; - final int textUpto = threadState.charPool.byteUpto; - - // Pull next free Posting from free list - p = threadState.postingsFreeList[--threadState.postingsFreeCount]; - - p.textStart = textUpto + threadState.charPool.byteOffset; - threadState.charPool.byteUpto += textLen1; - - System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen); - - text[textUpto+tokenTextLen] = 0xffff; - - assert postingsHash[hashPos] == null; - - postingsHash[hashPos] = p; - numPostings++; - - if (numPostings == postingsHashHalfSize) - rehashPostings(2*postingsHashSize); - - // Init first slice for freq & prox streams - final int upto1 = threadState.postingsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE); - p.freqStart = p.freqUpto = threadState.postingsPool.byteOffset + upto1; - - final int upto2 = threadState.postingsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE); - p.proxStart = p.proxUpto = threadState.postingsPool.byteOffset + upto2; - - p.lastDocCode = threadState.docID << 1; - p.lastDocID = threadState.docID; - p.docFreq = 1; - - if (doVectors) { - vector = addNewVector(); - if (doVectorOffsets) { - offsetStart = offsetStartCode = offset + token.startOffset(); - offsetEnd = offset + token.endOffset(); - } - } - - proxCode = position; - } - - sliceWriter.init(p.proxUpto); - - if (payload != null && payload.length > 0) { - sliceWriter.writeVInt((proxCode<<1)|1); - sliceWriter.writeVInt(payload.length); - sliceWriter.writeBytes(payload.data, payload.offset, payload.length); - fieldInfo.storePayloads = true; - } else - sliceWriter.writeVInt(proxCode<<1); - - p.proxUpto = sliceWriter.getAddress(); - p.lastPosition = position++; - - if (doVectorPositions) { - vectorsSliceWriter.init(vector.posUpto); - vectorsSliceWriter.writeVInt(proxCode); - vector.posUpto = vectorsSliceWriter.getAddress(); - } - - if (doVectorOffsets) { - vectorsSliceWriter.init(vector.offsetUpto); - vectorsSliceWriter.writeVInt(offsetStartCode); - vectorsSliceWriter.writeVInt(offsetEnd-offsetStart); - vector.lastOffset = offsetEnd; - vector.offsetUpto = vectorsSliceWriter.getAddress(); - } - } catch (Throwable t) { - throw new AbortException(t, threadState.docWriter); - } - } - - /** Called when postings hash is too small (> 50% - * occupied) or too large (< 20% occupied). */ - void rehashPostings(final int newSize) { - - final int newMask = newSize-1; - - Posting[] newHash = new Posting[newSize]; - for(int i=0;i> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos = start; - while(text[pos] != 0xffff) - pos++; - int code = 0; - while (pos > start) - code = (code*31) + text[--pos]; - - int hashPos = code & newMask; - assert hashPos >= 0; - if (newHash[hashPos] != null) { - final int inc = ((code>>8)+code)|1; - do { - code += inc; - hashPos = code & newMask; - } while (newHash[hashPos] != null); - } - newHash[hashPos] = p0; - } - } - - postingsHashMask = newMask; - postingsHash = newHash; - postingsHashSize = newSize; - postingsHashHalfSize = newSize >> 1; - } - - final ByteSliceReader vectorSliceReader = new ByteSliceReader(); - - /** Called once per field per document if term vectors - * are enabled, to write the vectors to * - * RAMOutputStream, which is then quickly flushed to - * * the real term vectors files in the Directory. 
*/ - void writeVectors(FieldInfo fieldInfo) throws IOException { - - assert fieldInfo.storeTermVector; - assert threadState.vectorFieldsInOrder(fieldInfo); - - threadState.vectorFieldNumbers[threadState.numVectorFields] = fieldInfo.number; - threadState.vectorFieldPointers[threadState.numVectorFields] = threadState.tvfLocal.getFilePointer(); - threadState.numVectorFields++; - - final int numPostingsVectors = postingsVectorsUpto; - final PostingVector[] postingsVectors = threadState.postingsVectors; - - final IndexOutput tvfLocal = threadState.tvfLocal; - - threadState.tvfLocal.writeVInt(numPostingsVectors); - byte bits = 0x0; - if (doVectorPositions) - bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR; - if (doVectorOffsets) - bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; - threadState.tvfLocal.writeByte(bits); - - threadState.doVectorSort(postingsVectors, numPostingsVectors); - - int encoderUpto = 0; - int lastTermBytesCount = 0; - - final ByteSliceReader reader = vectorSliceReader; - final char[][] charBuffers = threadState.charPool.buffers; - - for(int j=0;j> DocumentsWriter.CHAR_BLOCK_SHIFT]; - final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK; - - // We swap between two encoders to save copying - // last Term's byte array - final UnicodeUtil.UTF8Result utf8Result = threadState.utf8Results[encoderUpto]; - - // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); - final int termBytesCount = utf8Result.length; - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute common prefix between last term and - // this term - int prefix = 0; - if (j > 0) { - final byte[] lastTermBytes = threadState.utf8Results[1-encoderUpto].result; - final byte[] termBytes = threadState.utf8Results[encoderUpto].result; - while(prefix < lastTermBytesCount && prefix < termBytesCount) { - if (lastTermBytes[prefix] != termBytes[prefix]) - break; - prefix++; - } - } - encoderUpto = 1-encoderUpto; - lastTermBytesCount = termBytesCount; - - final int suffix = termBytesCount - prefix; - tvfLocal.writeVInt(prefix); - tvfLocal.writeVInt(suffix); - tvfLocal.writeBytes(utf8Result.result, prefix, suffix); - tvfLocal.writeVInt(freq); - - if (doVectorPositions) { - reader.init(threadState.vectorsPool, vector.posStart, vector.posUpto); - reader.writeTo(tvfLocal); - } - - if (doVectorOffsets) { - reader.init(threadState.vectorsPool, vector.offsetStart, vector.offsetUpto); - reader.writeTo(tvfLocal); - } - } - } -} Index: src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java =================================================================== --- src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java (revision 0) +++ src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java (revision 0) @@ -0,0 +1,31 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.analysis.Token; +import java.io.IOException; + +abstract class InvertedDocConsumerPerField { + abstract boolean start(Fieldable[] fields, int count, DocumentsWriter.DocState docState) throws IOException; + abstract void start(Fieldable field, DocumentsWriter.DocState docState) throws IOException; + abstract void add(Token token) throws IOException; + abstract void end(Fieldable field, DocumentsWriter.DocState docState) throws IOException; + abstract void finish(DocumentsWriter.DocState docState) throws IOException; + abstract void abort(); +} Property changes on: src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocFieldConsumerPerField.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldConsumerPerField.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldConsumerPerField.java (revision 0) @@ -0,0 +1,27 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import org.apache.lucene.document.Fieldable; + +abstract class DocFieldConsumerPerField { + /** Processes all occurrences of a single field */ + abstract void processFields(DocumentsWriter.DocState docState, Fieldable[] fields, int count) throws IOException; + abstract void abort(); +} Property changes on: src/java/org/apache/lucene/index/DocFieldConsumerPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocFieldConsumersPerThread.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldConsumersPerThread.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldConsumersPerThread.java (revision 0) @@ -0,0 +1,69 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +class DocFieldConsumersPerThread extends DocFieldConsumerPerThread { + + final DocFieldConsumerPerThread one; + final DocFieldConsumerPerThread two; + final DocFieldConsumers parent; + + public DocFieldConsumersPerThread(DocFieldConsumers parent, DocFieldConsumerPerThread one, DocFieldConsumerPerThread two) { + this.parent = parent; + this.one = one; + this.two = two; + } + + public void startDocument(DocumentsWriter.DocState docState) throws IOException { + try { + one.startDocument(docState); + } finally { + two.startDocument(docState); + } + } + + public void abort() { + try { + one.abort(); + } finally { + two.abort(); + } + } + + public DocumentsWriter.DocWriter finishDocument(DocumentsWriter.DocState docState) throws IOException { + final DocumentsWriter.DocWriter oneDoc = one.finishDocument(docState); + final DocumentsWriter.DocWriter twoDoc = two.finishDocument(docState); + if (oneDoc == null) + return twoDoc; + else if (twoDoc == null) + return oneDoc; + else { + DocFieldConsumers.PerDoc both = parent.getPerDoc(); + both.docID = docState.docID; + both.one = oneDoc; + both.two = twoDoc; + return both; + } + } + + public DocFieldConsumerPerField addField(FieldInfo fi) { + return new DocFieldConsumersPerField(this, one.addField(fi), two.addField(fi)); + } +} Property changes on: src/java/org/apache/lucene/index/DocFieldConsumersPerThread.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (revision 0) @@ -0,0 +1,304 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.io.IOException; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Fieldable; + +/** + * Gathers all Fieldables for a document under the same + * name, updates FieldInfos, and calls per-field consumers + * to process field by field. + * + * Currently, only a single thread visits the fields, + * sequentially, for processing. + */ + +class DocFieldProcessorPerThread extends DocConsumerPerThread { + + float docBoost; + int fieldGen; + final DocFieldProcessor docFieldProcessor; + final FieldInfos fieldInfos; + final DocFieldConsumerPerThread consumer; + + // Holds all fields seen in current doc + DocFieldProcessorPerField[] fields = new DocFieldProcessorPerField[1]; + int fieldCount; + + // Hash table for all fields ever seen + DocFieldProcessorPerField[] fieldHash = new DocFieldProcessorPerField[2]; + int hashMask = 1; + int totalFieldCount; + + public DocFieldProcessorPerThread(DocFieldProcessor docFieldProcessor) throws IOException { + this.docFieldProcessor = docFieldProcessor; + this.fieldInfos = docFieldProcessor.fieldInfos; + this.consumer = docFieldProcessor.consumer.addThread(); + } + + public void abort() { + for(int i=0;i= fieldHash.length/2) { + final int newHashSize = (int) (fieldHash.length*2); + assert newHashSize > fieldHash.length; + + final DocFieldProcessorPerField newHashArray[] = new DocFieldProcessorPerField[newHashSize]; + + // Rehash + int newHashMask = newHashSize-1; + for(int j=0;j= hi) + return; + else if (hi == 1+lo) { + if (array[lo].fieldInfo.name.compareTo(array[hi].fieldInfo.name) > 0) { + final DocFieldProcessorPerField tmp = array[lo]; + array[lo] = array[hi]; + array[hi] = tmp; + } + return; + } + + int mid = (lo + hi) >>> 1; + + if (array[lo].fieldInfo.name.compareTo(array[mid].fieldInfo.name) > 0) { + DocFieldProcessorPerField tmp = array[lo]; + array[lo] = array[mid]; + array[mid] = tmp; + } + + if (array[mid].fieldInfo.name.compareTo(array[hi].fieldInfo.name) > 0) { + DocFieldProcessorPerField tmp = array[mid]; + array[mid] = array[hi]; + array[hi] = tmp; + + if (array[lo].fieldInfo.name.compareTo(array[mid].fieldInfo.name) > 0) { + DocFieldProcessorPerField tmp2 = array[lo]; + array[lo] = array[mid]; + array[mid] = tmp2; + } + } + + int left = lo + 1; + int right = hi - 1; + + if (left >= right) + return; + + DocFieldProcessorPerField partition = array[mid]; + + for (; ;) { + while (array[right].fieldInfo.name.compareTo(partition.fieldInfo.name) > 0) + --right; + + while (left < right && array[left].fieldInfo.name.compareTo(partition.fieldInfo.name) <= 0) + ++left; + + if (left < right) { + DocFieldProcessorPerField tmp = array[left]; + array[left] = array[right]; + array[right] = tmp; + --right; + } else { + break; + } + } + + quickSort(array, lo, left); + quickSort(array, left + 1, hi); + } + + DocumentsWriter.DocWriter finishDocument(DocumentsWriter.DocState docState) throws IOException { + return consumer.finishDocument(docState); + } +} Property changes on: src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocFieldConsumers.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldConsumers.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldConsumers.java (revision 0) @@ -0,0 +1,158 @@ +package 
org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashMap; +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; +import java.util.HashSet; +import java.io.IOException; + +import org.apache.lucene.util.ArrayUtil; + +/** This is just a "splitter" class: it lets you wrap two + * DocFieldConsumer instances into a single consumer. */ +class DocFieldConsumers extends DocFieldConsumer { + final DocFieldConsumer one; + final DocFieldConsumer two; + + public DocFieldConsumers(DocFieldConsumer one, DocFieldConsumer two) { + this.one = one; + this.two = two; + } + + public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException { + + Map oneThreadsAndFields = new HashMap(); + Map twoThreadsAndFields = new HashMap(); + + Iterator it = threadsAndFields.entrySet().iterator(); + while(it.hasNext()) { + + Map.Entry entry = (Map.Entry) it.next(); + + DocFieldConsumersPerThread perThread = (DocFieldConsumersPerThread) entry.getKey(); + + Collection fields = (Collection) entry.getValue(); + + Iterator fieldsIt = fields.iterator(); + Collection oneFields = new HashSet(); + Collection twoFields = new HashSet(); + while(fieldsIt.hasNext()) { + DocFieldConsumersPerField perField = (DocFieldConsumersPerField) fieldsIt.next(); + oneFields.add(perField.one); + twoFields.add(perField.two); + } + + oneThreadsAndFields.put(perThread.one, oneFields); + twoThreadsAndFields.put(perThread.two, twoFields); + } + + + try { + one.flush(oneThreadsAndFields, state); + } finally { + two.flush(twoThreadsAndFields, state); + } + } + + public void closeDocStore(DocumentsWriter.FlushState state) throws IOException { + try { + one.closeDocStore(state); + } finally { + two.closeDocStore(state); + } + } + + public void abort() { + try { + one.abort(); + } finally { + two.abort(); + } + } + + public boolean freeRAM() { + boolean any = one.freeRAM(); + any |= two.freeRAM(); + return any; + } + + public DocFieldConsumerPerThread addThread() throws IOException { + return new DocFieldConsumersPerThread(this, one.addThread(), two.addThread()); + } + + PerDoc[] docFreeList = new PerDoc[1]; + int freeCount; + + synchronized PerDoc getPerDoc() { + if (freeCount == 0) + return new PerDoc(); + else + return docFreeList[--freeCount]; + } + + synchronized void freePerDoc(PerDoc perDoc) { + // nocommit -- maybe grow freeList on allocation? 
+ // Recycle + if (freeCount == docFreeList.length) { + final PerDoc[] newArray = new PerDoc[ArrayUtil.getNextSize(docFreeList.length)]; + System.arraycopy(docFreeList, 0, newArray, 0, freeCount); + docFreeList = newArray; + } + docFreeList[freeCount++] = perDoc; + } + + class PerDoc extends DocumentsWriter.DocWriter { + + // TODO: use something more memory efficient; for small + // docs the 1024 buffer size of RAMOutputStream wastes + // alot + DocumentsWriter.DocWriter one; + DocumentsWriter.DocWriter two; + + public long sizeInBytes() { + return one.sizeInBytes() + two.sizeInBytes(); + } + + public void finish() throws IOException { + try { + try { + one.finish(); + } finally { + two.finish(); + } + } finally { + freePerDoc(this); + } + } + + public void abort() { + try { + try { + one.abort(); + } finally { + two.abort(); + } + } finally { + freePerDoc(this); + } + } + } +} Property changes on: src/java/org/apache/lucene/index/DocFieldConsumers.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/InvertedDocEndConsumer.java =================================================================== --- src/java/org/apache/lucene/index/InvertedDocEndConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/InvertedDocEndConsumer.java (revision 0) @@ -0,0 +1,28 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; +import java.io.IOException; + +abstract class InvertedDocEndConsumer { + abstract InvertedDocEndConsumerPerThread addThread(); + abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException; + abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException; + abstract void abort(); +} Property changes on: src/java/org/apache/lucene/index/InvertedDocEndConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermsHashConsumer.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/TermsHashConsumer.java (revision 0) @@ -0,0 +1,30 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Map; + +abstract class TermsHashConsumer { + abstract int bytesPerPosting(); + abstract void createPostings(RawPostingList[] postings, int start, int count); + abstract TermsHashConsumerPerThread addThread(TermsHashPerThread perThread); + abstract void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException; + abstract void abort(); + abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException; +} Property changes on: src/java/org/apache/lucene/index/TermsHashConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/Posting.java =================================================================== --- src/java/org/apache/lucene/index/Posting.java (revision 664673) +++ src/java/org/apache/lucene/index/Posting.java (working copy) @@ -1,36 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Used by DocumentsWriter to track postings for a single - * term. One of these exists per unique term seen since the - * last flush. If you alter this class you must also fix - * DocumentWriter.POSTING_NUM_BYTE to reflect the change as - * this is how RAM usage is measured. 
*/ -final class Posting { - int textStart; // Address into char[] blocks where our text is stored - int docFreq; // # times this term occurs in the current doc - int freqStart; // Address of first byte[] slice for freq - int freqUpto; // Next write address for freq - int proxStart; // Address of first byte[] slice - int proxUpto; // Next write address for prox - int lastDocID; // Last docID where this term occurred - int lastDocCode; // Code for prior doc - int lastPosition; // Last position where this term occurred - PostingVector vector; // Corresponding PostingVector instance -} Index: src/java/org/apache/lucene/index/DocFieldConsumersPerField.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldConsumersPerField.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldConsumersPerField.java (revision 0) @@ -0,0 +1,50 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import org.apache.lucene.document.Fieldable; + +class DocFieldConsumersPerField extends DocFieldConsumerPerField { + + final DocFieldConsumerPerField one; + final DocFieldConsumerPerField two; + final DocFieldConsumersPerThread perThread; + + public DocFieldConsumersPerField(DocFieldConsumersPerThread perThread, DocFieldConsumerPerField one, DocFieldConsumerPerField two) { + this.perThread = perThread; + this.one = one; + this.two = two; + } + + public void processFields(DocumentsWriter.DocState docState, Fieldable[] fields, int count) throws IOException { + try { + one.processFields(docState, fields, count); + } finally { + two.processFields(docState, fields, count); + } + } + + public void abort() { + try { + one.abort(); + } finally { + two.abort(); + } + } +} Property changes on: src/java/org/apache/lucene/index/DocFieldConsumersPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java =================================================================== --- src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java (revision 0) +++ src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java (revision 0) @@ -0,0 +1,65 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import org.apache.lucene.store.IndexOutput; + +class StoredFieldsWriterPerThread extends DocFieldConsumerPerThread { + + final FieldsWriter localFieldsWriter; + final StoredFieldsWriter storedFieldsWriter; + //FieldsWriter fieldsWriter; // local copy from StoredFieldsWriter + + StoredFieldsWriter.PerDoc doc; + + public StoredFieldsWriterPerThread(StoredFieldsWriter storedFieldsWriter) throws IOException { + this.storedFieldsWriter = storedFieldsWriter; + doc = storedFieldsWriter.getPerDoc(); + localFieldsWriter = new FieldsWriter((IndexOutput) null, (IndexOutput) null, storedFieldsWriter.docWriter.fieldInfos); + } + + public void startDocument(DocumentsWriter.DocState docState) { + doc.docID = docState.docID; + localFieldsWriter.setFieldsStream(doc.fdt); + assert doc.numStoredFields == 0; + assert 0 == doc.fdt.length(); + assert 0 == doc.fdt.getFilePointer(); + } + + public DocumentsWriter.DocWriter finishDocument(DocumentsWriter.DocState docState) { + // nocommit -- this isn't true: fix this to be like tvx: + // "catch up" for any holes + // We always must return a doc: even if there are no + // stored fields, the fdx/fdt files are created + try { + return doc; + } finally { + doc = storedFieldsWriter.getPerDoc(); + } + } + + public void abort() { + if (doc != null) + doc.reset(); + } + + public DocFieldConsumerPerField addField(FieldInfo fieldInfo) { + return new StoredFieldsWriterPerField(this, fieldInfo); + } +} Property changes on: src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DocFieldConsumerPerThread.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldConsumerPerThread.java (revision 0) +++ src/java/org/apache/lucene/index/DocFieldConsumerPerThread.java (revision 0) @@ -0,0 +1,27 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +abstract class DocFieldConsumerPerThread { + abstract void startDocument(DocumentsWriter.DocState docState) throws IOException; + abstract DocumentsWriter.DocWriter finishDocument(DocumentsWriter.DocState docState) throws IOException; + abstract DocFieldConsumerPerField addField(FieldInfo fi); + abstract void abort(); +} Property changes on: src/java/org/apache/lucene/index/DocFieldConsumerPerThread.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/InvertedDocEndConsumerPerField.java =================================================================== --- src/java/org/apache/lucene/index/InvertedDocEndConsumerPerField.java (revision 0) +++ src/java/org/apache/lucene/index/InvertedDocEndConsumerPerField.java (revision 0) @@ -0,0 +1,26 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.Fieldable; + +abstract class InvertedDocEndConsumerPerField { + abstract void start(Fieldable[] fields, int count, DocumentsWriter.DocState docState); + abstract void finish(DocumentsWriter.DocState docState); + abstract void abort(); +} Property changes on: src/java/org/apache/lucene/index/InvertedDocEndConsumerPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/StoredFieldsWriter.java =================================================================== --- src/java/org/apache/lucene/index/StoredFieldsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/StoredFieldsWriter.java (revision 0) @@ -0,0 +1,140 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; +import java.io.IOException; +import org.apache.lucene.store.RAMOutputStream; + +/** This is a DocFieldConsumer that writes stored fields. 
*/ +class StoredFieldsWriter extends DocFieldConsumer { + + FieldsWriter fieldsWriter; + final DocumentsWriter docWriter; + + PerDoc[] docFreeList = new PerDoc[1]; + int freeCount; + + public StoredFieldsWriter(DocumentsWriter docWriter) { + this.docWriter = docWriter; + } + + public DocFieldConsumerPerThread addThread() throws IOException { + return new StoredFieldsWriterPerThread(this); + } + + // We do nothing on flush because we write, immediately, + // to a doc store file + public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) {} + + synchronized public void closeDocStore(DocumentsWriter.FlushState state) throws IOException { + if (fieldsWriter != null) { + fieldsWriter.close(); + fieldsWriter = null; + + assert state.docStoreSegmentName != null; + state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_EXTENSION); + state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION); + + state.docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_EXTENSION); + state.docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION); + + if (4+state.numDocsInStore*8 != state.directory.fileLength(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION)) + throw new RuntimeException("after flush: fdx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) + " length in bytes of " + state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION); + } + } + + synchronized PerDoc getPerDoc() { + if (freeCount == 0) + return new PerDoc(); + else + return docFreeList[--freeCount]; + } + + synchronized void abort() { + if (fieldsWriter != null) { + try { + fieldsWriter.close(); + } catch (Throwable t) { + } + fieldsWriter = null; + } + } + + synchronized void finishDocument(PerDoc perDoc) throws IOException { + if (fieldsWriter == null) { + fieldsWriter = new FieldsWriter(docWriter.directory, + docWriter.docStoreSegment, + docWriter.fieldInfos); + + assert docWriter.docStoreSegment != null; + docWriter.addOpenFile(docWriter.docStoreSegment + "." + IndexFileNames.FIELDS_EXTENSION); + docWriter.addOpenFile(docWriter.docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION); + } + + // Append stored fields to the real FieldsWriter: + fieldsWriter.flushDocument(perDoc.numStoredFields, perDoc.fdt); + perDoc.reset(); + + free(perDoc); + } + + public boolean freeRAM() { + return false; + } + + void free(PerDoc perDoc) { + + // nocommit -- maybe grow freeList on allocation? 
+ // Recycle + if (freeCount == docFreeList.length) { + final PerDoc[] newArray = new PerDoc[docFreeList.length*2]; + System.arraycopy(docFreeList, 0, newArray, 0, freeCount); + docFreeList = newArray; + } + docFreeList[freeCount++] = perDoc; + } + + class PerDoc extends DocumentsWriter.DocWriter { + + // TODO: use something more memory efficient; for small + // docs the 1024 buffer size of RAMOutputStream wastes alot + RAMOutputStream fdt = new RAMOutputStream(); + int numStoredFields; + + void reset() { + fdt.reset(); + numStoredFields = 0; + } + + void abort() { + reset(); + free(this); + } + + public long sizeInBytes() { + // nocommit -- must get quanizted to buffer_size + return fdt.length(); + //return fdt.sizeInBytes(); + } + + public void finish() throws IOException { + finishDocument(this); + } + } +} Property changes on: src/java/org/apache/lucene/index/StoredFieldsWriter.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermVectorsTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/TermVectorsTermsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/TermVectorsTermsWriter.java (revision 0) @@ -0,0 +1,262 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; + +class TermVectorsTermsWriter extends TermsHashConsumer { + + final DocumentsWriter docWriter; + TermVectorsWriter termVectorsWriter; + PerDoc[] docFreeList = new PerDoc[1]; + int freeCount; + IndexOutput tvx; + IndexOutput tvd; + IndexOutput tvf; + + public TermVectorsTermsWriter(DocumentsWriter docWriter) { + this.docWriter = docWriter; + } + + public TermsHashConsumerPerThread addThread(TermsHashPerThread termsHashPerThread) { + return new TermVectorsTermsWriterPerThread(termsHashPerThread, this); + } + + void createPostings(RawPostingList[] postings, int start, int count) { + final int end = start + count; + for(int i=start;i 0) { + for(int i=0;i 0) - fp.resetPostingArrays(); - } } - - /** Move all per-document state that was accumulated in - * the ThreadState into the "real" stores. */ - public void writeDocument() throws IOException, AbortException { - - // If we hit an exception while appending to the - // stored fields or term vectors files, we have to - // abort all documents since we last flushed because - // it means those files are possibly inconsistent. 
- try { - - docWriter.numDocsInStore++; - - // Append stored fields to the real FieldsWriter: - docWriter.fieldsWriter.flushDocument(numStoredFields, fdtLocal); - fdtLocal.reset(); - - // Append term vectors to the real outputs: - final IndexOutput tvx = docWriter.tvx; - final IndexOutput tvd = docWriter.tvd; - final IndexOutput tvf = docWriter.tvf; - if (tvx != null) { - tvx.writeLong(tvd.getFilePointer()); - tvx.writeLong(tvf.getFilePointer()); - tvd.writeVInt(numVectorFields); - if (numVectorFields > 0) { - for(int i=0;i= hi) - return; - else if (hi == 1+lo) { - if (comparePostings(postings[lo], postings[hi]) > 0) { - final Posting tmp = postings[lo]; - postings[lo] = postings[hi]; - postings[hi] = tmp; - } - return; - } - - int mid = (lo + hi) >>> 1; - - if (comparePostings(postings[lo], postings[mid]) > 0) { - Posting tmp = postings[lo]; - postings[lo] = postings[mid]; - postings[mid] = tmp; - } - - if (comparePostings(postings[mid], postings[hi]) > 0) { - Posting tmp = postings[mid]; - postings[mid] = postings[hi]; - postings[hi] = tmp; - - if (comparePostings(postings[lo], postings[mid]) > 0) { - Posting tmp2 = postings[lo]; - postings[lo] = postings[mid]; - postings[mid] = tmp2; - } - } - - int left = lo + 1; - int right = hi - 1; - - if (left >= right) - return; - - Posting partition = postings[mid]; - - for (; ;) { - while (comparePostings(postings[right], partition) > 0) - --right; - - while (left < right && comparePostings(postings[left], partition) <= 0) - ++left; - - if (left < right) { - Posting tmp = postings[left]; - postings[left] = postings[right]; - postings[right] = tmp; - --right; - } else { - break; - } - } - - quickSort(postings, lo, left); - quickSort(postings, left + 1, hi); - } - - /** Do in-place sort of PostingVector array */ - void doVectorSort(PostingVector[] postings, int numPosting) { - quickSort(postings, 0, numPosting-1); - } - - void quickSort(PostingVector[] postings, int lo, int hi) { - if (lo >= hi) - return; - else if (hi == 1+lo) { - if (comparePostings(postings[lo].p, postings[hi].p) > 0) { - final PostingVector tmp = postings[lo]; - postings[lo] = postings[hi]; - postings[hi] = tmp; - } - return; - } - - int mid = (lo + hi) >>> 1; - - if (comparePostings(postings[lo].p, postings[mid].p) > 0) { - PostingVector tmp = postings[lo]; - postings[lo] = postings[mid]; - postings[mid] = tmp; - } - - if (comparePostings(postings[mid].p, postings[hi].p) > 0) { - PostingVector tmp = postings[mid]; - postings[mid] = postings[hi]; - postings[hi] = tmp; - - if (comparePostings(postings[lo].p, postings[mid].p) > 0) { - PostingVector tmp2 = postings[lo]; - postings[lo] = postings[mid]; - postings[mid] = tmp2; - } - } - - int left = lo + 1; - int right = hi - 1; - - if (left >= right) - return; - - PostingVector partition = postings[mid]; - - for (; ;) { - while (comparePostings(postings[right].p, partition.p) > 0) - --right; - - while (left < right && comparePostings(postings[left].p, partition.p) <= 0) - ++left; - - if (left < right) { - PostingVector tmp = postings[left]; - postings[left] = postings[right]; - postings[right] = tmp; - --right; - } else { - break; - } - } - - quickSort(postings, lo, left); - quickSort(postings, left + 1, hi); - } - - void quickSort(DocumentsWriterFieldData[] array, int lo, int hi) { - if (lo >= hi) - return; - else if (hi == 1+lo) { - if (array[lo].compareTo(array[hi]) > 0) { - final DocumentsWriterFieldData tmp = array[lo]; - array[lo] = array[hi]; - array[hi] = tmp; - } - return; - } - - int mid = (lo + hi) >>> 1; - - if 
(array[lo].compareTo(array[mid]) > 0) { - DocumentsWriterFieldData tmp = array[lo]; - array[lo] = array[mid]; - array[mid] = tmp; - } - - if (array[mid].compareTo(array[hi]) > 0) { - DocumentsWriterFieldData tmp = array[mid]; - array[mid] = array[hi]; - array[hi] = tmp; - - if (array[lo].compareTo(array[mid]) > 0) { - DocumentsWriterFieldData tmp2 = array[lo]; - array[lo] = array[mid]; - array[mid] = tmp2; - } - } - - int left = lo + 1; - int right = hi - 1; - - if (left >= right) - return; - - DocumentsWriterFieldData partition = array[mid]; - - for (; ;) { - while (array[right].compareTo(partition) > 0) - --right; - - while (left < right && array[left].compareTo(partition) <= 0) - ++left; - - if (left < right) { - DocumentsWriterFieldData tmp = array[left]; - array[left] = array[right]; - array[right] = tmp; - --right; - } else { - break; - } - } - - quickSort(array, lo, left); - quickSort(array, left + 1, hi); - } - - /** If there are fields we've seen but did not see again - * in the last run, then free them up. Also reduce - * postings hash size. */ - void trimFields() { - - int upto = 0; - for(int i=0;i> CHAR_BLOCK_SHIFT]; - int upto = p.textStart & CHAR_BLOCK_MASK; - while(text[upto] != 0xffff) - upto++; - return new String(text, p.textStart, upto-(p.textStart & BYTE_BLOCK_MASK)); - } - */ - - /** Compares term text for two Posting instance and - * returns -1 if p1 < p2; 1 if p1 > p2; else 0. - */ - int comparePostings(Posting p1, Posting p2) { - if (p1 == p2) - return 0; - final char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK; - final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK; - - assert text1 != text2 || pos1 != pos2; - - while(true) { - final char c1 = text1[pos1++]; - final char c2 = text2[pos2++]; - if (c1 != c2) { - if (0xffff == c2) - return 1; - else if (0xffff == c1) - return -1; - else - return c1-c2; - } else - // This method should never compare equal postings - // unless p1==p2 - assert c1 != 0xffff; - } - } - - String lastVectorFieldName; - - // Called only by assert - final boolean clearLastVectorFieldName() { - lastVectorFieldName = null; - return true; - } - - // Called only by assert - final boolean vectorFieldsInOrder(FieldInfo fi) { - try { - if (lastVectorFieldName != null) - return lastVectorFieldName.compareTo(fi.name) < 0; - else - return true; - } finally { - lastVectorFieldName = fi.name; - } - } - - PostingVector[] postingsVectors = new PostingVector[1]; - int maxPostingsVectors; - - // Used to read a string value for a field - ReusableStringReader stringReader = new ReusableStringReader(); - - final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(), - new UnicodeUtil.UTF8Result()}; } - Index: src/java/org/apache/lucene/index/RawPostingList.java =================================================================== --- src/java/org/apache/lucene/index/RawPostingList.java (revision 0) +++ src/java/org/apache/lucene/index/RawPostingList.java (revision 0) @@ -0,0 +1,36 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
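The comparePostings logic above depends on term text living in shared char[] pools, with each term terminated by the sentinel value 0xFFFF so no explicit length has to be stored alongside the posting. A standalone sketch of that sentinel-terminated comparison over plain char[] buffers (the real code first resolves the pool block from textStart via CHAR_BLOCK_SHIFT / CHAR_BLOCK_MASK; that step is omitted here):

    final class SentinelTermCompare {

      static final char END = 0xFFFF;   // terminator written after each term in the pool

      /** Compares two sentinel-terminated terms; returns <0, 0 or >0. */
      static int compare(char[] text1, int pos1, char[] text2, int pos2) {
        while (true) {
          final char c1 = text1[pos1++];
          final char c2 = text2[pos2++];
          if (c1 != c2) {
            if (c2 == END) return 1;      // text2 is a strict prefix of text1
            if (c1 == END) return -1;     // text1 is a strict prefix of text2
            return c1 - c2;
          } else if (c1 == END) {
            return 0;                     // both terms ended: equal
          }
        }
      }

      public static void main(String[] args) {
        char[] buf = { 'a', 'b', END, 'a', 'b', 'c', END };
        System.out.println(compare(buf, 0, buf, 3));  // "ab" vs "abc": prints a negative number
      }
    }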
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** This is the base class for an in-memory posting list, + * keyed by a Token. {@link TermsHash} maintains a hash + * table holding one instance of this per unique Token. + * Consumers of TermsHash (@link TermsHashConsumer} must + * subclass this class with its own concrete class. + * {@link FreqProxTermsWriter.RawPostingList} is the + * subclass used for the freq/prox postings, and {@link + * TermVectorsTermsWriter.PostingList} is the subclass + * used to hold TermVectors postings. */ + +abstract class RawPostingList { + final static int BYTES_SIZE = DocumentsWriter.OBJECT_HEADER_BYTES + 3*DocumentsWriter.INT_NUM_BYTE; + int textStart; + int intStart; + int byteStart; +} Property changes on: src/java/org/apache/lucene/index/RawPostingList.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/NormsWriterPerThread.java =================================================================== --- src/java/org/apache/lucene/index/NormsWriterPerThread.java (revision 0) +++ src/java/org/apache/lucene/index/NormsWriterPerThread.java (revision 0) @@ -0,0 +1,39 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +class NormsWriterPerThread extends InvertedDocEndConsumerPerThread { + final NormsWriter normsWriter; + + public NormsWriterPerThread(NormsWriter normsWriter) { + this.normsWriter = normsWriter; + } + + InvertedDocEndConsumerPerField addField(final FieldInfo fieldInfo) { + return new NormsWriterPerField(this, fieldInfo); + } + + void abort() {} + + void startDocument(DocumentsWriter.DocState docState) {} + void finishDocument(DocumentsWriter.DocState docState) {} + + boolean freeRAM() { + return false; + } +} Property changes on: src/java/org/apache/lucene/index/NormsWriterPerThread.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermsHashPerThread.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashPerThread.java (revision 0) +++ src/java/org/apache/lucene/index/TermsHashPerThread.java (revision 0) @@ -0,0 +1,132 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +class TermsHashPerThread extends InvertedDocConsumerPerThread { + + final TermsHash termsHash; + final TermsHashConsumerPerThread consumer; + final TermsHashPerThread nextPerThread; + + final CharBlockPool charPool; + final IntBlockPool intPool; + final ByteBlockPool bytePool; + final boolean primary; + + final RawPostingList freePostings[] = new RawPostingList[256]; + int freePostingsCount; + + // nocommit -- infoStream needs to pick this up somewhere + String maxTermPrefix; + + public TermsHashPerThread(final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) { + this.termsHash = termsHash; + this.consumer = termsHash.consumer.addThread(this); + + if (nextTermsHash != null) { + // We are primary + charPool = new CharBlockPool(termsHash.docWriter); + primary = true; + } else { + charPool = primaryPerThread.charPool; + primary = false; + } + + intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations); + bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations); + + if (nextTermsHash != null) + nextPerThread = nextTermsHash.addThread(this); + else + nextPerThread = null; + } + + /* + public TermsHashConsumerPerField[] fields() { + InvertedDocConsumerPerField[] fields = parent.fields(); + TermsHashConsumerPerField[] childFields = new TermsHashConsumerPerField[fields.length]; + for(int i=0;i 0 || waitQueue.numWaiting == 0; return abortCount > 0; } @@ -527,45 +485,69 @@ return true; } + synchronized private void initFlushState() { + initSegmentName(); + + if (flushState == null) { + flushState = new FlushState(); + flushState.directory = directory; + flushState.docWriter = this; + } + + flushState.docStoreSegmentName = docStoreSegment; + flushState.segmentName = segment; + flushState.numDocsInRAM = numDocsInRAM; + flushState.numDocsInStore = numDocsInStore; + flushState.flushedFiles = new HashSet(); + } + /** Flush all pending docs to a new segment */ synchronized int flush(boolean closeDocStore) throws IOException { assert allThreadsIdle(); - if (segment == null) - // In case we are asked to flush an empty segment - segment = writer.newSegmentName(); + // nocommit: this assert isn't right? 
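The TermsHashPerThread constructor above chains two consumers per thread: the primary per-thread (feeding freq/prox postings) owns the char pool, while the secondary per-thread (feeding term vectors) is built against the primary and shares that pool, so term text is stored only once per thread. A stripped-down sketch of that chaining (the consumer classes are placeholders, not the patch's types):

    final class TermsHashChainSketch {

      static final class CharPool {
        final char[][] buffers = new char[10][];   // shared storage for term text
      }

      static class PerThread {
        final CharPool charPool;
        final boolean primary;
        final PerThread next;                      // secondary consumer fed the same tokens

        /** Primary constructor: owns the pool and builds its secondary. */
        PerThread(boolean hasSecondary) {
          charPool = new CharPool();
          primary = true;
          next = hasSecondary ? new PerThread(this) : null;
        }

        /** Secondary constructor: shares the primary's pool, has no next. */
        private PerThread(PerThread primaryPerThread) {
          charPool = primaryPerThread.charPool;
          primary = false;
          next = null;
        }
      }

      public static void main(String[] args) {
        PerThread primary = new PerThread(true);
        System.out.println(primary.charPool == primary.next.charPool);   // true: pool is shared
      }
    }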
+ assert numDocsInRAM > 0; - newFiles = new ArrayList(); + assert nextDocID == numDocsInRAM; + assert waitQueue.numWaiting == 0; + assert waitQueue.waitingBytes == 0; + initFlushState(); + docStoreOffset = numDocsInStore; - int docCount; - - assert numDocsInRAM > 0; - if (infoStream != null) - message("flush postings as segment " + segment + " numDocs=" + numDocsInRAM); + message("flush postings as segment " + flushState.segmentName + " numDocs=" + numDocsInRAM); boolean success = false; try { if (closeDocStore) { - assert docStoreSegment != null; - assert docStoreSegment.equals(segment); - newFiles.addAll(files()); + assert flushState.docStoreSegmentName != null; + assert flushState.docStoreSegmentName.equals(flushState.segmentName); closeDocStore(); } - - fieldInfos.write(directory, segment + ".fnm"); - docCount = numDocsInRAM; + Collection threads = new HashSet(); + for(int i=0;i 0) - allFields.add(fp); - } - } - - // Sort by field name - Collections.sort(allFields); - final int numAllFields = allFields.size(); - - skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, - termsOut.maxSkipLevels, - numDocsInRAM, freqOut, proxOut); - - int start = 0; - while(start < numAllFields) { - - final String fieldName = ((DocumentsWriterFieldData) allFields.get(start)).fieldInfo.name; - - int end = start+1; - while(end < numAllFields && ((DocumentsWriterFieldData) allFields.get(end)).fieldInfo.name.equals(fieldName)) - end++; - - DocumentsWriterFieldData[] fields = new DocumentsWriterFieldData[end-start]; - for(int i=start;i 1.5*postingsFreeCount) { - int newSize = postingsFreeList.length; - while(newSize > 1.25*postingsFreeCount) { - newSize = (int) (newSize*0.8); - } - Posting[] newArray = new Posting[newSize]; - System.arraycopy(postingsFreeList, 0, newArray, 0, postingsFreeCount); - postingsFreeList = newArray; - } - - return flushedFiles; - } - synchronized void pushDeletes() { deletesFlushed.update(deletesInRAM); } - /** Returns the name of the file with this extension, on - * the current segment we are working on. */ - private String segmentFileName(String extension) { - return segment + "." + extension; - } - - private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) { - while(true) { - final char c1 = text1[pos1++]; - final char c2 = text2[pos2++]; - if (c1 != c2) { - if (0xffff == c2) - return 1; - else if (0xffff == c1) - return -1; - else - return c1-c2; - } else if (0xffff == c1) - return 0; - } - } - - private final TermInfo termInfo = new TermInfo(); // minimize consing - - final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result(); - - /* Walk through all unique text tokens (Posting - * instances) found in this field and serialize them - * into a single RAM segment. 
*/ - void appendPostings(DocumentsWriterFieldData[] fields, - TermInfosWriter termsOut, - IndexOutput freqOut, - IndexOutput proxOut) - throws CorruptIndexException, IOException { - - final int fieldNumber = fields[0].fieldInfo.number; - int numFields = fields.length; - - final DocumentsWriterFieldMergeState[] mergeStates = new DocumentsWriterFieldMergeState[numFields]; - - for(int i=0;i 0) { - - // Get the next term to merge - termStates[0] = mergeStates[0]; - int numToMerge = 1; - - for(int i=1;i 0) { - - if ((++df % skipInterval) == 0) { - skipListWriter.setSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength); - skipListWriter.bufferSkip(df); - } - - DocumentsWriterFieldMergeState minState = termStates[0]; - for(int i=1;i lastDoc || df == 1; - - final int newDocCode = (doc-lastDoc)<<1; - lastDoc = doc; - - final ByteSliceReader prox = minState.prox; - - // Carefully copy over the prox + payload info, - // changing the format to match Lucene's segment - // format. - for(int j=0;j 0) - copyBytes(prox, proxOut, payloadLength); - } else { - assert 0 == (code & 1); - proxOut.writeVInt(code>>1); - } - } - - if (1 == termDocFreq) { - freqOut.writeVInt(newDocCode|1); - } else { - freqOut.writeVInt(newDocCode); - freqOut.writeVInt(termDocFreq); - } - - if (!minState.nextDoc()) { - - // Remove from termStates - int upto = 0; - for(int i=0;i 0; - - // Done merging this term - - long skipPointer = skipListWriter.writeSkip(freqOut); - - // Write term - termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer)); - - // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(text, start, termsUTF8); - - // TODO: we could save O(n) re-scan of the term by - // computing the shared prefix with the last term - // while during the UTF8 encoding - termsOut.add(fieldNumber, - termsUTF8.result, - termsUTF8.length, - termInfo); - } - } - synchronized void close() { + // nocommit -- pauseAllThreads here??? closed = true; notifyAll(); } + synchronized void initSegmentName() { + if (segment == null) + segment = writer.newSegmentName(); + if (docStoreSegment == null) + docStoreSegment = segment; + } + /** Returns a free (idle) ThreadState that may be used for * indexing this one document. This call also pauses if a * flush is pending. If delTerm is non-null then we @@ -961,14 +618,16 @@ // again. DocumentsWriterThreadState state = (DocumentsWriterThreadState) threadBindings.get(Thread.currentThread()); if (state == null) { - // First time this thread has called us since last flush + + // First time this thread has called us since last + // flush. 
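The appendPostings merge above writes the freq stream with Lucene's delta encoding: the gap to the previous docID is shifted left one bit, the low bit flags the common termFreq==1 case, and larger freqs follow as a separate VInt. A self-contained sketch of just that encoding step (the VInt writer is inlined; the output stream is a plain byte buffer rather than an IndexOutput):

    import java.io.ByteArrayOutputStream;

    final class FreqPostingEncoder {

      private final ByteArrayOutputStream out = new ByteArrayOutputStream();
      private int lastDoc = 0;

      /** Appends one (docID, termFreq) entry; docIDs must arrive in increasing order. */
      void add(int doc, int termFreq) {
        final int docCode = (doc - lastDoc) << 1;   // gap, shifted to make room for the freq flag
        lastDoc = doc;
        if (termFreq == 1) {
          writeVInt(docCode | 1);                   // low bit set: freq is 1 and not written
        } else {
          writeVInt(docCode);
          writeVInt(termFreq);
        }
      }

      /** Standard VInt: 7 bits per byte, high bit means "more bytes follow". */
      private void writeVInt(int i) {
        while ((i & ~0x7F) != 0) {
          out.write((i & 0x7F) | 0x80);
          i >>>= 7;
        }
        out.write(i);
      }

      byte[] toBytes() {
        return out.toByteArray();
      }

      public static void main(String[] args) {
        FreqPostingEncoder enc = new FreqPostingEncoder();
        enc.add(3, 1);     // doc 3, freq 1  -> single VInt 7
        enc.add(10, 4);    // doc 10, freq 4 -> VInt 14 then VInt 4
        System.out.println(enc.toBytes().length + " bytes written");
      }
    }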
Find the least loaded thread state: DocumentsWriterThreadState minThreadState = null; for(int i=0;i= MAX_THREAD_STATE)) { state = minThreadState; state.numThreads++; } else { @@ -987,46 +646,50 @@ // not be paused nor a flush pending: waitReady(state); - if (segment == null) - segment = writer.newSegmentName(); + // Allocate segment name if this is the first doc since + // last flush: + initSegmentName(); state.isIdle = false; + boolean success = false; try { - boolean success = false; - try { - state.init(doc, nextDocID); - if (delTerm != null) { - addDeleteTerm(delTerm, state.docID); - state.doFlushAfter = timeToFlushDeletes(); - } - // Only increment nextDocID & numDocsInRAM on successful init - nextDocID++; - numDocsInRAM++; + state.docState.docID = nextDocID; + assert writer.testPoint("DocumentsWriter.ThreadState.init start"); - // We must at this point commit to flushing to ensure we - // always get N docs when we flush by doc count, even if - // > 1 thread is adding documents: - if (!flushPending && maxBufferedDocs != IndexWriter.DISABLE_AUTO_FLUSH - && numDocsInRAM >= maxBufferedDocs) { - flushPending = true; - state.doFlushAfter = true; - } + // nocommit -- should I put a + // DocConsumer.initDocument here??? + + if (delTerm != null) { + addDeleteTerm(delTerm, state.docState.docID); + state.doFlushAfter = timeToFlushDeletes(); + } - success = true; - } finally { - if (!success) { - // Forcefully idle this ThreadState: - state.isIdle = true; - notifyAll(); - if (state.doFlushAfter) { - state.doFlushAfter = false; - flushPending = false; - } + // nocommit -- fix: Only increment nextDocID & numDocsInRAM on successful init + nextDocID++; + numDocsInRAM++; + + // We must at this point commit to flushing to ensure we + // always get N docs when we flush by doc count, even if + // > 1 thread is adding documents: + if (!flushPending && + maxBufferedDocs != IndexWriter.DISABLE_AUTO_FLUSH + && numDocsInRAM >= maxBufferedDocs) { + flushPending = true; + state.doFlushAfter = true; + } + + success = true; + } finally { + if (!success) { + // Forcefully idle this ThreadState: + state.isIdle = true; + notifyAll(); + if (state.doFlushAfter) { + state.doFlushAfter = false; + flushPending = false; } } - } catch (AbortException ae) { - abort(ae); } return state; @@ -1049,20 +712,33 @@ // This call is synchronized but fast final DocumentsWriterThreadState state = getThreadState(doc, delTerm); + + final DocState docState = state.docState; + docState.doc = doc; + docState.analyzer = analyzer; + + // nocommit -- only set these on change through IW + docState.maxFieldLength = writer.getMaxFieldLength(); + docState.infoStream = infoStream; + docState.similarity = writer.getSimilarity(); + try { boolean success = false; try { try { - // This call is not synchronized and does all the work - state.processDocument(analyzer); + // This call is not synchronized and does all the + // work + state.consumer.processDocument(docState); } finally { // Note that we must call finishDocument even on // exception, because for a non-aborting // exception, a portion of the document has been - // indexed (and its ID is marked for deletion), so - // all index files must be updated to record this - // document. This call is synchronized but fast. - finishDocument(state); + // indexed (and its ID is marked for deletion, + // below), so all index files must be updated to + // record this document. This call is + // synchronized but fast. 
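The thread binding above keeps one indexing state per calling thread and, once the cap on states is reached, reuses the least-loaded existing state rather than allocating more. A minimal sketch of that affinity scheme (MAX_STATES and ThreadState are illustrative stand-ins for MAX_THREAD_STATE and DocumentsWriterThreadState):

    import java.util.HashMap;
    import java.util.Map;

    final class ThreadStateBinder {

      private static final int MAX_STATES = 5;   // illustrative cap

      private final Map<Thread, ThreadState> bindings = new HashMap<Thread, ThreadState>();
      private ThreadState[] states = new ThreadState[0];

      synchronized ThreadState get() {
        ThreadState state = bindings.get(Thread.currentThread());
        if (state == null) {
          // First call from this thread: find the least loaded existing state.
          ThreadState minState = null;
          for (ThreadState s : states)
            if (minState == null || s.numThreads < minState.numThreads)
              minState = s;
          if (minState != null && (minState.numThreads == 0 || states.length >= MAX_STATES)) {
            state = minState;                    // share an existing state
            state.numThreads++;
          } else {
            state = new ThreadState();           // still room: allocate a fresh one
            state.numThreads = 1;
            ThreadState[] newStates = new ThreadState[states.length + 1];
            System.arraycopy(states, 0, newStates, 0, states.length);
            newStates[states.length] = state;
            states = newStates;
          }
          bindings.put(Thread.currentThread(), state);
        }
        return state;
      }

      static final class ThreadState {
        int numThreads;   // number of threads currently bound to this state
      }
    }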
+ final DocWriter perDoc = state.consumer.finishDocument(docState); + finishDocument(state, perDoc); } success = true; } finally { @@ -1081,7 +757,7 @@ // since likely it was partially added. This // keeps indexing as "all or none" (atomic) when // adding a document: - addDeleteDocID(state.docID); + addDeleteDocID(state.docState.docID); } } } @@ -1114,12 +790,13 @@ } synchronized private void waitReady(DocumentsWriterThreadState state) { - while(!closed && ((state != null && !state.isIdle) || pauseThreads != 0 || flushPending || abortCount > 0)) + while(!closed && ((state != null && !state.isIdle) || pauseThreads != 0 || flushPending || abortCount > 0)) { try { wait(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } + } if (closed) throw new AlreadyClosedException("this IndexWriter is closed"); @@ -1298,63 +975,44 @@ } /** Does the synchronized work to finish/flush the - * inverted document. */ - private synchronized void finishDocument(DocumentsWriterThreadState state) throws IOException, AbortException { - if (abortCount > 0) { - // Forcefully idle this threadstate -- its state will - // be reset by abort() - state.isIdle = true; - notifyAll(); - return; - } + * inverted document. */ + private void finishDocument(DocumentsWriterThreadState perThread, DocWriter docWriter) throws IOException, AbortException { - if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH - && numBytesUsed >= ramBufferSize) + // nocommit -- if docWriter is null, we must put a + // "placeholder" into the queue, else we will think that + // doc isn't done yet. + + boolean doBalance = ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && numBytesUsed >= ramBufferSize; + + if (doBalance) balanceRAM(); - // Now write the indexed document to the real files. - if (nextWriteDocID == state.docID) { - // It's my turn, so write everything now: - nextWriteDocID++; - state.writeDocument(); - state.isIdle = true; + synchronized(this) { + + assert docWriter == null || docWriter.docID == perThread.docState.docID; + + perThread.isIdle = true; notifyAll(); - // If any states were waiting on me, sweep through and - // flush those that are enabled by my write. - if (numWaiting > 0) { - boolean any = true; - while(any) { - any = false; - for(int i=0;i i+1) - // Swap in the last waiting state to fill in - // the hole we just created. It's important - // to do this as-we-go and not at the end of - // the loop, because if we hit an aborting - // exception in one of the s.writeDocument - // calls (above), it leaves this array in an - // inconsistent state: - waitingThreadStates[i] = waitingThreadStates[numWaiting-1]; - numWaiting--; - } else { - assert !s.isIdle; - i++; - } - } - } + if (abortCount > 0) { + + // We are currently aborting, and another thread is + // waiting for me to become idle. We just forcefully + // idle this threadState; it will be fully reset by + // abort() + if (docWriter != null) + docWriter.abort(); + + return; } - } else { - // Another thread got a docID before me, but, it - // hasn't finished its processing. So add myself to - // the line but don't hold up this thread. - waitingThreadStates[numWaiting++] = state; + + if (docWriter != null) + waitQueue.add(docWriter); + + if (bufferIsFull && !flushPending) { + flushPending = true; + perThread.doFlushAfter = true; + } } } @@ -1367,35 +1025,10 @@ NumberFormat nf = NumberFormat.getInstance(); - /* Used only when writing norms to fill in default norm - * value into the holes in docID stream for those docs - * that didn't have this field. 
*/ - static void fillBytes(IndexOutput out, byte b, int numBytes) throws IOException { - for(int i=0;i - // IndexOutput - while(numBytes > 0) { - final int chunk; - if (numBytes > 4096) - chunk = 4096; - else - chunk = (int) numBytes; - srcIn.readBytes(copyByteBuffer, 0, chunk); - destIn.writeBytes(copyByteBuffer, 0, chunk); - numBytes -= chunk; - } - } - - // Used only when infoStream != null + // TODO FI: this is not flexible -- we can't hardwire + // extensions in here: private long segmentSize(String segmentName) throws IOException { + // Used only when infoStream != null assert infoStream != null; long size = directory.fileLength(segmentName + ".tii") + @@ -1410,62 +1043,12 @@ return size; } - final private static int POINTER_NUM_BYTE = 4; - final private static int INT_NUM_BYTE = 4; - final private static int CHAR_NUM_BYTE = 2; + // Coarse estimates used to measure RAM usage of buffered deletes + final static int OBJECT_HEADER_BYTES = 8; + final static int POINTER_NUM_BYTE = 4; + final static int INT_NUM_BYTE = 4; + final static int CHAR_NUM_BYTE = 2; - // Why + 5*POINTER_NUM_BYTE below? - // 1: Posting has "vector" field which is a pointer - // 2: Posting is referenced by postingsFreeList array - // 3,4,5: Posting is referenced by postings hash, which - // targets 25-50% fill factor; approximate this - // as 3X # pointers - final static int POSTING_NUM_BYTE = OBJECT_HEADER_BYTES + 9*INT_NUM_BYTE + 5*POINTER_NUM_BYTE; - - // Holds free pool of Posting instances - private Posting[] postingsFreeList; - private int postingsFreeCount; - private int postingsAllocCount; - - /* Allocate more Postings from shared pool */ - synchronized void getPostings(Posting[] postings) { - numBytesUsed += postings.length * POSTING_NUM_BYTE; - final int numToCopy; - if (postingsFreeCount < postings.length) - numToCopy = postingsFreeCount; - else - numToCopy = postings.length; - final int start = postingsFreeCount-numToCopy; - System.arraycopy(postingsFreeList, start, - postings, 0, numToCopy); - postingsFreeCount -= numToCopy; - - // Directly allocate the remainder if any - if (numToCopy < postings.length) { - final int extra = postings.length - numToCopy; - final int newPostingsAllocCount = postingsAllocCount + extra; - if (newPostingsAllocCount > postingsFreeList.length) - postingsFreeList = new Posting[(int) (1.25 * newPostingsAllocCount)]; - - balanceRAM(); - for(int i=numToCopy;i freeTrigger) { if (infoStream != null) message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) + " vs trigger=" + toMB(flushTrigger) + " allocMB=" + toMB(numBytesAlloc) + " vs trigger=" + toMB(freeTrigger) + - " postingsFree=" + toMB(postingsFreeCount*POSTING_NUM_BYTE) + " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) + " charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE)); - // When we've crossed 100% of our target Postings - // RAM usage, try to free up until we're back down - // to 95% final long startBytesAlloc = numBytesAlloc; - final int postingsFreeChunk = (int) (BYTE_BLOCK_SIZE / POSTING_NUM_BYTE); - int iter = 0; // We free equally from each pool in 64 KB // chunks until we are below our threshold // (freeLevel) + boolean any = true; + while(numBytesAlloc > freeLevel) { - if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size() && 0 == postingsFreeCount) { - // Nothing else to free -- must flush now. 
- bufferIsFull = true; - if (infoStream != null) - message(" nothing to free; now set bufferIsFull"); - break; - } + + synchronized(this) { + if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size() && 0 == freeIntBlocks.size() && !any) { + // Nothing else to free -- must flush now. + bufferIsFull = numBytesUsed > flushTrigger; + if (infoStream != null) { + if (numBytesUsed > flushTrigger) + message(" nothing to free; now set bufferIsFull"); + else + message(" nothing to free"); + } + assert numBytesUsed <= numBytesAlloc; + break; + } - if ((0 == iter % 3) && byteBlockAllocator.freeByteBlocks.size() > 0) { - byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1); - numBytesAlloc -= BYTE_BLOCK_SIZE; - } + if ((0 == iter % 4) && byteBlockAllocator.freeByteBlocks.size() > 0) { + byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1); + numBytesAlloc -= BYTE_BLOCK_SIZE; + } - if ((1 == iter % 3) && freeCharBlocks.size() > 0) { - freeCharBlocks.remove(freeCharBlocks.size()-1); - numBytesAlloc -= CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; - } + if ((1 == iter % 4) && freeCharBlocks.size() > 0) { + freeCharBlocks.remove(freeCharBlocks.size()-1); + numBytesAlloc -= CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; + } - if ((2 == iter % 3) && postingsFreeCount > 0) { - final int numToFree; - if (postingsFreeCount >= postingsFreeChunk) - numToFree = postingsFreeChunk; - else - numToFree = postingsFreeCount; - Arrays.fill(postingsFreeList, postingsFreeCount-numToFree, postingsFreeCount, null); - postingsFreeCount -= numToFree; - postingsAllocCount -= numToFree; - numBytesAlloc -= numToFree * POSTING_NUM_BYTE; + if ((2 == iter % 4) && freeIntBlocks.size() > 0) { + freeIntBlocks.remove(freeIntBlocks.size()-1); + numBytesAlloc -= INT_BLOCK_SIZE * INT_NUM_BYTE; + } } + if ((3 == iter % 4) && any) + // Ask consumer to free any recycled state + any = consumer.freeRAM(); + iter++; } - + if (infoStream != null) message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc)/1024./1024.) + " usedMB=" + nf.format(numBytesUsed/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.)); @@ -1631,14 +1277,125 @@ // using, go ahead and flush. This prevents // over-allocating and then freeing, with every // flush. - if (numBytesUsed > flushTrigger) { - if (infoStream != null) - message(" RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) + - " allocMB=" + nf.format(numBytesAlloc/1024./1024.) + - " triggerMB=" + nf.format(flushTrigger/1024./1024.)); + synchronized(this) { - bufferIsFull = true; + if (numBytesUsed > flushTrigger) { + if (infoStream != null) + message(" RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) + + " allocMB=" + nf.format(numBytesAlloc/1024./1024.) + + " triggerMB=" + nf.format(flushTrigger/1024./1024.)); + + bufferIsFull = true; + } } } } + + final WaitQueue waitQueue = new WaitQueue(); + + // nocommit -- do we allow a consumer which sometimes + // returns a perDoc and other times doesn't? + private class WaitQueue { + DocWriter[] waiting; + int nextWriteDocID; + int nextWriteLoc; + int numWaiting; + long waitingBytes; + + public WaitQueue() { + waiting = new DocWriter[10]; + } + + synchronized void reset() { + // nocommit -- I think nextWriteLoc can stay as is? 
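balanceRAM above frees memory round-robin from the byte, char and int block pools (with a fourth rotation slot that asks the consumer chain to free its own recycled state) until allocation drops below the target level, instead of draining any single pool completely. A simplified sketch of just the pool rotation (block sizes and the stopping level are illustrative; the consumer hook is omitted):

    import java.util.ArrayList;
    import java.util.List;

    final class RamBalancerSketch {

      static final int BLOCK_SIZE = 32 * 1024;   // illustrative block size

      long bytesAllocated;
      final List<byte[]> freeByteBlocks = new ArrayList<byte[]>();
      final List<char[]> freeCharBlocks = new ArrayList<char[]>();
      final List<int[]>  freeIntBlocks  = new ArrayList<int[]>();

      /** Frees blocks round-robin until at or below freeLevel, or nothing is left. */
      void balance(long freeLevel) {
        int iter = 0;
        while (bytesAllocated > freeLevel) {
          if (freeByteBlocks.isEmpty() && freeCharBlocks.isEmpty() && freeIntBlocks.isEmpty())
            break;                                   // nothing left to free; caller must flush

          if (iter % 3 == 0 && !freeByteBlocks.isEmpty()) {
            freeByteBlocks.remove(freeByteBlocks.size() - 1);
            bytesAllocated -= BLOCK_SIZE;
          }
          if (iter % 3 == 1 && !freeCharBlocks.isEmpty()) {
            freeCharBlocks.remove(freeCharBlocks.size() - 1);
            bytesAllocated -= BLOCK_SIZE;
          }
          if (iter % 3 == 2 && !freeIntBlocks.isEmpty()) {
            freeIntBlocks.remove(freeIntBlocks.size() - 1);
            bytesAllocated -= BLOCK_SIZE;
          }
          iter++;
        }
      }
    }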
+ assert numWaiting == 0; + assert waitingBytes == 0; + nextWriteDocID = 0; + } + + synchronized void abort() { + int count = 0; + for(int i=0;i= nextWriteDocID; + + if (doc.docID == nextWriteDocID) { + writeDocument(doc); + while(true) { + doc = waiting[nextWriteLoc]; + if (doc != null) { + numWaiting--; + waiting[nextWriteLoc] = null; + waitingBytes -= doc.sizeInBytes(); + writeDocument(doc); + } else + break; + } + } else { + + // I finished before documents that were added + // before me. This can easily happen when I am a + // small doc and the docs before me were large, or, + // just due to luck in the thread scheduling. Just + // add myself to the queue and when that large doc + // finishes, it will flush me: + int gap = doc.docID - nextWriteDocID; + if (gap >= waiting.length) { + // Grow queue + // nocommit -- must pause when wait queue grows + // too large + DocWriter[] newArray = new DocWriter[ArrayUtil.getNextSize(gap)]; + assert nextWriteLoc >= 0; + System.arraycopy(waiting, nextWriteLoc, newArray, 0, waiting.length-nextWriteLoc); + System.arraycopy(waiting, 0, newArray, waiting.length-nextWriteLoc, nextWriteLoc); + nextWriteLoc = 0; + waiting = newArray; + gap = doc.docID - nextWriteDocID; + } + + int loc = nextWriteLoc + gap; + if (loc >= waiting.length) + loc -= waiting.length; + + // We should only wrap one time + assert loc < waiting.length; + + // Nobody should be in my spot! + assert waiting[loc] == null; + waiting[loc] = doc; + numWaiting++; + waitingBytes += doc.sizeInBytes(); + } + } + } } Index: src/java/org/apache/lucene/index/IntBlockPool.java =================================================================== --- src/java/org/apache/lucene/index/IntBlockPool.java (revision 0) +++ src/java/org/apache/lucene/index/IntBlockPool.java (revision 0) @@ -0,0 +1,59 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +final class IntBlockPool { + + public int[][] buffers = new int[10][]; + + int bufferUpto = -1; // Which buffer we are upto + public int intUpto = DocumentsWriter.INT_BLOCK_SIZE; // Where we are in head buffer + + public int[] buffer; // Current head buffer + public int intOffset = -DocumentsWriter.INT_BLOCK_SIZE; // Current head offset + + final private DocumentsWriter docWriter; + final boolean trackAllocations; + + public IntBlockPool(DocumentsWriter docWriter, boolean trackAllocations) { + this.docWriter = docWriter; + this.trackAllocations = trackAllocations; + } + + public void reset() { + // nocommit -- does this run HOT when used by term vectors??? 
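The WaitQueue above is a reorder buffer: documents may finish inverting on their threads in any order, but must reach the doc stores in docID order, so an out-of-order finisher is parked in a circular array at offset (docID - nextWriteDocID) until the writer catches up. A minimal sketch of the idea (writeDocument is a placeholder for flushing one document's buffered state; byte accounting and pausing are omitted):

    final class DocReorderBuffer {

      private Object[] waiting = new Object[8];   // circular buffer of finished, not-yet-written docs
      private int nextWriteDocID;                 // docID the writer expects next
      private int nextWriteLoc;                   // slot in 'waiting' for nextWriteDocID

      synchronized void add(int docID, Object doc) {
        if (docID == nextWriteDocID) {
          // It's our turn: write this doc, then drain any queued successors.
          writeDocument(doc);
          advance();
          while (waiting[nextWriteLoc] != null) {
            Object queued = waiting[nextWriteLoc];
            waiting[nextWriteLoc] = null;
            writeDocument(queued);
            advance();
          }
        } else {
          // Earlier docIDs are still in flight: park this doc at its gap offset.
          int gap = docID - nextWriteDocID;
          if (gap >= waiting.length)
            grow(gap);
          int loc = (nextWriteLoc + gap) % waiting.length;
          waiting[loc] = doc;
        }
      }

      private void advance() {
        nextWriteDocID++;
        nextWriteLoc = (nextWriteLoc + 1) % waiting.length;
      }

      private void grow(int gap) {
        Object[] newWaiting = new Object[Math.max(gap + 1, waiting.length * 2)];
        // Unwrap the circular buffer so nextWriteDocID lands at slot 0.
        for (int i = 0; i < waiting.length; i++)
          newWaiting[i] = waiting[(nextWriteLoc + i) % waiting.length];
        waiting = newWaiting;
        nextWriteLoc = 0;
      }

      private void writeDocument(Object doc) {
        // Placeholder: the real code appends the doc's buffered stored fields / term vectors here.
      }
    }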
+ docWriter.recycleIntBlocks(buffers, 0, 1+bufferUpto); + bufferUpto = -1; + intUpto = DocumentsWriter.INT_BLOCK_SIZE; + intOffset = -DocumentsWriter.INT_BLOCK_SIZE; + } + + public void nextBuffer() { + if (1+bufferUpto == buffers.length) { + int[][] newBuffers = new int[(int) (buffers.length*1.5)][]; + System.arraycopy(buffers, 0, newBuffers, 0, buffers.length); + buffers = newBuffers; + } + buffer = buffers[1+bufferUpto] = docWriter.getIntBlock(trackAllocations); + bufferUpto++; + + intUpto = 0; + intOffset += DocumentsWriter.INT_BLOCK_SIZE; + } +} + Property changes on: src/java/org/apache/lucene/index/IntBlockPool.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/StoredFieldsWriterPerField.java =================================================================== --- src/java/org/apache/lucene/index/StoredFieldsWriterPerField.java (revision 0) +++ src/java/org/apache/lucene/index/StoredFieldsWriterPerField.java (revision 0) @@ -0,0 +1,49 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import org.apache.lucene.document.Fieldable; + +class StoredFieldsWriterPerField extends DocFieldConsumerPerField { + + final StoredFieldsWriterPerThread perThread; + final FieldInfo fieldInfo; + + public StoredFieldsWriterPerField(StoredFieldsWriterPerThread perThread, FieldInfo fieldInfo) { + this.perThread = perThread; + this.fieldInfo = fieldInfo; + } + + // Process all occurrences of a single field in one doc; + // count is 1 if a given field occurs only once in the + // Document, which is the "typical" case + public void processFields(DocumentsWriter.DocState state, Fieldable[] fields, int count) throws IOException { + for(int i=0;i= 0; + + if (!doVectors || numPostings == 0) + return; + + final IndexOutput tvf = perThread.doc.tvf; + + // This is called once, after inverting all occurences + // of a given field in the doc. At this point we flush + // our hash into the DocWriter. 
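Addresses handed out by these block pools are single ints: because the block size is a power of two, the buffer index is the address shifted right by the block-size shift and the in-buffer offset is the address masked with (blockSize - 1), the same arithmetic as the CHAR_BLOCK_SHIFT / CHAR_BLOCK_MASK lookups elsewhere in the patch. A small sketch of that addressing over a growable table of int blocks (block size and class name are illustrative):

    final class BlockAddressing {

      static final int BLOCK_SHIFT = 13;                 // illustrative: 8192-entry blocks
      static final int BLOCK_SIZE  = 1 << BLOCK_SHIFT;
      static final int BLOCK_MASK  = BLOCK_SIZE - 1;

      int[][] buffers = new int[4][];
      int bufferUpto = -1;
      int upto = BLOCK_SIZE;       // position within the head buffer; starts "full" to force allocation
      int offset = -BLOCK_SIZE;    // global address of the head buffer's first slot

      /** Appends one value and returns its global address. */
      int add(int value) {
        if (upto == BLOCK_SIZE) {
          if (++bufferUpto == buffers.length) {          // grow the buffer table by 1.5x
            int[][] newBuffers = new int[(int) (buffers.length * 1.5)][];
            System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
            buffers = newBuffers;
          }
          buffers[bufferUpto] = new int[BLOCK_SIZE];
          upto = 0;
          offset += BLOCK_SIZE;
        }
        buffers[bufferUpto][upto] = value;
        return offset + upto++;
      }

      /** Reads a value back from its global address. */
      int get(int address) {
        return buffers[address >> BLOCK_SHIFT][address & BLOCK_MASK];
      }

      public static void main(String[] args) {
        BlockAddressing pool = new BlockAddressing();
        int addr = pool.add(42);
        System.out.println(pool.get(addr));   // prints 42
      }
    }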
+ + assert fieldInfo.storeTermVector; + assert perThread.vectorFieldsInOrder(fieldInfo); + + perThread.doc.addField(termsHashPerField.fieldInfo.number); + + final RawPostingList[] postings = termsHashPerField.sortPostings(); + + tvf.writeVInt(numPostings); + byte bits = 0x0; + if (doVectorPositions) + bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR; + if (doVectorOffsets) + bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; + tvf.writeByte(bits); + + int encoderUpto = 0; + int lastTermBytesCount = 0; + + final ByteSliceReader reader = perThread.vectorSliceReader; + final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers; + for(int j=0;j> DocumentsWriter.CHAR_BLOCK_SHIFT]; + final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK; + + // We swap between two encoders to save copying + // last Term's byte array + final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto]; + + // TODO: we could do this incrementally + UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); + final int termBytesCount = utf8Result.length; + + // TODO: UTF16toUTF8 could tell us this prefix + // Compute common prefix between last term and + // this term + int prefix = 0; + if (j > 0) { + final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result; + final byte[] termBytes = perThread.utf8Results[encoderUpto].result; + while(prefix < lastTermBytesCount && prefix < termBytesCount) { + if (lastTermBytes[prefix] != termBytes[prefix]) + break; + prefix++; + } + } + encoderUpto = 1-encoderUpto; + lastTermBytesCount = termBytesCount; + + final int suffix = termBytesCount - prefix; + tvf.writeVInt(prefix); + tvf.writeVInt(suffix); + tvf.writeBytes(utf8Result.result, prefix, suffix); + tvf.writeVInt(freq); + + if (doVectorPositions) { + termsHashPerField.initReader(reader, posting, 0); + reader.writeTo(tvf); + } + + if (doVectorOffsets) { + termsHashPerField.initReader(reader, posting, 1); + reader.writeTo(tvf); + } + } + + termsHashPerField.reset(); + } + + void newTerm(Token t, RawPostingList p0) { + + TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0; + + p.freq = 1; + + if (doVectorOffsets) { + final int startOffset = docState.offset + t.startOffset(); + final int endOffset = docState.offset + t.endOffset(); + termsHashPerField.writeVInt(1, startOffset); + termsHashPerField.writeVInt(1, endOffset - startOffset); + p.lastOffset = endOffset; + } + + if (doVectorPositions) { + termsHashPerField.writeVInt(0, docState.position); + p.lastPosition = docState.position; + } + } + + void addTerm(Token t, RawPostingList p0) { + + TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0; + p.freq++; + + if (doVectorOffsets) { + final int startOffset = docState.offset + t.startOffset(); + final int endOffset = docState.offset + t.endOffset(); + termsHashPerField.writeVInt(1, startOffset - p.lastOffset); + termsHashPerField.writeVInt(1, endOffset - startOffset); + p.lastOffset = endOffset; + } + + if (doVectorPositions) { + termsHashPerField.writeVInt(0, docState.position - p.lastPosition); + p.lastPosition = docState.position; + } + } + + void skippingLongTerm(Token t) {} +} Property changes on: src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/BufferedNorms.java =================================================================== --- 
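The term texts written to the .tvf stream above are prefix-compressed: for each term only the length of the prefix shared with the previous term, the suffix length, and the suffix bytes are written, with two UTF-8 scratch buffers swapped to avoid copying the previous term. A self-contained sketch of the compression step over already-sorted terms (stream and buffer handling simplified to a plain byte buffer):

    import java.io.ByteArrayOutputStream;
    import java.nio.charset.StandardCharsets;

    final class PrefixCompressedTermWriter {

      private final ByteArrayOutputStream out = new ByteArrayOutputStream();
      private byte[] lastTerm = new byte[0];

      /** Writes one term; terms must arrive in sorted order for good compression. */
      void add(String term) {
        byte[] bytes = term.getBytes(StandardCharsets.UTF_8);

        // Length of the prefix shared with the previously written term.
        int prefix = 0;
        int max = Math.min(lastTerm.length, bytes.length);
        while (prefix < max && lastTerm[prefix] == bytes[prefix])
          prefix++;

        int suffix = bytes.length - prefix;
        writeVInt(prefix);
        writeVInt(suffix);
        out.write(bytes, prefix, suffix);     // only the new tail is stored

        lastTerm = bytes;
      }

      private void writeVInt(int i) {
        while ((i & ~0x7F) != 0) {
          out.write((i & 0x7F) | 0x80);
          i >>>= 7;
        }
        out.write(i);
      }

      int sizeInBytes() {
        return out.size();
      }

      public static void main(String[] args) {
        PrefixCompressedTermWriter w = new PrefixCompressedTermWriter();
        w.add("apache");
        w.add("apache lucene");   // shares the "apache" prefix, so only " lucene" is stored
        System.out.println(w.sizeInBytes() + " bytes");
      }
    }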
src/java/org/apache/lucene/index/BufferedNorms.java (revision 664673) +++ src/java/org/apache/lucene/index/BufferedNorms.java (working copy) @@ -1,60 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import org.apache.lucene.store.RAMOutputStream; -import org.apache.lucene.search.Similarity; - -/* Stores norms, buffered in RAM, until they are flushed - * to a partial segment. */ -final class BufferedNorms { - - RAMOutputStream out; - int upto; - - private static final byte defaultNorm = Similarity.encodeNorm(1.0f); - - BufferedNorms() { - out = new RAMOutputStream(); - } - - void add(float norm) throws IOException { - byte b = Similarity.encodeNorm(norm); - out.writeByte(b); - upto++; - } - - void reset() { - out.reset(); - upto = 0; - } - - void fill(int docID) throws IOException { - // Must now fill in docs that didn't have this - // field. Note that this is how norms can consume - // tremendous storage when the docs have widely - // varying different fields, because we are not - // storing the norms sparsely (see LUCENE-830) - if (upto < docID) { - DocumentsWriter.fillBytes(out, defaultNorm, docID-upto); - upto = docID; - } - } -} - Index: src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- src/java/org/apache/lucene/index/IndexWriter.java (revision 664673) +++ src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -1715,7 +1715,7 @@ */ private synchronized boolean flushDocStores() throws IOException { - List files = docWriter.files(); + Collection files = docWriter.openFiles(); boolean useCompoundDocStore = false; @@ -1746,9 +1746,9 @@ try { CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, compoundFileName); - final int size = files.size(); - for(int i=0;i 0) { + // It has some norms + List l = (List) byField.get(perField.fieldInfo); + if (l == null) { + l = new ArrayList(); + byField.put(perField.fieldInfo, l); + } + l.add(perField); + } else + // Remove this field since we haven't seen it + // since the previous flush + fieldsIt.remove(); + } + } + + final String normsFileName = state.segmentName + "." 
+ IndexFileNames.NORMS_EXTENSION; + state.flushedFiles.add(normsFileName); + IndexOutput normsOut = state.directory.createOutput(normsFileName); + + // nocommit -- ensure exception here results in abort + + try { + normsOut.writeBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.length); + + final int numField = state.docWriter.fieldInfos.size(); + + int normCount = 0; + + for(int fieldNumber=0;fieldNumber 0) { + + assert uptos[0] < fields[0].docIDs.length : " uptos[0]=" + uptos[0] + " len=" + (fields[0].docIDs.length); + + int minLoc = 0; + int minDocID = fields[0].docIDs[uptos[0]]; + + for(int j=1;j
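Flushing norms merges the per-thread (docID, norm) pairs for a field in docID order and fills the default norm for every document that did not contain the field, so the output is one dense byte per document. A small sketch of that merge for a single field (a plain byte[] stands in for the norms IndexOutput, and DEFAULT_NORM is an illustrative stand-in for Similarity.encodeNorm(1.0f)):

    final class NormsMergeSketch {

      static final byte DEFAULT_NORM = 0x7C;   // illustrative default norm byte

      /**
       * Merges per-thread norm buffers for one field into a dense array of
       * numDocs bytes. Input i holds (docIDs[i][k], norms[i][k]) pairs in
       * increasing docID order.
       */
      static byte[] merge(int[][] docIDs, byte[][] norms, int numDocs) {
        byte[] out = new byte[numDocs];
        java.util.Arrays.fill(out, DEFAULT_NORM);        // docs missing the field keep the default

        int[] uptos = new int[docIDs.length];
        while (true) {
          // Pick the input with the smallest pending docID.
          int minThread = -1;
          int minDocID = Integer.MAX_VALUE;
          for (int i = 0; i < docIDs.length; i++) {
            if (uptos[i] < docIDs[i].length && docIDs[i][uptos[i]] < minDocID) {
              minDocID = docIDs[i][uptos[i]];
              minThread = i;
            }
          }
          if (minThread == -1)
            break;                                       // all inputs exhausted
          out[minDocID] = norms[minThread][uptos[minThread]++];
        }
        return out;
      }

      public static void main(String[] args) {
        int[][] docIDs = { {0, 3}, {1} };
        byte[][] norms = { {10, 30}, {20} };
        byte[] merged = merge(docIDs, norms, 5);         // docs 2 and 4 get DEFAULT_NORM
        System.out.println(merged.length);
      }
    }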