Index: src/test/org/apache/lucene/index/TestTermInfosReaderIndex.java
===================================================================
--- src/test/org/apache/lucene/index/TestTermInfosReaderIndex.java (revision 0)
+++ src/test/org/apache/lucene/index/TestTermInfosReaderIndex.java (revision 0)
@@ -0,0 +1,157 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Tests {@link TermInfosReaderIndex} against a small index built in a {@link RAMDirectory}.
+ */
+public class TestTermInfosReaderIndex extends LuceneTestCase {
+
+  private static final int NUMBER_OF_DOCUMENTS = 1000;
+  private static final int NUMBER_OF_FIELDS = 100;
+  private TermInfosReaderIndex index;
+  private Directory directory;
+  private SegmentTermEnum termEnum;
+  private int indexDivisor = 1;
+  private int termIndexInterval;
+  private int readBufferSize = 1024;
+  private IndexReader reader;
+  private List<Term> sampleTerms;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    directory = new RAMDirectory();
+    termIndexInterval = populate(directory);
+    index = new TermInfosReaderIndex();
+
+    String segment = "_0";
+
+    FieldInfos fieldInfos = new FieldInfos(directory, IndexFileNames.segmentFileName(segment, IndexFileNames.FIELD_INFOS_EXTENSION));
+    String segmentFileName = IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION);
+    long tiiFileLength = directory.fileLength(segmentFileName);
+    IndexInput input = directory.openInput(segmentFileName, readBufferSize);
+    termEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION), readBufferSize), fieldInfos, false);
+    int totalIndexInterval = termEnum.indexInterval * indexDivisor;
+
+    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
+    index.build(indexEnum, indexDivisor, tiiFileLength, totalIndexInterval);
+    indexEnum.close();
+    input.close();
+
+    reader = IndexReader.open(directory);
+    sampleTerms = sample(reader,1000);
+
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    termEnum.close();
+    reader.close();
+    directory.close();
+    super.tearDown(); // base-class teardown runs last, after this test's own resources are released
+  }
+
+  /** Seeking to index position 3 must yield the term reached by stepping 3 * termIndexInterval terms through the enum. */
+  public void testSeekEnum() throws CorruptIndexException, IOException {
+    int indexPosition = 3;
+    SegmentTermEnum clone = (SegmentTermEnum) termEnum.clone();
+    Term term = findTermThatWouldBeAtIndex(clone, indexPosition);
+    SegmentTermEnum enumerator = clone;
+    index.seekEnum(enumerator, indexPosition);
+    assertEquals(term, enumerator.term());
+    clone.close(); // close only after the last use: 'enumerator' is an alias of this clone
+  }
+
+  /** {@code index.compareTo} must agree with {@link Term#compareTo} for every term held by the index. */
+  public void testCompareTo() throws IOException {
+    Term term = new Term("field" + random.nextInt(NUMBER_OF_FIELDS) ,getText());
+    BytesRef termBytesRef = new BytesRef(term.text);
+    for (int i = 0; i < index.length(); i++) {
+      Term t = index.getTerm(i);
+      int compareTo = term.compareTo(t);
+      assertEquals(compareTo, index.compareTo(term, termBytesRef, i));
+    }
+  }
+
+  /** Every sampled term must be found by a {@link TermQuery} against the reader. */
+  public void testRandomSearchPerformance() throws CorruptIndexException, IOException {
+    IndexSearcher searcher = new IndexSearcher(reader);
+    for (Term t : sampleTerms) {
+      TermQuery query = new TermQuery(t);
+      TopDocs topDocs = searcher.search(query, 10);
+      assertTrue(topDocs.totalHits > 0);
+    }
+    searcher.close();
+  }
+
+  /** Collects up to {@code size} terms: the first {@code size} fill the list, each later term overwrites a random slot; the result is shuffled. */
+  private List<Term> sample(IndexReader reader, int size) throws IOException {
+    List<Term> sample = new ArrayList<Term>();
+    Random sampler = new Random(random.nextLong()); // derive from the inherited test random instead of an unseeded Random
+    TermEnum terms = reader.terms();
+    while (terms.next()) {
+      if (sample.size() >= size) {
+        int pos = sampler.nextInt(size);
+        sample.set(pos, terms.term());
+      } else {
+        sample.add(terms.term());
+      }
+    }
+    terms.close();
+    Collections.shuffle(sample, sampler);
+    return sample;
+  }
+
+  /** Advances the enum {@code index * termIndexInterval} times and returns the term it is then positioned on. */
+  private Term findTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index) throws IOException {
+    int termPosition = index * termIndexInterval;
+    for (int i = 0; i < termPosition; i++) {
+      if (!termEnum.next()) {
+        fail("Should not have run out of terms.");
+      }
+    }
+    return termEnum.term();
+  }
+
+  /** Indexes NUMBER_OF_DOCUMENTS documents with NUMBER_OF_FIELDS random-valued fields each, optimizes, and returns the config's term index interval. */
+  private int populate(Directory directory) throws CorruptIndexException, LockObtainFailedException, IOException {
+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new KeywordAnalyzer());
+    IndexWriter writer = new IndexWriter(directory, config);
+    for (int i = 0; i < NUMBER_OF_DOCUMENTS; i++) {
+      Document document = new Document();
+      for (int f = 0; f < NUMBER_OF_FIELDS; f++) {
+        document.add(new Field("field" + f,getText(),Store.NO,Index.NOT_ANALYZED_NO_NORMS));
+      }
+      writer.addDocument(document);
+    }
+    writer.optimize();
+    writer.close();
+    return config.getTermIndexInterval();
+  }
+
+  /** Returns a random long rendered as text in radix {@link Character#MAX_RADIX}. */
+  private String getText() {
+    return Long.toString(random.nextLong(),Character.MAX_RADIX);
+  }
+}
Index: src/java/org/apache/lucene/index/TermInfosReaderIndex.java
===================================================================
--- src/java/org/apache/lucene/index/TermInfosReaderIndex.java (revision 0)
+++ src/java/org/apache/lucene/index/TermInfosReaderIndex.java (revision 0)
@@ -0,0 +1,252 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This stores a monotonically increasing set of &lt;Term, TermInfo&gt; pairs in an
+ * index segment. Pairs are accessed either by Term or by ordinal position in the
+ * set. The Terms and TermInfo are actually serialized and stored into a byte
+ * array and pointers to the position of each are stored in an int array.
+ *
+ * @author Aaron McCurry amccurry@gmail.com
+ */
+class TermInfosReaderIndex {
+
+  private int[] indexToTerms;
+  private byte[] data;
+  private Term[] fields;
+  private int totalIndexInterval;
+  private boolean trim = true;
+  private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
+
+  /**
+   * Loads the segment information at segment load time.
+   * @param indexEnum the term enum.
+   * @param indexDivisor the index divisor.
+   * @param tiiFileLength the size of the tii file, used to approximate the size of the buffer.
+   * @param totalIndexInterval the total index interval.
+   */
+  void build(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
+    this.totalIndexInterval = totalIndexInterval;
+    int indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
+    indexToTerms = new int[indexSize];
+    // this is only an initial size, it will be GCed once the build is complete
+    int initialSize = (int) (tiiFileLength * 1.5);
+    GrowableByteArrayDataOutput output = new GrowableByteArrayDataOutput(new byte[initialSize]);
+
+    String currentField = null;
+    List<String> fieldStrs = new ArrayList<String>();
+    int fieldCounter = -1;
+    for (int i = 0; indexEnum.next(); i++) {
+      Term term = indexEnum.term();
+      if (currentField != term.field) { // NOTE(review): identity comparison; presumably relies on interned field names - confirm
+        currentField = term.field;
+        fieldStrs.add(currentField);
+        fieldCounter++;
+      }
+      TermInfo termInfo = indexEnum.termInfo();
+      indexToTerms[i] = output.getPosition();
+      output.writeVInt(fieldCounter);
+      output.writeString(term.text());
+      output.writeVInt(termInfo.docFreq);
+      output.writeVInt(termInfo.skipOffset);
+      output.writeVLong(termInfo.freqPointer);
+      output.writeVLong(termInfo.proxPointer);
+      output.writeVLong(indexEnum.indexPointer);
+      for (int j = 1; j < indexDivisor; j++)
+        if (!indexEnum.next())
+          break;
+    }
+    fields = new Term[fieldStrs.size()];
+    for (int i = 0; i < fields.length; i++) {
+      fields[i] = new Term(fieldStrs.get(i));
+    }
+    int size = output.getPosition();
+    byte[] tmpData = output.getBytes();
+    if (trim) { // keep an exact-size copy so the oversized build buffer is not retained
+      byte[] compactData = new byte[size];
+      System.arraycopy(tmpData, 0, compactData, 0, size);
+      this.data = compactData;
+    } else {
+      this.data = tmpData;
+    }
+  }
+
+  void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
+    ByteArrayDataInput input = new ByteArrayDataInput(data);
+    input.setPosition(indexToTerms[indexOffset]);
+
+    // read the term
+    int fieldId = input.readVInt();
+    Term field = fields[fieldId];
+    Term term = field.createTerm(input.readString());
+
+    // read the terminfo
+    TermInfo termInfo = new TermInfo();
+    termInfo.docFreq = input.readVInt();
+    termInfo.skipOffset = input.readVInt();
+    termInfo.freqPointer = input.readVLong();
+    termInfo.proxPointer = input.readVLong();
+
+    long pointer = input.readVLong();
+
+    // perform the seek
+    enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
+  }
+
+  /**
+   * Binary search for the given term; returns the matching offset, or the offset of the last entry below it (-1 if none).
+   * @param term the term to locate.
+   */
+  int getIndexOffset(Term term, BytesRef termBytesRef) {
+    int lo = 0;
+    int hi = indexToTerms.length - 1;
+    ByteArrayDataInput input = new ByteArrayDataInput(data);
+    BytesRef bytesRef = new BytesRef(data);
+    while (hi >= lo) {
+      int mid = (lo + hi) >>> 1;
+      int delta = compareTo(term, termBytesRef, mid, input, bytesRef);
+      if (delta < 0)
+        hi = mid - 1;
+      else if (delta > 0)
+        lo = mid + 1;
+      else
+        return mid;
+    }
+    return hi;
+  }
+
+  /**
+   * Gets the term at the given position.
+   * @param termIndex the position to read the term from the index.
+   * @return the term.
+   * @throws IOException
+   */
+  Term getTerm(int termIndex) throws IOException {
+    ByteArrayDataInput input = new ByteArrayDataInput(data);
+    input.setPosition(indexToTerms[termIndex]);
+
+    // read the term
+    int fieldId = input.readVInt();
+    Term field = fields[fieldId];
+    return field.createTerm(input.readString());
+  }
+
+  /**
+   * Returns the number of terms.
+   * @return int.
+   */
+  int length() {
+    return indexToTerms.length;
+  }
+
+  /**
+   * Compares the given term against the term in the index specified by the term index.
+   * @param term the given term.
+   * @param termIndex the index of the term to compare.
+   * @return int.
+   */
+  int compareTo(Term term, BytesRef termBytesRef, int termIndex) {
+    return compareTo(term, termBytesRef, termIndex, new ByteArrayDataInput(data), new BytesRef(data));
+  }
+
+  /**
+   * Compare the fields of the terms first, and if not equals return from compare. If equal compare terms.
+   * @param term the term to compare.
+   * @param termIndex the position of the term in the input to compare
+   * @param input the input buffer.
+   * @return int.
+   */
+  private int compareTo(Term term, BytesRef termBytesRef, int termIndex, ByteArrayDataInput input, BytesRef inputBytesRef) {
+    // if term field does not equal mid's field index, then compare fields
+    // else if they are equal, compare term's string values...
+    int c = compareField(term, termIndex, input);
+    if (c == 0) {
+      inputBytesRef.length = input.readVInt();
+      inputBytesRef.offset = input.getPosition();
+      return comparator.compare(termBytesRef, inputBytesRef);
+    }
+    return c;
+  }
+
+  /**
+   * Compares the fields before checking the text of the terms.
+   * @param term the given term.
+   * @param termIndex the term that exists in the data block.
+   * @param input the data block.
+   * @return int.
+   */
+  private int compareField(Term term, int termIndex, ByteArrayDataInput input) {
+    input.setPosition(indexToTerms[termIndex]);
+    return term.field.compareTo(fields[input.readVInt()].field);
+  }
+
+  /**
+   * A growable byte array data output; the backing array is enlarged to fit each write before it happens.
+   */
+  static class GrowableByteArrayDataOutput extends DataOutput {
+    private byte[] bytes;
+    private int pos;
+    private int limit;
+
+    public GrowableByteArrayDataOutput(byte[] bytes) {
+      this.bytes = bytes;
+      this.limit = bytes.length;
+    }
+
+    public byte[] getBytes() {
+      return this.bytes;
+    }
+
+    public int getPosition() {
+      return pos;
+    }
+
+    @Override
+    public void writeByte(byte b) {
+      if (pos >= limit) {
+        growArray(pos + 1);
+      }
+      bytes[pos++] = b;
+    }
+
+    @Override
+    public void writeBytes(byte[] b, int offset, int length) {
+      if (pos + length > limit) {
+        growArray(pos + length); // one default-sized growth step may be smaller than 'length'
+      }
+      System.arraycopy(b, offset, bytes, pos, length);
+      pos += length;
+    }
+
+    private void growArray(int minSize) { // grows the backing array to hold at least minSize bytes
+      bytes = ArrayUtil.grow(bytes, minSize);
+      limit = bytes.length;
+    }
+  }
+}
\ No newline at end of file
Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 1173455) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.DoubleBarrelLRUCache; import org.apache.lucene.util.CloseableThreadLocal; @@ -37,9 +38,8 @@ private final SegmentTermEnum origEnum; private final long size; - private final Term[] indexTerms; - private final TermInfo[] indexInfos; - private final long[] indexPointers; + private TermInfosReaderIndex index; + private int indexLength; private final int totalIndexInterval; @@ -111,32 +111,17 @@ totalIndexInterval = origEnum.indexInterval * indexDivisor; final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION), readBufferSize), fieldInfos, true); - try { - int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index - - indexTerms = new Term[indexSize]; - 
indexInfos = new TermInfo[indexSize]; - indexPointers = new long[indexSize]; - - for (int i = 0; indexEnum.next(); i++) { - indexTerms[i] = indexEnum.term(); - indexInfos[i] = indexEnum.termInfo(); - indexPointers[i] = indexEnum.indexPointer; - - for (int j = 1; j < indexDivisor; j++) - if (!indexEnum.next()) - break; - } + index = new TermInfosReaderIndex(); + index.build(indexEnum, indexDivisor, dir.fileLength(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION)), totalIndexInterval); + indexLength = index.length(); } finally { indexEnum.close(); } } else { // Do not load terms index: totalIndexInterval = -1; - indexTerms = null; - indexInfos = null; - indexPointers = null; + index = null; } success = true; } finally { @@ -180,38 +165,14 @@ return resources; } - - /** Returns the offset of the greatest index entry which is less than or equal to term.*/ - private final int getIndexOffset(Term term) { - int lo = 0; // binary search indexTerms[] - int hi = indexTerms.length - 1; - - while (hi >= lo) { - int mid = (lo + hi) >>> 1; - int delta = term.compareTo(indexTerms[mid]); - if (delta < 0) - hi = mid - 1; - else if (delta > 0) - lo = mid + 1; - else - return mid; - } - return hi; - } - - private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { - enumerator.seek(indexPointers[indexOffset], - ((long) indexOffset * totalIndexInterval) - 1, - indexTerms[indexOffset], indexInfos[indexOffset]); - } - /** Returns the TermInfo for a Term in the set, or null. */ TermInfo get(Term term) throws IOException { - return get(term, false); + BytesRef termBytesRef = new BytesRef(term.text); + return get(term, false, termBytesRef); } /** Returns the TermInfo for a Term in the set, or null. 
*/ - private TermInfo get(Term term, boolean mustSeekEnum) throws IOException { + private TermInfo get(Term term, boolean mustSeekEnum, BytesRef termBytesRef) throws IOException { if (size == 0) return null; ensureIndexIsRead(); @@ -231,8 +192,8 @@ && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) || term.compareTo(enumerator.term()) >= 0)) { int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; - if (indexTerms.length == enumOffset // but before end of block - || term.compareTo(indexTerms[enumOffset]) < 0) { + if (indexLength == enumOffset // but before end of block + || index.compareTo(term,termBytesRef,enumOffset) < 0) { // no need to seek final TermInfo ti; @@ -267,10 +228,10 @@ indexPos = (int) (tiOrd.termOrd / totalIndexInterval); } else { // Must do binary search: - indexPos = getIndexOffset(term); + indexPos = index.getIndexOffset(term,termBytesRef); } - seekEnum(enumerator, indexPos); + index.seekEnum(enumerator, indexPos); enumerator.scanTo(term); final TermInfo ti; if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { @@ -313,7 +274,7 @@ } private void ensureIndexIsRead() { - if (indexTerms == null) { + if (index == null) { throw new IllegalStateException("terms index was not loaded when this reader was created"); } } @@ -323,10 +284,11 @@ if (size == 0) return -1; ensureIndexIsRead(); - int indexOffset = getIndexOffset(term); + BytesRef termBytesRef = new BytesRef(term.text); + int indexOffset = index.getIndexOffset(term,termBytesRef); SegmentTermEnum enumerator = getThreadResources().termEnum; - seekEnum(enumerator, indexOffset); + index.seekEnum(enumerator, indexOffset); while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} @@ -343,7 +305,8 @@ /** Returns an enumeration of terms starting at or after the named term. 
*/ public SegmentTermEnum terms(Term term) throws IOException { - get(term, true); + BytesRef termBytesRef = new BytesRef(term.text); + get(term, true, termBytesRef); return (SegmentTermEnum)getThreadResources().termEnum.clone(); } } Index: src/java/org/apache/lucene/store/ByteArrayDataInput.java =================================================================== --- src/java/org/apache/lucene/store/ByteArrayDataInput.java (revision 1173455) +++ src/java/org/apache/lucene/store/ByteArrayDataInput.java (working copy) @@ -46,6 +46,10 @@ public int getPosition() { return pos; } + + public void setPosition(int pos) { + this.pos = pos; + } public void reset(byte[] bytes, int offset, int len) { this.bytes = bytes;