Property changes on: . ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/branches/lucene_2_9:r825998 Reverse-merged /lucene/java/trunk:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458:r824912-931101 Property changes on: backwards\src ___________________________________________________________________ Added: svn:mergeinfo Merged /lucene/java/branches/lucene_2_9_back_compat_tests/src:r818601-821336 Merged /lucene/java/branches/lucene_3_0/src:r880754,880793,880823,881216,881317,881376,881473,881549,881820,882467,882890,883076,883080 Merged /lucene/java/trunk/src:r881213,881315,881466,881819,882374,882807,882888,882977,883074-883075,885214 Merged /lucene/java/branches/lucene_2_4/src:r748824 Merged /lucene/java/trunk/backwards/src:r924483-924731,924781,925176-925462 Merged /lucene/java/branches/flex_1458/backwards/src:r824912-931101 Merged /lucene/java/branches/lucene_3_0_back_compat_tests/src:r880869-912395 Merged /lucene/java/branches/lucene_2_9/src:r817269-818600,825998,829134,829881,831036 Index: backwards/src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- backwards/src/java/org/apache/lucene/index/SegmentInfo.java (revision 931099) +++ backwards/src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BitVector; +import org.apache.lucene.index.codecs.Codec; import java.io.IOException; import java.util.List; import java.util.Map; @@ -129,6 +130,12 @@ assert docStoreOffset == -1 || docStoreSegment != null: "dso=" + docStoreOffset + " dss=" + docStoreSegment + " docCount=" + docCount; } + // stub + public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, + int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx, + Codec codec) { + } + /** * Copy everything from src SegmentInfo into our instance. 
*/ Index: backwards/src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- backwards/src/java/org/apache/lucene/index/SegmentMerger.java (revision 931099) +++ backwards/src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -29,6 +29,8 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; /** * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}, @@ -98,7 +100,12 @@ } termIndexInterval = writer.getTermIndexInterval(); } - + + // stub + SegmentMerger(Directory dir, int termIndexInterval, String name, MergePolicy.OneMerge merge, CodecProvider codecs) { + checkAbort = null; + } + boolean hasProx() { return fieldInfos.hasProx(); } @@ -171,6 +178,11 @@ } } + // stub + final List createCompoundFile(String fileName, SegmentInfo info) { + return null; + } + final List createCompoundFile(String fileName) throws IOException { CompoundFileWriter cfsWriter = @@ -553,6 +565,11 @@ } } + // stub + Codec getCodec() { + return null; + } + private SegmentMergeQueue queue = null; private final void mergeTerms() throws CorruptIndexException, IOException { Index: backwards/src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- backwards/src/java/org/apache/lucene/index/SegmentReader.java (revision 931099) +++ backwards/src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -37,6 +37,7 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BitVector; import org.apache.lucene.util.CloseableThreadLocal; +import org.apache.lucene.index.codecs.CodecProvider; /** @version $Id */ /** @@ -594,6 +595,17 @@ return instance; } + // stub + public static SegmentReader get(boolean readOnly, + Directory dir, + SegmentInfo si, + int readBufferSize, + boolean doOpenStores, + int termInfosIndexDivisor, + CodecProvider codecs) { + return null; + } + void openDocStores() throws IOException { core.openDocStores(si); } Property changes on: backwards\src\java\org\apache\lucene\search\MultiTermQueryWrapperFilter.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/backwards/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/backwards/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:r824912-931101 Merged /lucene/java/branches/lucene_3_0_back_compat_tests/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:r880869-912395 Index: backwards/src/java/org/apache/lucene/store/IndexInput.java =================================================================== --- backwards/src/java/org/apache/lucene/store/IndexInput.java (revision 931099) +++ backwards/src/java/org/apache/lucene/store/IndexInput.java (working copy) @@ -17,180 +17,14 @@ * limitations under the License. */ -import java.io.IOException; import java.io.Closeable; -import java.util.Map; -import java.util.HashMap; +import java.io.IOException; /** Abstract base class for input from a file in a {@link Directory}. A * random-access input stream. Used for all Lucene index input operations. 
* @see Directory */ -public abstract class IndexInput implements Cloneable,Closeable { - private byte[] bytes; // used by readString() - private char[] chars; // used by readModifiedUTF8String() - private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format - - /** Reads and returns a single byte. - * @see IndexOutput#writeByte(byte) - */ - public abstract byte readByte() throws IOException; - - /** Reads a specified number of bytes into an array at the specified offset. - * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @see IndexOutput#writeBytes(byte[],int) - */ - public abstract void readBytes(byte[] b, int offset, int len) - throws IOException; - - /** Reads a specified number of bytes into an array at the - * specified offset with control over whether the read - * should be buffered (callers who have their own buffer - * should pass in "false" for useBuffer). Currently only - * {@link BufferedIndexInput} respects this parameter. - * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @param useBuffer set to false if the caller will handle - * buffering. - * @see IndexOutput#writeBytes(byte[],int) - */ - public void readBytes(byte[] b, int offset, int len, boolean useBuffer) - throws IOException - { - // Default to ignoring useBuffer entirely - readBytes(b, offset, len); - } - - /** Reads four bytes and returns an int. - * @see IndexOutput#writeInt(int) - */ - public int readInt() throws IOException { - return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) - | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); - } - - /** Reads an int stored in variable-length format. Reads between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexOutput#writeVInt(int) - */ - public int readVInt() throws IOException { - byte b = readByte(); - int i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7F) << shift; - } - return i; - } - - /** Reads eight bytes and returns a long. - * @see IndexOutput#writeLong(long) - */ - public long readLong() throws IOException { - return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); - } - - /** Reads a long stored in variable-length format. Reads between one and - * nine bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. */ - public long readVLong() throws IOException { - byte b = readByte(); - long i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7FL) << shift; - } - return i; - } - - /** Call this if readString should read characters stored - * in the old modified UTF8 format (length in java chars - * and java's modified UTF8 encoding). This is used for - * indices written pre-2.4 See LUCENE-510 for details. */ - public void setModifiedUTF8StringsMode() { - preUTF8Strings = true; - } - - /** Reads a string. 
- * @see IndexOutput#writeString(String) - */ - public String readString() throws IOException { - if (preUTF8Strings) - return readModifiedUTF8String(); - int length = readVInt(); - if (bytes == null || length > bytes.length) - bytes = new byte[(int) (length*1.25)]; - readBytes(bytes, 0, length); - return new String(bytes, 0, length, "UTF-8"); - } - - private String readModifiedUTF8String() throws IOException { - int length = readVInt(); - if (chars == null || length > chars.length) - chars = new char[length]; - readChars(chars, 0, length); - return new String(chars, 0, length); - } - - /** Reads Lucene's old "modified UTF-8" encoded - * characters into an array. - * @param buffer the array to read characters into - * @param start the offset in the array to start storing characters - * @param length the number of characters to read - * @see IndexOutput#writeChars(String,int,int) - * @deprecated -- please use readString or readBytes - * instead, and construct the string - * from those utf8 bytes - */ - public void readChars(char[] buffer, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - byte b = readByte(); - if ((b & 0x80) == 0) - buffer[i] = (char)(b & 0x7F); - else if ((b & 0xE0) != 0xE0) { - buffer[i] = (char)(((b & 0x1F) << 6) - | (readByte() & 0x3F)); - } else - buffer[i] = (char)(((b & 0x0F) << 12) - | ((readByte() & 0x3F) << 6) - | (readByte() & 0x3F)); - } - } - - /** - * Expert - * - * Similar to {@link #readChars(char[], int, int)} but does not do any conversion operations on the bytes it is reading in. It still - * has to invoke {@link #readByte()} just as {@link #readChars(char[], int, int)} does, but it does not need a buffer to store anything - * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine - * how many more bytes to read - * @param length The number of chars to read - * @deprecated this method operates on old "modified utf8" encoded - * strings - */ - public void skipChars(int length) throws IOException{ - for (int i = 0; i < length; i++) { - byte b = readByte(); - if ((b & 0x80) == 0){ - //do nothing, we only need one byte - } - else if ((b & 0xE0) != 0xE0) { - readByte();//read an additional byte - } else{ - //read two additional bytes. - readByte(); - readByte(); - } - } - } - - +public abstract class IndexInput extends DataInput implements Cloneable,Closeable { /** Closes the stream to further operations. */ public abstract void close() throws IOException; @@ -207,38 +41,4 @@ /** The number of bytes in the file. */ public abstract long length(); - - /** Returns a clone of this stream. - * - *
Clones of a stream access the same data, and are positioned at the same - * point as the stream they were cloned from. - * - *
Expert: Subclasses must ensure that clones may be positioned at - * different points in the input from each other and from the stream they - * were cloned from. - */ - @Override - public Object clone() { - IndexInput clone = null; - try { - clone = (IndexInput)super.clone(); - } catch (CloneNotSupportedException e) {} - - clone.bytes = null; - clone.chars = null; - - return clone; - } - - public Map readStringStringMap() throws IOException { - final Map map = new HashMap(); - final int count = readInt(); - for(int i=0;i> 24)); - writeByte((byte)(i >> 16)); - writeByte((byte)(i >> 8)); - writeByte((byte) i); - } - - /** Writes an int in a variable-length format. Writes between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVInt() - */ - public void writeVInt(int i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a long as eight bytes. - * @see IndexInput#readLong() - */ - public void writeLong(long i) throws IOException { - writeInt((int) (i >> 32)); - writeInt((int) i); - } - - /** Writes an long in a variable-length format. Writes between one and five - * bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVLong() - */ - public void writeVLong(long i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a string. - * @see IndexInput#readString() - */ - public void writeString(String s) throws IOException { - UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); - writeVInt(utf8Result.length); - writeBytes(utf8Result.result, 0, utf8Result.length); - } - - /** Writes a sub sequence of characters from s as the old - * format (modified UTF-8 encoded bytes). - * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes - * instead or use {@link #writeString} - */ - public void writeChars(String s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = (int)s.charAt(i); - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - /** Writes a sub sequence of characters from char[] as - * the old format (modified UTF-8 encoded bytes). 
- * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} - */ - public void writeChars(char[] s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = (int)s[i]; - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - private static int COPY_BUFFER_SIZE = 16384; - private byte[] copyBuffer; - - /** Copy numBytes bytes from input to ourself. */ - public void copyBytes(IndexInput input, long numBytes) throws IOException { - assert numBytes >= 0: "numBytes=" + numBytes; - long left = numBytes; - if (copyBuffer == null) - copyBuffer = new byte[COPY_BUFFER_SIZE]; - while(left > 0) { - final int toCopy; - if (left > COPY_BUFFER_SIZE) - toCopy = COPY_BUFFER_SIZE; - else - toCopy = (int) left; - input.readBytes(copyBuffer, 0, toCopy); - writeBytes(copyBuffer, 0, toCopy); - left -= toCopy; - } - } - /** Forces any buffered output to be written. */ public abstract void flush() throws IOException; @@ -208,17 +57,5 @@ * undefined. Otherwise the file is truncated. * @param length file length */ - public void setLength(long length) throws IOException {}; - - public void writeStringStringMap(Map map) throws IOException { - if (map == null) { - writeInt(0); - } else { - writeInt(map.size()); - for(final Map.Entry entry: map.entrySet()) { - writeString(entry.getKey()); - writeString(entry.getValue()); - } - } - } + public void setLength(long length) throws IOException {} } Index: backwards/src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- backwards/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 931099) +++ backwards/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -106,6 +106,10 @@ } } + // stubs for tests only + public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result) {} + public static void UTF16toUTF8(CharSequence s, int offset, int length, BytesRef result) {} + /** Encode characters from a char[] source, starting at * offset and stopping when the character 0xffff is seen. * Returns the number of bytes written to bytesOut. */ @@ -223,7 +227,7 @@ /** Encode characters from this String, starting at offset * for length characters. Returns the number of bytes * written to bytesOut. 
*/ - public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) { + public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, UTF8Result result) { final int end = offset + length; byte[] out = result.result; Property changes on: backwards\src\test\org\apache\lucene\analysis\TestISOLatin1AccentFilter.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/backwards/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/backwards/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:r824912-931101 Merged /lucene/java/branches/lucene_3_0_back_compat_tests/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:r880869-912395 Index: backwards/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java =================================================================== --- backwards/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (revision 931099) +++ backwards/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (working copy) @@ -1,73 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.NumericUtils; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; - -public class TestNumericTokenStream extends BaseTokenStreamTestCase { - - static final long lvalue = 4573245871874382L; - static final int ivalue = 123456; - - public void testLongStream() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); - // use getAttribute to test if attributes really exist, if not an IAE will be throwed - final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); - final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class); - for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertTrue("New token is available", stream.incrementToken()); - assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), termAtt.term()); - assertEquals("Type correct", (shift == 0) ? 
NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); - } - assertFalse("No more tokens available", stream.incrementToken()); - } - - public void testIntStream() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); - // use getAttribute to test if attributes really exist, if not an IAE will be throwed - final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); - final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class); - for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertTrue("New token is available", stream.incrementToken()); - assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), termAtt.term()); - assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); - } - assertFalse("No more tokens available", stream.incrementToken()); - } - - public void testNotInitialized() throws Exception { - final NumericTokenStream stream=new NumericTokenStream(); - - try { - stream.reset(); - fail("reset() should not succeed."); - } catch (IllegalStateException e) { - // pass - } - - try { - stream.incrementToken(); - fail("incrementToken() should not succeed."); - } catch (IllegalStateException e) { - // pass - } - } - -} Index: backwards/src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java =================================================================== --- backwards/src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java (revision 931099) +++ backwards/src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java (working copy) @@ -107,10 +107,10 @@ char[] b = {'a', 'l', 'o', 'h', 'a'}; TermAttributeImpl t = new TermAttributeImpl(); t.setTermBuffer(b, 0, 5); - assertEquals("term=aloha", t.toString()); + assertEquals("aloha", t.toString()); t.setTermBuffer("hi there"); - assertEquals("term=hi there", t.toString()); + assertEquals("hi there", t.toString()); } public void testMixedStringArray() throws Exception { Property changes on: backwards\src\test\org\apache\lucene\document\TestDateTools.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/backwards/src/test/org/apache/lucene/document/TestDateTools.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/backwards/src/test/org/apache/lucene/document/TestDateTools.java:r824912-931101 Merged /lucene/java/branches/lucene_3_0_back_compat_tests/src/test/org/apache/lucene/document/TestDateTools.java:r880869-912395 Property changes on: backwards\src\test\org\apache\lucene\document\TestNumberTools.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/backwards/src/test/org/apache/lucene/document/TestNumberTools.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/backwards/src/test/org/apache/lucene/document/TestNumberTools.java:r824912-931101 Merged /lucene/java/branches/lucene_3_0_back_compat_tests/src/test/org/apache/lucene/document/TestNumberTools.java:r880869-912395 Property changes on: backwards\src\test\org\apache\lucene\index\TestBackwardsCompatibility.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged 
/lucene/java/trunk/backwards/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/backwards/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:r824912-931101 Merged /lucene/java/branches/lucene_3_0_back_compat_tests/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:r880869-912395 Index: backwards/src/test/org/apache/lucene/index/TestDoc.java =================================================================== --- backwards/src/test/org/apache/lucene/index/TestDoc.java (revision 931099) +++ backwards/src/test/org/apache/lucene/index/TestDoc.java (working copy) @@ -35,6 +35,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.index.codecs.CodecProvider; /** JUnit adaptation of an older test case DocTest. */ @@ -180,20 +181,24 @@ SegmentReader r1 = SegmentReader.get(true, si1, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); SegmentReader r2 = SegmentReader.get(true, si2, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); - SegmentMerger merger = new SegmentMerger(si1.dir, merged); + SegmentMerger merger = new SegmentMerger(si1.dir, IndexWriter.DEFAULT_TERM_INDEX_INTERVAL, merged, null, CodecProvider.getDefault()); merger.add(r1); merger.add(r2); merger.merge(); merger.closeReaders(); + final SegmentInfo info = new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, + useCompoundFile, true, -1, null, false, merger.hasProx(), + merger.getCodec()); + if (useCompoundFile) { - List filesToDelete = merger.createCompoundFile(merged + ".cfs"); + List filesToDelete = merger.createCompoundFile(merged + ".cfs", info); for (Iterator iter = filesToDelete.iterator(); iter.hasNext();) si1.dir.deleteFile((String) iter.next()); } - return new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, useCompoundFile, true); + return info; } Index: backwards/src/test/org/apache/lucene/index/TestIndexReader.java =================================================================== --- backwards/src/test/org/apache/lucene/index/TestIndexReader.java (revision 931099) +++ backwards/src/test/org/apache/lucene/index/TestIndexReader.java (working copy) @@ -986,30 +986,8 @@ // new IndexFileDeleter, have it delete // unreferenced files, then verify that in fact // no files were deleted: - String[] startFiles = dir.listAll(); - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); - String[] endFiles = dir.listAll(); + TestIndexWriter.assertNoUnreferencedFiles(dir, "reader.close() failed to delete unreferenced files"); - Arrays.sort(startFiles); - Arrays.sort(endFiles); - - //for(int i=0;i= 0); } Index: backwards/src/test/org/apache/lucene/index/TestSegmentTermDocs.java =================================================================== --- backwards/src/test/org/apache/lucene/index/TestSegmentTermDocs.java (revision 931099) +++ backwards/src/test/org/apache/lucene/index/TestSegmentTermDocs.java (working copy) @@ -56,14 +56,13 @@ SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); assertEquals(indexDivisor, reader.getTermInfosIndexDivisor()); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); - if (segTermDocs.next() == true) - { - int docId = 
segTermDocs.doc(); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); + if (termDocs.next() == true) { + int docId = termDocs.doc(); assertTrue(docId == 0); - int freq = segTermDocs.freq(); + int freq = termDocs.freq(); assertTrue(freq == 3); } reader.close(); @@ -78,20 +77,20 @@ //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term("textField2", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("textField2", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term("junk", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("junk", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } } Index: backwards/src/test/org/apache/lucene/index/TestSegmentTermEnum.java =================================================================== --- backwards/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (revision 931099) +++ backwards/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (working copy) @@ -61,23 +61,6 @@ verifyDocFreq(); } - public void testPrevTermAtEnd() throws IOException - { - Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); - addDoc(writer, "aaa bbb"); - writer.close(); - SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); - SegmentTermEnum termEnum = (SegmentTermEnum) reader.terms(); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.term().text()); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.prev().text()); - assertEquals("bbb", termEnum.term().text()); - assertFalse(termEnum.next()); - assertEquals("bbb", termEnum.prev().text()); - } - private void verifyDocFreq() throws IOException { Index: backwards/src/test/org/apache/lucene/index/TestStressIndexing2.java =================================================================== --- backwards/src/test/org/apache/lucene/index/TestStressIndexing2.java (revision 931099) +++ backwards/src/test/org/apache/lucene/index/TestStressIndexing2.java (working copy) @@ -352,7 +352,7 @@ if (!termEnum1.next()) break; } - // iterate until we get some docs + // iterate until we get some docs int len2; for(;;) { len2=0; @@ -369,12 +369,12 @@ if (!termEnum2.next()) break; } + assertEquals(len1, len2); + if (len1==0) break; // no more terms + if (!hasDeletes) assertEquals(termEnum1.docFreq(), termEnum2.docFreq()); - assertEquals(len1, len2); - if (len1==0) break; // no more terms - assertEquals(term1, term2); // sort info2 to get it into ascending docid Index: backwards/src/test/org/apache/lucene/search/CheckHits.java =================================================================== --- backwards/src/test/org/apache/lucene/search/CheckHits.java (revision 931099) +++ 
backwards/src/test/org/apache/lucene/search/CheckHits.java (working copy) @@ -33,7 +33,7 @@ * different order of operations from the actual scoring method ... * this allows for a small amount of variation */ - public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.00005f; + public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.0002f; /** * Tests that all documents up to maxDoc which are *not* in the Index: backwards/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java =================================================================== --- backwards/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java (revision 931099) +++ backwards/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java (working copy) @@ -65,7 +65,7 @@ if (originalSet.isCacheable()) { assertEquals("Cached DocIdSet must be of same class like uncached, if cacheable", originalSet.getClass(), cachedSet.getClass()); } else { - assertTrue("Cached DocIdSet must be an OpenBitSet if the original one was not cacheable", cachedSet instanceof OpenBitSetDISI); + assertTrue("Cached DocIdSet must be an OpenBitSet if the original one was not cacheable", cachedSet instanceof OpenBitSetDISI || cachedSet == DocIdSet.EMPTY_DOCIDSET); } } Index: backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java =================================================================== --- backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 931099) +++ backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (working copy) @@ -230,6 +230,8 @@ testRightOpenRange(2); } + /* TESTs disabled, because incompatible API change in 3.1/flex: + private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="field"+precisionStep; @@ -298,6 +300,8 @@ testRandomTrieAndClassicRangeQuery(Integer.MAX_VALUE); } + */ + private void testRangeSplit(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="ascfield"+precisionStep; @@ -443,37 +447,39 @@ assertFalse(q2.equals(q1)); } - private void testEnum(int lower, int upper) throws Exception { - NumericRangeQuery q = NumericRangeQuery.newIntRange("field4", 4, lower, upper, true, true); - FilteredTermEnum termEnum = q.getEnum(searcher.getIndexReader()); - try { - int count = 0; - do { - final Term t = termEnum.term(); - if (t != null) { - final int val = NumericUtils.prefixCodedToInt(t.text()); - assertTrue("value not in bounds", val >= lower && val <= upper); - count++; - } else break; - } while (termEnum.next()); - assertFalse(termEnum.next()); - System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + "] contained " + count + " terms."); - } finally { - termEnum.close(); - } - } +// Removed for now - NumericRangeQuery does not currently implement getEnum - public void testEnum() throws Exception { - int count=3000; - int lower=(distance*3/2)+startOffset, upper=lower + count*distance + (distance/3); - // test enum with values - testEnum(lower, upper); - // test empty enum - testEnum(upper, lower); - // test empty enum outside of bounds - lower = distance*noDocs+startOffset; - upper = 2 * lower; - testEnum(lower, upper); - } +// private void testEnum(int lower, int upper) throws Exception { +// NumericRangeQuery q = NumericRangeQuery.newIntRange("field4", 4, lower, upper, true, true); +// FilteredTermEnum termEnum = q.getEnum(searcher.getIndexReader()); +// try { +// int count = 0; +// do { +// final Term t = termEnum.term(); +// 
if (t != null) { +// final int val = NumericUtils.prefixCodedToInt(t.text()); +// assertTrue("value not in bounds", val >= lower && val <= upper); +// count++; +// } else break; +// } while (termEnum.next()); +// assertFalse(termEnum.next()); +// System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + "] contained " + count + " terms."); +// } finally { +// termEnum.close(); +// } +// } +// +// public void testEnum() throws Exception { +// int count=3000; +// int lower=(distance*3/2)+startOffset, upper=lower + count*distance + (distance/3); +// // test enum with values +// testEnum(lower, upper); +// // test empty enum +// testEnum(upper, lower); +// // test empty enum outside of bounds +// lower = distance*noDocs+startOffset; +// upper = 2 * lower; +// testEnum(lower, upper); +// } } Index: backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java =================================================================== --- backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (revision 931099) +++ backwards/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (working copy) @@ -245,6 +245,8 @@ testRightOpenRange(2); } + /* TESTs disabled, because incompatible API change in 3.1/flex: + private void testRandomTrieAndClassicRangeQuery(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="field"+precisionStep; @@ -317,6 +319,8 @@ testRandomTrieAndClassicRangeQuery(Integer.MAX_VALUE); } + */ + private void testRangeSplit(int precisionStep) throws Exception { final Random rnd=newRandom(); String field="ascfield"+precisionStep; Index: backwards/src/test/org/apache/lucene/search/TestSort.java =================================================================== --- backwards/src/test/org/apache/lucene/search/TestSort.java (revision 931099) +++ backwards/src/test/org/apache/lucene/search/TestSort.java (working copy) @@ -35,6 +35,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.LockObtainFailedException; @@ -332,20 +333,28 @@ FieldCache fc = FieldCache.DEFAULT; - sort.setSort (new SortField ("parser", new FieldCache.IntParser(){ - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + sort.setSort ( new SortField ("parser", new FieldCache.IntParser(){ + public final int parseInt(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final int parseInt(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 123456; + } + }), SortField.FIELD_DOC); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " IntParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.FloatParser(){ - public final float parseFloat(final String val) { - return (float) Math.sqrt( val.charAt(0) ); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.FloatParser(){ + public final float parseFloat(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final float parseFloat(final BytesRef term) { + return (float) Math.sqrt( term.bytes[term.offset] ); + } + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " FloatParser"); fc.purgeAllCaches(); @@ -354,34 
+363,49 @@ public final long parseLong(final String val) { return (val.charAt(0)-'A') * 1234567890L; } - }), SortField.FIELD_DOC ); + public final long parseLong(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 1234567890L; + } + }), SortField.FIELD_DOC); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " LongParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.DoubleParser(){ - public final double parseDouble(final String val) { - return Math.pow( val.charAt(0), (val.charAt(0)-'A') ); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.DoubleParser(){ + public final double parseDouble(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final double parseDouble(final BytesRef term) { + return Math.pow( term.bytes[term.offset], (term.bytes[term.offset]-'A') ); + } + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " DoubleParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.ByteParser(){ - public final byte parseByte(final String val) { - return (byte) (val.charAt(0)-'A'); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.ByteParser(){ + public final byte parseByte(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final byte parseByte(final BytesRef term) { + return (byte) (term.bytes[term.offset]-'A'); + } + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " ByteParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.ShortParser(){ - public final short parseShort(final String val) { - return (short) (val.charAt(0)-'A'); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.ShortParser(){ + public final short parseShort(final String term) { + // dummy + return 0; } - }), SortField.FIELD_DOC ); + public final short parseShort(final BytesRef term) { + return (short) (term.bytes[term.offset]-'A'); + } + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " ShortParser"); fc.purgeAllCaches(); @@ -439,9 +463,13 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { docValues = FieldCache.DEFAULT.getInts(reader, "parser", new FieldCache.IntParser() { - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + public final int parseInt(final String term) { + // dummy + return 0; } + public final int parseInt(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 123456; + } }); } Index: backwards/src/test/org/apache/lucene/search/TestTermScorer.java =================================================================== --- backwards/src/test/org/apache/lucene/search/TestTermScorer.java (revision 931099) +++ backwards/src/test/org/apache/lucene/search/TestTermScorer.java (working copy) @@ -72,9 +72,9 @@ Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); + //we have 2 documents with the term all in them, one document for all the other values final List docs = new ArrayList(); //must call next first @@ -138,9 +138,9 @@ Weight weight = termQuery.weight(indexSearcher); 
- TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); + assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertTrue("score is not correct", ts.score() == 1.6931472f); assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); @@ -155,9 +155,9 @@ Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); + assertTrue("Didn't skip", ts.advance(3) != DocIdSetIterator.NO_MORE_DOCS); //The next doc should be doc 5 assertTrue("doc should be number 5", ts.docID() == 5); Index: backwards/src/test/org/apache/lucene/search/TestWildcard.java =================================================================== --- backwards/src/test/org/apache/lucene/search/TestWildcard.java (revision 931099) +++ backwards/src/test/org/apache/lucene/search/TestWildcard.java (working copy) @@ -114,6 +114,7 @@ * rewritten to a single PrefixQuery. The boost and rewriteMethod should be * preserved. */ + /* disable because rewrites changed in flex/trunk public void testPrefixTerm() throws IOException { RAMDirectory indexStore = getIndexStore("field", new String[]{"prefix", "prefixx"}); IndexSearcher searcher = new IndexSearcher(indexStore, true); @@ -145,7 +146,7 @@ expected.setRewriteMethod(wq.getRewriteMethod()); expected.setBoost(wq.getBoost()); assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - } + }*/ /** * Tests Wildcard queries with an asterisk. Index: backwards/src/test/org/apache/lucene/util/TestAttributeSource.java =================================================================== --- backwards/src/test/org/apache/lucene/util/TestAttributeSource.java (revision 931099) +++ backwards/src/test/org/apache/lucene/util/TestAttributeSource.java (working copy) @@ -78,22 +78,22 @@ public void testCloneAttributes() { final AttributeSource src = new AttributeSource(); - final TermAttribute termAtt = src.addAttribute(TermAttribute.class); + final FlagsAttribute flagsAtt = src.addAttribute(FlagsAttribute.class); final TypeAttribute typeAtt = src.addAttribute(TypeAttribute.class); - termAtt.setTermBuffer("TestTerm"); + flagsAtt.setFlags(1234); typeAtt.setType("TestType"); final AttributeSource clone = src.cloneAttributes(); final Iterator> it = clone.getAttributeClassesIterator(); - assertEquals("TermAttribute must be the first attribute", TermAttribute.class, it.next()); + assertEquals("FlagsAttribute must be the first attribute", FlagsAttribute.class, it.next()); assertEquals("TypeAttribute must be the second attribute", TypeAttribute.class, it.next()); assertFalse("No more attributes", it.hasNext()); - final TermAttribute termAtt2 = clone.getAttribute(TermAttribute.class); + final FlagsAttribute flagsAtt2 = clone.getAttribute(FlagsAttribute.class); final TypeAttribute typeAtt2 = clone.getAttribute(TypeAttribute.class); - assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt); + assertNotSame("FlagsAttribute of original and clone must be different instances", flagsAtt2, flagsAtt); assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt); - assertEquals("TermAttribute of original and clone must be equal", 
termAtt2, termAtt); + assertEquals("FlagsAttribute of original and clone must be equal", flagsAtt2, flagsAtt); assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt); } Property changes on: backwards\src\test\org\apache\lucene\util\TestAttributeSource.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/backwards/src/test/org/apache/lucene/util/TestAttributeSource.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/backwards/src/test/org/apache/lucene/util/TestAttributeSource.java:r824912-931101 Merged /lucene/java/branches/lucene_3_0_back_compat_tests/src/test/org/apache/lucene/util/TestAttributeSource.java:r880869-912395 Index: backwards/src/test/org/apache/lucene/util/TestNumericUtils.java =================================================================== --- backwards/src/test/org/apache/lucene/util/TestNumericUtils.java (revision 931099) +++ backwards/src/test/org/apache/lucene/util/TestNumericUtils.java (working copy) @@ -26,6 +26,8 @@ public class TestNumericUtils extends LuceneTestCase { + /* TESTs disabled, because incompatible API change in 3.1/flex: + public void testLongConversionAndOrdering() throws Exception { // generate a series of encoded longs, each numerical one bigger than the one before String last=null; @@ -131,6 +133,8 @@ } } } + + */ public void testDoubles() throws Exception { double[] vals=new double[]{ Index: build.xml =================================================================== --- build.xml (revision 931099) +++ build.xml (working copy) @@ -104,24 +104,24 @@ - + - - + - - + + - + @@ -715,6 +715,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Property changes on: build.xml ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/branches/lucene_2_9/build.xml:r896850 Reverse-merged /lucene/java/trunk/build.xml:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/build.xml:r824912-931101 Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 931099) +++ CHANGES.txt (working copy) @@ -1,5 +1,79 @@ Lucene Change Log +======================= Flexible Indexing Branch ======================= + +Changes in backwards compatibility policy + +* LUCENE-1458, LUCENE-2111, LUCENE-2354: Changes from flexible indexing: + + - MultiReader ctor now throws IOException + + - Directory.copy/Directory.copyTo now copies all files (not just + index files), since what is and isn't and index file is now + dependent on the codecs used. (Mike McCandless) + + - UnicodeUtil now uses BytesRef for UTF-8 output, and some method + signatures have changed to CharSequence. These are internal APIs + and subject to change suddenly. (Robert Muir, Mike McCandless) + + - Positional queries (PhraseQuery, *SpanQuery) will now throw an + exception if use them on a field that omits positions during + indexing (previously they silently returned no results). + + - FieldCache.(Byte,Short,Int,Long,Float,Double}Parser's API has + changed -- each parse method now takes a BytesRef instead of a + String. If you have an existing Parser, a simple way to fix it is + invoke BytesRef.utf8ToString, and pass that String to your + existing parser. This will work, but performance would be better + if you could fix your parser to instead operate directly on the + byte[] in the BytesRef. 
+
+  - The internal (experimental) API of NumericUtils changed completely
+    from String to BytesRef. Client code should never use this class,
+    so the change would normally not affect you. If you used some of
+    the methods to inspect terms or create TermQueries out of
+    prefix encoded terms, change to use BytesRef. Please note:
+    Do not use TermQueries to search for single numeric terms.
+    The recommended way is to create a corresponding NumericRangeQuery
+    with upper and lower bound equal and included. TermQueries do not
+    score correctly, so the constant score mode of NRQ is the only
+    correct way to handle single value queries.
+
+  - NumericTokenStream now works directly on byte[] terms. If you
+    plug a TokenFilter on top of this stream, you will likely get
+    an IllegalArgumentException, because the NTS does not support
+    TermAttribute/CharTermAttribute. If you want to further filter
+    or attach Payloads to NTS, use the new NumericTermAttribute.
+
+Bug Fixes
+
+* LUCENE-2222: FixedIntBlockIndexInput incorrectly read one block of
+  0s before the actual data. (Renaud Delbru via Mike McCandless)
+
+* LUCENE-2344: PostingsConsumer.merge was failing to call finishDoc,
+  which caused corruption for sep codec. Also fixed several tests to
+  test all 4 core codecs. (Renaud Delbru via Mike McCandless)
+
+New features
+
+* LUCENE-1606, LUCENE-2089: Adds AutomatonQuery, a MultiTermQuery that
+  matches terms against a finite-state machine. Implement WildcardQuery
+  and FuzzyQuery with finite-state methods. Adds RegexpQuery.
+  (Robert Muir, Mike McCandless, Uwe Schindler, Mark Miller)
+
+* LUCENE-1990: Adds internal packed ints implementation, to be used
+  for more efficient storage of int arrays when the values are
+  bounded, for example for storing the terms dict index. (Toke
+  Eskildsen via Mike McCandless)
+
+* LUCENE-2321: Cutover to a more RAM efficient packed-ints based
+  representation for the in-memory terms dict index. (Mike
+  McCandless)
+
+* LUCENE-2126: Add new classes for data (de)serialization: DataInput
+  and DataOutput. IndexInput and IndexOutput extend these new classes.
+  (Michael Busch)
+
 ======================= Trunk (not yet released) =======================

 Changes in backwards compatibility policy
@@ -297,8 +371,8 @@
 Build

 * LUCENE-2124: Moved the JDK-based collation support from contrib/collation
-  into core, and moved the ICU-based collation support into contrib/icu.
-  (Robert Muir)
+  into core, and moved the ICU-based collation support into contrib/icu.
+  (Robert Muir)

 * LUCENE-2326: Removed SVN checkouts for backwards tests. The backwards
   branch is now included in the svn repository using "svn copy" after release.
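As a quick illustration of the FieldCache parser note above (not part of this patch): an existing String-based parser can be migrated by adding the BytesRef overload and delegating through BytesRef.utf8ToString, the same shape as the anonymous parsers in the TestSort hunk earlier in this diff. The class name LegacyIntParser and the delegation are only a sketch, not code from this commit.

    import org.apache.lucene.search.FieldCache;
    import org.apache.lucene.util.BytesRef;

    // Hypothetical parser, shown only to illustrate the migration described above.
    public class LegacyIntParser implements FieldCache.IntParser {
      // Old String-based entry point, kept so existing call sites keep compiling.
      public int parseInt(String val) {
        return Integer.parseInt(val);
      }
      // New BytesRef-based entry point from the flex API change. The quick fix
      // suggested in CHANGES.txt is to convert back to a String; parsing
      // term.bytes directly would avoid the extra allocation.
      public int parseInt(BytesRef term) {
        return parseInt(term.utf8ToString());
      }
    }

The NumericUtils note is handled the same way at query time: instead of building a TermQuery from a prefix-coded term, create a NumericRangeQuery with the lower and upper bound both set to the value and both included.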
Property changes on: CHANGES.txt ___________________________________________________________________ Deleted: svn:mergeinfo Reverse-merged /lucene/java/trunk/CHANGES.txt:r924483-925561 Reverse-merged /lucene/java/branches/lucene_2_9/CHANGES.txt:r896850,909334 Index: common-build.xml =================================================================== --- common-build.xml (revision 931099) +++ common-build.xml (working copy) @@ -119,6 +119,11 @@ + + + + + Property changes on: contrib ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/contrib:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/contrib:r824912-931101 Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 931099) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy) @@ -37,11 +37,12 @@ import org.apache.lucene.collation.CollationKeyAnalyzer; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LogMergePolicy; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.LogDocMergePolicy; import org.apache.lucene.index.TermFreqVector; @@ -474,16 +475,20 @@ IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory(), true); assertEquals(NUM_DOCS, reader.numDocs()); - TermEnum terms = reader.terms(); - TermDocs termDocs = reader.termDocs(); int totalTokenCount2 = 0; - while(terms.next()) { - Term term = terms.term(); - /* not-tokenized, but indexed field */ - if (term != null && term.field() != DocMaker.ID_FIELD) { - termDocs.seek(terms.term()); - while (termDocs.next()) - totalTokenCount2 += termDocs.freq(); + + FieldsEnum fields = MultiFields.getFields(reader).iterator(); + String fieldName = null; + while((fieldName = fields.next()) != null) { + if (fieldName == DocMaker.ID_FIELD) + continue; + TermsEnum terms = fields.terms(); + DocsEnum docs = null; + while(terms.next() != null) { + docs = terms.docs(MultiFields.getDeletedDocs(reader), docs); + while(docs.nextDoc() != docs.NO_MORE_DOCS) { + totalTokenCount2 += docs.freq(); + } } } reader.close(); Property changes on: contrib\CHANGES.txt ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/contrib/CHANGES.txt:r924732-924780,924782-925175,925463-925561 Reverse-merged /lucene/java/branches/lucene_2_9/contrib/CHANGES.txt:r909334 Merged /lucene/java/branches/flex_1458/contrib/CHANGES.txt:r824912-931101 Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 931099) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (working copy) @@ -150,11 
+150,16 @@ mtq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); query = mtq; } - FakeReader fReader = new FakeReader(); - MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.rewrite(fReader, mtq); - if (fReader.field != null) { - IndexReader ir = getReaderForField(fReader.field); + if (mtq.getField() != null) { + IndexReader ir = getReaderForField(mtq.getField()); extract(query.rewrite(ir), terms); + } else { + FakeReader fReader = new FakeReader(); + MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.rewrite(fReader, mtq); + if (fReader.field != null) { + IndexReader ir = getReaderForField(fReader.field); + extract(query.rewrite(ir), terms); + } } } else if (query instanceof MultiPhraseQuery) { final MultiPhraseQuery mpq = (MultiPhraseQuery) query; Property changes on: contrib\highlighter\src\test ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/contrib/highlighter/src/test:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/contrib/highlighter/src/test:r824912-931101 Property changes on: contrib\instantiated\src\test\org\apache\lucene\store\instantiated\TestIndicesEquals.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/branches/lucene_2_9/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java:r896850 Reverse-merged /lucene/java/trunk/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java:r824912-931101 Index: contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java =================================================================== --- contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java (revision 931099) +++ contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java (working copy) @@ -19,11 +19,15 @@ import java.io.IOException; import java.io.File; import java.util.Date; +import java.util.List; +import java.util.ArrayList; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.ReaderUtil; /** * Given a directory and a list of fields, updates the fieldNorms in place for every document. 
@@ -104,46 +108,46 @@ */ public void reSetNorms(String field) throws IOException { String fieldName = StringHelper.intern(field); - int[] termCounts = new int[0]; IndexReader reader = null; - TermEnum termEnum = null; - TermDocs termDocs = null; try { - reader = IndexReader.open(dir, true); - termCounts = new int[reader.maxDoc()]; - try { - termEnum = reader.terms(new Term(field)); - try { - termDocs = reader.termDocs(); - do { - Term term = termEnum.term(); - if (term != null && term.field().equals(fieldName)) { - termDocs.seek(termEnum.term()); - while (termDocs.next()) { - termCounts[termDocs.doc()] += termDocs.freq(); + reader = IndexReader.open(dir, false); + + final List subReaders = new ArrayList(); + ReaderUtil.gatherSubReaders(subReaders, reader); + + for(IndexReader subReader : subReaders) { + final Bits delDocs = subReader.getDeletedDocs(); + + int[] termCounts = new int[subReader.maxDoc()]; + Fields fields = subReader.fields(); + if (fields != null) { + Terms terms = fields.terms(field); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + while(termsEnum.next() != null) { + docs = termsEnum.docs(delDocs, docs); + while(true) { + int docID = docs.nextDoc(); + if (docID != docs.NO_MORE_DOCS) { + termCounts[docID] += docs.freq(); + } else { + break; + } } } - } while (termEnum.next()); - - } finally { - if (null != termDocs) termDocs.close(); + } } - } finally { - if (null != termEnum) termEnum.close(); - } - } finally { - if (null != reader) reader.close(); - } - - try { - reader = IndexReader.open(dir, false); - for (int d = 0; d < termCounts.length; d++) { - if (! reader.isDeleted(d)) { - if (sim == null) - reader.setNorm(d, fieldName, Similarity.encodeNorm(1.0f)); - else - reader.setNorm(d, fieldName, sim.encodeNormValue(sim.lengthNorm(fieldName, termCounts[d]))); + + for (int d = 0; d < termCounts.length; d++) { + if (delDocs == null || !delDocs.get(d)) { + if (sim == null) { + subReader.setNorm(d, fieldName, Similarity.encodeNorm(1.0f)); + } else { + subReader.setNorm(d, fieldName, sim.encodeNormValue(sim.lengthNorm(fieldName, termCounts[d]))); + } + } } } @@ -151,5 +155,4 @@ if (null != reader) reader.close(); } } - } Index: contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java =================================================================== --- contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java (revision 931099) +++ contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java (working copy) @@ -26,6 +26,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.Version; /** @@ -172,6 +173,8 @@ * list of deletions. 
*/ public static class FakeDeleteIndexReader extends FilterIndexReader { + // TODO: switch to flex api, here + OpenBitSet dels; OpenBitSet oldDels = null; @@ -202,6 +205,7 @@ if (oldDels != null) { dels.or(oldDels); } + storeDelDocs(null); } @Override @@ -215,6 +219,16 @@ } @Override + public IndexReader[] getSequentialSubReaders() { + return null; + } + + @Override + public Bits getDeletedDocs() { + return dels; + } + + @Override public boolean isDeleted(int n) { return dels.get(n); } @@ -235,5 +249,29 @@ } }; } + + @Override + public TermDocs termDocs() throws IOException { + return new FilterTermDocs(in.termDocs()) { + + @Override + public boolean next() throws IOException { + boolean res; + while ((res = super.next())) { + if (!dels.get(doc())) { + break; + } + } + return res; + } + }; + } + + @Override + public TermDocs termDocs(Term term) throws IOException { + TermDocs termDocs = termDocs(); + termDocs.seek(term); + return termDocs; + } } } Index: contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java =================================================================== --- contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java (revision 931099) +++ contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java (working copy) @@ -1,10 +1,5 @@ package org.apache.lucene.index; -import org.apache.lucene.util.StringHelper; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; /* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +15,15 @@ * */ +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + + /** * Transparent access to the vector space model, * either via TermFreqVector or by resolving it from the inverted index. 
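The TermVectorAccessor rewrite below resolves term positions for a single document straight from the inverted index using DocsAndPositionsEnum. The following sketch isolates that pattern; it is illustrative only, the class and method names are hypothetical, and it assumes the flex calls used in the hunk below (TermsEnum.docsAndPositions, advance, freq, nextPosition).

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

public class PositionsFromIndexSketch {
  /** Collects the position lists of all terms of the field that occur in docID. */
  public static List<int[]> positionsFor(IndexReader reader, String field, int docID) throws IOException {
    final List<int[]> result = new ArrayList<int[]>();
    final Bits delDocs = MultiFields.getDeletedDocs(reader);
    final Terms terms = MultiFields.getTerms(reader, field);
    if (terms == null) {
      return result;                      // field not present in the index
    }
    final TermsEnum termsEnum = terms.iterator();
    DocsAndPositionsEnum postings = null;
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      postings = termsEnum.docsAndPositions(delDocs, postings); // reuse across terms
      if (postings == null) {
        continue;                         // no positions indexed for this field
      }
      if (postings.advance(docID) == docID) {                   // term occurs in this doc
        final int[] positions = new int[postings.freq()];
        for (int i = 0; i < positions.length; i++) {
          positions[i] = postings.nextPosition();
        }
        result.add(positions);
      }
    }
    return result;
  }
}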
@@ -97,40 +100,53 @@ positions.clear(); } - TermEnum termEnum = indexReader.terms(new Term(field, "")); - if (termEnum.term() != null) { - while (termEnum.term().field() == field) { - TermPositions termPositions = indexReader.termPositions(termEnum.term()); - if (termPositions.skipTo(documentNumber)) { - - frequencies.add(Integer.valueOf(termPositions.freq())); - tokens.add(termEnum.term().text()); - - + final Bits delDocs = MultiFields.getDeletedDocs(indexReader); + + Terms terms = MultiFields.getTerms(indexReader, field); + boolean anyTerms = false; + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + DocsAndPositionsEnum postings = null; + while(true) { + BytesRef text = termsEnum.next(); + if (text != null) { + anyTerms = true; if (!mapper.isIgnoringPositions()) { - int[] positions = new int[termPositions.freq()]; - for (int i = 0; i < positions.length; i++) { - positions[i] = termPositions.nextPosition(); - } - this.positions.add(positions); + docs = postings = termsEnum.docsAndPositions(delDocs, postings); } else { - positions.add(null); + docs = termsEnum.docs(delDocs, docs); } - } - termPositions.close(); - if (!termEnum.next()) { + + int docID = docs.advance(documentNumber); + if (docID == documentNumber) { + + frequencies.add(Integer.valueOf(docs.freq())); + tokens.add(text.utf8ToString()); + + if (!mapper.isIgnoringPositions()) { + int[] positions = new int[docs.freq()]; + for (int i = 0; i < positions.length; i++) { + positions[i] = postings.nextPosition(); + } + this.positions.add(positions); + } else { + positions.add(null); + } + } + } else { break; } } - mapper.setDocumentNumber(documentNumber); - mapper.setExpectations(field, tokens.size(), false, !mapper.isIgnoringPositions()); - for (int i = 0; i < tokens.size(); i++) { - mapper.map(tokens.get(i), frequencies.get(i).intValue(), (TermVectorOffsetInfo[]) null, positions.get(i)); + + if (anyTerms) { + mapper.setDocumentNumber(documentNumber); + mapper.setExpectations(field, tokens.size(), false, !mapper.isIgnoringPositions()); + for (int i = 0; i < tokens.size(); i++) { + mapper.map(tokens.get(i), frequencies.get(i).intValue(), (TermVectorOffsetInfo[]) null, positions.get(i)); + } } } - termEnum.close(); - - } Index: contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java =================================================================== --- contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java (revision 931099) +++ contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java (working copy) @@ -18,7 +18,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.Terms; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.PriorityQueue; @@ -50,20 +53,40 @@ } TermInfoQueue tiq = new TermInfoQueue(numTerms); - TermEnum terms = reader.terms(); if (field != null) { - while (terms.next()) { - if (terms.term().field().equals(field)) { - tiq.insertWithOverflow(new TermInfo(terms.term(), terms.docFreq())); + Terms terms = reader.fields().terms(field); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + while(true) { + BytesRef term = termsEnum.next(); + if (term != null) { + tiq.insertWithOverflow(new TermInfo(new Term(field, term.utf8ToString()), termsEnum.docFreq())); + } else { + break; + } } } - } - else { - 
while (terms.next()) { - tiq.insertWithOverflow(new TermInfo(terms.term(), terms.docFreq())); + } else { + FieldsEnum fields = reader.fields().iterator(); + while(true) { + field = fields.next(); + if (field != null) { + TermsEnum terms = fields.terms(); + while(true) { + BytesRef term = terms.next(); + if (term != null) { + tiq.insertWithOverflow(new TermInfo(new Term(field, term.toString()), terms.docFreq())); + } else { + break; + } + } + } else { + break; + } } } + while (tiq.size() != 0) { TermInfo termInfo = tiq.pop(); System.out.println(termInfo.term + " " + termInfo.docFreq); Index: contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java =================================================================== --- contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java (revision 931099) +++ contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java (working copy) @@ -76,13 +76,9 @@ writer.close(); } - public void testMissingField() { + public void testMissingField() throws Exception { FieldNormModifier fnm = new FieldNormModifier(store, s); - try { - fnm.reSetNorms("nobodyherebutuschickens"); - } catch (Exception e) { - assertNull("caught something", e); - } + fnm.reSetNorms("nobodyherebutuschickens"); } public void testFieldWithNoNorm() throws Exception { @@ -97,11 +93,7 @@ r.close(); FieldNormModifier fnm = new FieldNormModifier(store, s); - try { - fnm.reSetNorms("nonorm"); - } catch (Exception e) { - assertNull("caught something", e); - } + fnm.reSetNorms("nonorm"); // nothing should have changed r = IndexReader.open(store, false); Index: contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java =================================================================== --- contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java (revision 931099) +++ contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java (working copy) @@ -18,10 +18,13 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.Bits; public class DuplicateFilter extends Filter { @@ -79,89 +82,88 @@ } } - private OpenBitSet correctBits(IndexReader reader) throws IOException - { - - OpenBitSet bits=new OpenBitSet(reader.maxDoc()); //assume all are INvalid - Term startTerm=new Term(fieldName); - TermEnum te = reader.terms(startTerm); - if(te!=null) - { - Term currTerm=te.term(); - while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned - { - int lastDoc=-1; - //set non duplicates - TermDocs td = reader.termDocs(currTerm); - if(td.next()) - { - if(keepMode==KM_USE_FIRST_OCCURRENCE) - { - bits.set(td.doc()); - } - else - { - do - { - lastDoc=td.doc(); - }while(td.next()); - bits.set(lastDoc); - } - } - if(!te.next()) - { - break; - } - currTerm=te.term(); - } - } - return bits; - } + private OpenBitSet correctBits(IndexReader reader) throws IOException { + OpenBitSet bits = new OpenBitSet(reader.maxDoc()); //assume all are INvalid + final Bits delDocs = MultiFields.getDeletedDocs(reader); + Terms terms = reader.fields().terms(fieldName); + if (terms != null) { + TermsEnum termsEnum = 
terms.iterator(); + DocsEnum docs = null; + while(true) { + BytesRef currTerm = termsEnum.next(); + if (currTerm == null) { + break; + } else { + docs = termsEnum.docs(delDocs, docs); + int doc = docs.nextDoc(); + if (doc != docs.NO_MORE_DOCS) { + if (keepMode == KM_USE_FIRST_OCCURRENCE) { + bits.set(doc); + } else { + int lastDoc = doc; + while (true) { + lastDoc = doc; + doc = docs.nextDoc(); + if (doc == docs.NO_MORE_DOCS) { + break; + } + } + bits.set(lastDoc); + } + } + } + } + } + return bits; + } private OpenBitSet fastBits(IndexReader reader) throws IOException - { + { OpenBitSet bits=new OpenBitSet(reader.maxDoc()); - bits.set(0,reader.maxDoc()); //assume all are valid - Term startTerm=new Term(fieldName); - TermEnum te = reader.terms(startTerm); - if(te!=null) - { - Term currTerm=te.term(); - - while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned - { - if(te.docFreq()>1) - { - int lastDoc=-1; - //unset potential duplicates - TermDocs td = reader.termDocs(currTerm); - td.next(); - if(keepMode==KM_USE_FIRST_OCCURRENCE) - { - td.next(); - } - do - { - lastDoc=td.doc(); - bits.clear(lastDoc); - }while(td.next()); - if(keepMode==KM_USE_LAST_OCCURRENCE) - { - //restore the last bit - bits.set(lastDoc); - } - } - if(!te.next()) - { - break; - } - currTerm=te.term(); - } - } - return bits; - } + bits.set(0,reader.maxDoc()); //assume all are valid + final Bits delDocs = MultiFields.getDeletedDocs(reader); + Terms terms = reader.fields().terms(fieldName); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + while(true) { + BytesRef currTerm = termsEnum.next(); + if (currTerm == null) { + break; + } else { + if (termsEnum.docFreq() > 1) { + // unset potential duplicates + docs = termsEnum.docs(delDocs, docs); + int doc = docs.nextDoc(); + if (doc != docs.NO_MORE_DOCS) { + if (keepMode == KM_USE_FIRST_OCCURRENCE) { + doc = docs.nextDoc(); + } + } + + int lastDoc = -1; + while (true) { + lastDoc = doc; + bits.clear(lastDoc); + doc = docs.nextDoc(); + if (doc == docs.NO_MORE_DOCS) { + break; + } + } + if (keepMode==KM_USE_LAST_OCCURRENCE) { + // restore the last bit + bits.set(lastDoc); + } + } + } + } + } + + return bits; + } + public String getFieldName() { return fieldName; Index: contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java =================================================================== --- contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (revision 931099) +++ contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (working copy) @@ -29,7 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; /** @@ -172,8 +172,8 @@ * Adds user input for "fuzzification" * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed * @param fieldName - * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum) - * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum) + * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermsEnum) + * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum) */ public void addTerms(String queryString, String fieldName,float 
minSimilarity, int prefixLength) { @@ -195,48 +195,44 @@ String term = termAtt.term(); if(!processedTerms.contains(term)) { - processedTerms.add(term); - ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term - float minScore=0; - Term startTerm=internSavingTemplateTerm.createTerm(term); - FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength); - TermEnum origEnum = reader.terms(startTerm); - int df=0; - if(startTerm.equals(origEnum.term())) - { - df=origEnum.docFreq(); //store the df so all variants use same idf - } - int numVariants=0; - int totalVariantDocFreqs=0; - do - { - Term possibleMatch=fe.term(); - if(possibleMatch!=null) - { - numVariants++; - totalVariantDocFreqs+=fe.docFreq(); - float score=fe.difference(); - if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ - ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm); - variantsQ.insertWithOverflow(st); - minScore = variantsQ.top().score; // maintain minScore - } + processedTerms.add(term); + ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term + float minScore=0; + Term startTerm=internSavingTemplateTerm.createTerm(term); + FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, startTerm, f.minSimilarity, f.prefixLength); + //store the df so all variants use same idf + int df = reader.docFreq(startTerm); + int numVariants=0; + int totalVariantDocFreqs=0; + BytesRef possibleMatch; + MultiTermQuery.BoostAttribute boostAtt = + fe.attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + while ((possibleMatch = fe.next()) != null) { + if (possibleMatch!=null) { + numVariants++; + totalVariantDocFreqs+=fe.docFreq(); + float score=boostAtt.getBoost(); + if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ + ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), possibleMatch.utf8ToString()),score,startTerm); + variantsQ.insertWithOverflow(st); + minScore = variantsQ.top().score; // maintain minScore + } + } } - } - while(fe.next()); - if(numVariants>0) - { - int avgDf=totalVariantDocFreqs/numVariants; - if(df==0)//no direct match we can use as df for all variants + + if(numVariants>0) + { + int avgDf=totalVariantDocFreqs/numVariants; + if(df==0)//no direct match we can use as df for all variants { df=avgDf; //use avg df of all variants } - // take the top variants (scored by edit distance) and reset the score - // to include an IDF factor then add to the global queue for ranking - // overall top query terms - int size = variantsQ.size(); - for(int i = 0; i < size; i++) + // take the top variants (scored by edit distance) and reset the score + // to include an IDF factor then add to the global queue for ranking + // overall top query terms + int size = variantsQ.size(); + for(int i = 0; i < size; i++) { ScoreTerm st = variantsQ.pop(); st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs); Index: contrib/remote/src/test/org/apache/lucene/search/TestRemoteSort.java =================================================================== --- contrib/remote/src/test/org/apache/lucene/search/TestRemoteSort.java (revision 931099) +++ contrib/remote/src/test/org/apache/lucene/search/TestRemoteSort.java (working copy) @@ -38,6 +38,7 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LogMergePolicy; import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.store.RAMDirectory; import 
org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -219,8 +220,8 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { docValues = FieldCache.DEFAULT.getInts(reader, "parser", new FieldCache.IntParser() { - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + public final int parseInt(BytesRef termRef) { + return (termRef.utf8ToString().charAt(0)-'A') * 123456; } }); } @@ -245,6 +246,29 @@ runMultiSorts(multi, true); // this runs on the full index } + // test custom search when remote + /* rewrite with new API + public void testRemoteCustomSort() throws Exception { + Searchable searcher = getRemote(); + MultiSearcher multi = new MultiSearcher (new Searchable[] { searcher }); + sort.setSort (new SortField ("custom", SampleComparable.getComparatorSource())); + assertMatches (multi, queryX, sort, "CAIEG"); + sort.setSort (new SortField ("custom", SampleComparable.getComparatorSource(), true)); + assertMatches (multi, queryY, sort, "HJDBF"); + + assertSaneFieldCaches(getName() + " ComparatorSource"); + FieldCache.DEFAULT.purgeAllCaches(); + + SortComparator custom = SampleComparable.getComparator(); + sort.setSort (new SortField ("custom", custom)); + assertMatches (multi, queryX, sort, "CAIEG"); + sort.setSort (new SortField ("custom", custom, true)); + assertMatches (multi, queryY, sort, "HJDBF"); + + assertSaneFieldCaches(getName() + " Comparator"); + FieldCache.DEFAULT.purgeAllCaches(); + }*/ + // test that the relevancy scores are the same even if // hits are sorted public void testNormalizedScores() throws Exception { @@ -289,12 +313,12 @@ assertSameValues (scoresY, getScores (remote.search (queryY, null, 1000, sort).scoreDocs, remote)); assertSameValues (scoresA, getScores (remote.search (queryA, null, 1000, sort).scoreDocs, remote)); - sort.setSort (new SortField ("int", SortField.INT, true), new SortField (null, SortField.DOC, true) ); + sort.setSort (new SortField[] { new SortField ("int", SortField.INT, true), new SortField (null, SortField.DOC, true) }); assertSameValues (scoresX, getScores (remote.search (queryX, null, 1000, sort).scoreDocs, remote)); assertSameValues (scoresY, getScores (remote.search (queryY, null, 1000, sort).scoreDocs, remote)); assertSameValues (scoresA, getScores (remote.search (queryA, null, 1000, sort).scoreDocs, remote)); - sort.setSort (new SortField("float", SortField.FLOAT), new SortField("string", SortField.STRING)); + sort.setSort (new SortField("float", SortField.FLOAT)); assertSameValues (scoresX, getScores (remote.search (queryX, null, 1000, sort).scoreDocs, remote)); assertSameValues (scoresY, getScores (remote.search (queryY, null, 1000, sort).scoreDocs, remote)); assertSameValues (scoresA, getScores (remote.search (queryA, null, 1000, sort).scoreDocs, remote)); @@ -314,7 +338,11 @@ expected = isFull ? "IDHFGJABEC" : "IDHFGJAEBC"; assertMatches(multi, queryA, sort, expected); - sort.setSort(new SortField ("float", SortField.FLOAT), SortField.FIELD_DOC); + sort.setSort(new SortField ("int", SortField.INT)); + expected = isFull ? 
"IDHFGJABEC" : "IDHFGJAEBC"; + assertMatches(multi, queryA, sort, expected); + + sort.setSort(new SortField[] {new SortField ("float", SortField.FLOAT), SortField.FIELD_DOC}); assertMatches(multi, queryA, sort, "GDHJCIEFAB"); sort.setSort(new SortField("float", SortField.FLOAT)); Index: contrib/spatial/src/java/org/apache/lucene/spatial/tier/CartesianShapeFilter.java =================================================================== --- contrib/spatial/src/java/org/apache/lucene/spatial/tier/CartesianShapeFilter.java (revision 931099) +++ contrib/spatial/src/java/org/apache/lucene/spatial/tier/CartesianShapeFilter.java (working copy) @@ -19,12 +19,15 @@ import java.io.IOException; import java.util.List; +import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.search.Filter; import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.OpenBitSet; /** @@ -44,22 +47,41 @@ @Override public DocIdSet getDocIdSet(final IndexReader reader) throws IOException { - final OpenBitSet bits = new OpenBitSet(reader.maxDoc()); - final TermDocs termDocs = reader.termDocs(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); final List area = shape.getArea(); - int sz = area.size(); + final int sz = area.size(); - final Term term = new Term(fieldName); // iterate through each boxid - for (int i =0; i< sz; i++) { - double boxId = area.get(i).doubleValue(); - termDocs.seek(term.createTerm(NumericUtils.doubleToPrefixCoded(boxId))); - // iterate through all documents - // which have this boxId - while (termDocs.next()) { - bits.fastSet(termDocs.doc()); + final BytesRef bytesRef = new BytesRef(NumericUtils.BUF_SIZE_LONG); + if (sz == 1) { + double boxId = area.get(0).doubleValue(); + NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(boxId), 0, bytesRef); + return new DocIdSet() { + @Override + public DocIdSetIterator iterator() throws IOException { + return MultiFields.getTermDocsEnum(reader, delDocs, fieldName, bytesRef); + } + + @Override + public boolean isCacheable() { + return false; + } + }; + } else { + final OpenBitSet bits = new OpenBitSet(reader.maxDoc()); + for (int i =0; i< sz; i++) { + double boxId = area.get(i).doubleValue(); + NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(boxId), 0, bytesRef); + final DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, delDocs, fieldName, bytesRef); + if (docsEnum == null) continue; + // iterate through all documents + // which have this boxId + int doc; + while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + bits.fastSet(doc); + } } + return bits; } - return bits; } } Index: contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestCartesian.java =================================================================== --- contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestCartesian.java (revision 931099) +++ contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestCartesian.java (working copy) @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; import 
org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -49,7 +50,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.NumericUtils; public class TestCartesian extends LuceneTestCase { @@ -96,8 +96,8 @@ doc.add(new Field("name", name,Field.Store.YES, Field.Index.ANALYZED)); // convert the lat / long to lucene fields - doc.add(new Field(latField, NumericUtils.doubleToPrefixCoded(lat),Field.Store.YES, Field.Index.NOT_ANALYZED)); - doc.add(new Field(lngField, NumericUtils.doubleToPrefixCoded(lng),Field.Store.YES, Field.Index.NOT_ANALYZED)); + doc.add(new NumericField(latField, Integer.MAX_VALUE, Field.Store.YES, true).setDoubleValue(lat)); + doc.add(new NumericField(lngField, Integer.MAX_VALUE, Field.Store.YES, true).setDoubleValue(lng)); // add a default meta field to make searching all documents easy doc.add(new Field("metafile", "doc",Field.Store.YES, Field.Index.ANALYZED)); @@ -105,10 +105,9 @@ int ctpsize = ctps.size(); for (int i =0; i < ctpsize; i++){ CartesianTierPlotter ctp = ctps.get(i); - doc.add(new Field(ctp.getTierFieldName(), - NumericUtils.doubleToPrefixCoded(ctp.getTierBoxId(lat,lng)), + doc.add(new NumericField(ctp.getTierFieldName(), Integer.MAX_VALUE, Field.Store.YES, - Field.Index.NOT_ANALYZED_NO_NORMS)); + true).setDoubleValue(ctp.getTierBoxId(lat,lng))); doc.add(new Field(geoHashPrefix, GeoHashUtils.encode(lat,lng), Field.Store.YES, @@ -275,8 +274,8 @@ Document d = searcher.doc(scoreDocs[i].doc); String name = d.get("name"); - double rsLat = NumericUtils.prefixCodedToDouble(d.get(latField)); - double rsLng = NumericUtils.prefixCodedToDouble(d.get(lngField)); + double rsLat = Double.parseDouble(d.get(latField)); + double rsLng = Double.parseDouble(d.get(lngField)); Double geo_distance = distances.get(scoreDocs[i].doc); double distance = DistanceUtils.getInstance().getDistanceMi(lat, lng, rsLat, rsLng); @@ -369,8 +368,8 @@ for(int i =0 ; i < results; i++){ Document d = searcher.doc(scoreDocs[i].doc); String name = d.get("name"); - double rsLat = NumericUtils.prefixCodedToDouble(d.get(latField)); - double rsLng = NumericUtils.prefixCodedToDouble(d.get(lngField)); + double rsLat = Double.parseDouble(d.get(latField)); + double rsLng = Double.parseDouble(d.get(lngField)); Double geo_distance = distances.get(scoreDocs[i].doc); double distance = DistanceUtils.getInstance().getDistanceMi(lat, lng, rsLat, rsLng); @@ -464,8 +463,8 @@ Document d = searcher.doc(scoreDocs[i].doc); String name = d.get("name"); - double rsLat = NumericUtils.prefixCodedToDouble(d.get(latField)); - double rsLng = NumericUtils.prefixCodedToDouble(d.get(lngField)); + double rsLat = Double.parseDouble(d.get(latField)); + double rsLng = Double.parseDouble(d.get(lngField)); Double geo_distance = distances.get(scoreDocs[i].doc); double distance = DistanceUtils.getInstance().getDistanceMi(lat, lng, rsLat, rsLng); @@ -558,8 +557,8 @@ Document d = searcher.doc(scoreDocs[i].doc); String name = d.get("name"); - double rsLat = NumericUtils.prefixCodedToDouble(d.get(latField)); - double rsLng = NumericUtils.prefixCodedToDouble(d.get(lngField)); + double rsLat = Double.parseDouble(d.get(latField)); + double rsLng = Double.parseDouble(d.get(lngField)); Double geo_distance = distances.get(scoreDocs[i].doc); double distance = DistanceUtils.getInstance().getDistanceMi(lat, lng, rsLat, rsLng); Index: 
contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestDistance.java =================================================================== --- contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestDistance.java (revision 931099) +++ contrib/spatial/src/test/org/apache/lucene/spatial/tier/TestDistance.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; @@ -28,7 +29,6 @@ import org.apache.lucene.search.QueryWrapperFilter; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.NumericUtils; import org.apache.lucene.store.RAMDirectory; public class TestDistance extends LuceneTestCase { @@ -63,8 +63,8 @@ doc.add(new Field("name", name,Field.Store.YES, Field.Index.ANALYZED)); // convert the lat / long to lucene fields - doc.add(new Field(latField, NumericUtils.doubleToPrefixCoded(lat),Field.Store.YES, Field.Index.NOT_ANALYZED)); - doc.add(new Field(lngField, NumericUtils.doubleToPrefixCoded(lng),Field.Store.YES, Field.Index.NOT_ANALYZED)); + doc.add(new NumericField(latField, Integer.MAX_VALUE, Field.Store.YES, true).setDoubleValue(lat)); + doc.add(new NumericField(lngField, Integer.MAX_VALUE,Field.Store.YES, true).setDoubleValue(lng)); // add a default meta field to make searching all documents easy doc.add(new Field("metafile", "doc",Field.Store.YES, Field.Index.ANALYZED)); Index: contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java =================================================================== --- contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java (revision 931099) +++ contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java (working copy) @@ -21,8 +21,10 @@ import java.util.Iterator; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.StringHelper; import java.io.*; @@ -46,61 +48,45 @@ this.field = StringHelper.intern(field); } - public final Iterator getWordsIterator() { + public final Iterator getWordsIterator() { return new LuceneIterator(); } - final class LuceneIterator implements Iterator { - private TermEnum termEnum; - private Term actualTerm; - private boolean hasNextCalled; + final class LuceneIterator implements Iterator { + private TermsEnum termsEnum; + private BytesRef pendingTerm; LuceneIterator() { try { - termEnum = reader.terms(new Term(field)); + final Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + termsEnum = terms.iterator(); + pendingTerm = termsEnum.next(); + } } catch (IOException e) { throw new RuntimeException(e); } } - public String next() { - if (!hasNextCalled) { - hasNext(); + public Object next() { + if (pendingTerm == null) { + return null; } - hasNextCalled = false; + String result = pendingTerm.utf8ToString(); + try { - termEnum.next(); + pendingTerm = termsEnum.next(); } catch (IOException e) { throw new RuntimeException(e); } - return (actualTerm != null) ? 
actualTerm.text() : null; + return result; } public boolean hasNext() { - if (hasNextCalled) { - return actualTerm != null; - } - hasNextCalled = true; - - actualTerm = termEnum.term(); - - // if there are no words return false - if (actualTerm == null) { - return false; - } - - String currentField = actualTerm.field(); - - // if the next word doesn't have the same field return false - if (currentField != field) { - actualTerm = null; - return false; - } - - return true; + return pendingTerm != null; } public void remove() { Index: contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java =================================================================== --- contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java (revision 931099) +++ contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java (working copy) @@ -17,16 +17,21 @@ */ import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; import java.io.IOException; public class SrndPrefixQuery extends SimpleTerm { + private final BytesRef prefixRef; public SrndPrefixQuery(String prefix, boolean quoted, char truncator) { super(quoted); this.prefix = prefix; + prefixRef = new BytesRef(prefix); this.truncator = truncator; } @@ -53,20 +58,35 @@ MatchingTermVisitor mtv) throws IOException { /* inspired by PrefixQuery.rewrite(): */ - TermEnum enumerator = reader.terms(getLucenePrefixTerm(fieldName)); - try { - do { - Term term = enumerator.term(); - if ((term != null) - && term.text().startsWith(getPrefix()) - && term.field().equals(fieldName)) { - mtv.visitMatchingTerm(term); + Terms terms = MultiFields.getTerms(reader, fieldName); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + + boolean skip = false; + TermsEnum.SeekStatus status = termsEnum.seek(new BytesRef(getPrefix())); + if (status == TermsEnum.SeekStatus.FOUND) { + mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName)); + } else if (status == TermsEnum.SeekStatus.NOT_FOUND) { + if (termsEnum.term().startsWith(prefixRef)) { + mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString())); } else { - break; + skip = true; } - } while (enumerator.next()); - } finally { - enumerator.close(); + } else { + // EOF + skip = true; + } + + if (!skip) { + while(true) { + BytesRef text = termsEnum.next(); + if (text != null && text.startsWith(prefixRef)) { + mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString())); + } else { + break; + } + } + } } } } Index: contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java =================================================================== --- contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java (revision 931099) +++ contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java (working copy) @@ -20,7 +20,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.BytesRef; public class SrndTermQuery extends SimpleTerm { @@ -46,16 +49,14 @@ MatchingTermVisitor mtv) 
throws IOException { /* check term presence in index here for symmetry with other SimpleTerm's */ - TermEnum enumerator = reader.terms(getLuceneTerm(fieldName)); - try { - Term it= enumerator.term(); /* same or following index term */ - if ((it != null) - && it.text().equals(getTermText()) - && it.field().equals(fieldName)) { - mtv.visitMatchingTerm(it); + Terms terms = MultiFields.getTerms(reader, fieldName); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + + TermsEnum.SeekStatus status = termsEnum.seek(new BytesRef(getTermText())); + if (status == TermsEnum.SeekStatus.FOUND) { + mtv.visitMatchingTerm(getLuceneTerm(fieldName)); } - } finally { - enumerator.close(); } } } Index: contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java =================================================================== --- contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java (revision 931099) +++ contrib/surround/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java (working copy) @@ -17,8 +17,11 @@ */ import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; import java.io.IOException; @@ -40,6 +43,7 @@ private final char mask; private String prefix; + private BytesRef prefixRef; private Pattern pattern; @@ -68,6 +72,7 @@ i++; } prefix = truncated.substring(0, i); + prefixRef = new BytesRef(prefix); StringBuilder re = new StringBuilder(); while (i < truncated.length()) { @@ -84,26 +89,37 @@ MatchingTermVisitor mtv) throws IOException { int prefixLength = prefix.length(); - TermEnum enumerator = reader.terms(new Term(fieldName, prefix)); - Matcher matcher = pattern.matcher(""); - try { - do { - Term term = enumerator.term(); - if (term != null) { - String text = term.text(); - if ((! text.startsWith(prefix)) || (! 
term.field().equals(fieldName))) { - break; - } else { - matcher.reset( text.substring(prefixLength)); + Terms terms = MultiFields.getTerms(reader, fieldName); + if (terms != null) { + Matcher matcher = pattern.matcher(""); + try { + TermsEnum termsEnum = terms.iterator(); + + TermsEnum.SeekStatus status = termsEnum.seek(prefixRef); + BytesRef text; + if (status == TermsEnum.SeekStatus.FOUND) { + text = prefixRef; + } else if (status == TermsEnum.SeekStatus.NOT_FOUND) { + text = termsEnum.term(); + } else { + text = null; + } + + while(text != null) { + if (text != null && text.startsWith(prefixRef)) { + String textString = text.utf8ToString(); + matcher.reset(textString.substring(prefixLength)); if (matcher.matches()) { - mtv.visitMatchingTerm(term); + mtv.visitMatchingTerm(new Term(fieldName, textString)); } + } else { + break; } + text = termsEnum.next(); } - } while (enumerator.next()); - } finally { - enumerator.close(); - matcher.reset(); + } finally { + matcher.reset(); + } } } } Index: docs/index.pdf =================================================================== --- docs/index.pdf (revision 931099) +++ docs/index.pdf (working copy) @@ -1,5 +1,5 @@ %PDF-1.3 -%ª«¬­ +%���� 4 0 obj << /Type /Info /Producer (FOP 0.20.5) >> Index: docs/linkmap.pdf =================================================================== --- docs/linkmap.pdf (revision 931099) +++ docs/linkmap.pdf (working copy) @@ -1,5 +1,5 @@ %PDF-1.3 -%ª«¬­ +%���� 4 0 obj << /Type /Info /Producer (FOP 0.20.5) >> Index: docs/scoring.pdf =================================================================== --- docs/scoring.pdf (revision 931099) +++ docs/scoring.pdf (working copy) @@ -1,5 +1,5 @@ %PDF-1.3 -%ª«¬­ +%���� 4 0 obj << /Type /Info /Producer (FOP 0.20.5) >> Index: LICENSE.txt =================================================================== --- LICENSE.txt (revision 931099) +++ LICENSE.txt (working copy) @@ -237,4 +237,60 @@ http://www.python.org/download/releases/2.4.2/license/ +Some code in src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +The levenshtein automata tables in src/java/org/apache/lucene/util/automaton +were automatically generated with the moman/finenight FSA package. +Here is the copyright for those sources: + +# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. Index: NOTICE.txt =================================================================== --- NOTICE.txt (revision 931099) +++ NOTICE.txt (working copy) @@ -46,3 +46,12 @@ ICU4J, (under contrib/icu) is licensed under an MIT styles license (contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Brics Automaton (under src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ Index: src/java/org/apache/lucene/analysis/NumericTokenStream.java =================================================================== --- src/java/org/apache/lucene/analysis/NumericTokenStream.java (revision 931099) +++ src/java/org/apache/lucene/analysis/NumericTokenStream.java (working copy) @@ -17,12 +17,17 @@ * limitations under the License. 
*/ +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.search.NumericRangeFilter; // for javadocs +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -91,7 +96,89 @@ /** The lower precision tokens gets this token type assigned. */ public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric"; + + /** Expert: Use this attribute to get the details of the currently generated token + * @lucene.experimental + * @since 3.1 + */ + public interface NumericTermAttribute extends Attribute { + /** Returns current shift value, undefined before first token */ + int getShift(); + /** Returns {@link NumericTokenStream}'s raw value as {@code long} */ + long getRawValue(); + /** Returns value size in bits (32 for {@code float}, {@code int}; 64 for {@code double}, {@code long}) */ + int getValueSize(); + } + + private static final class NumericAttributeFactory extends AttributeFactory { + private final AttributeFactory delegate; + private NumericTokenStream ts = null; + NumericAttributeFactory(AttributeFactory delegate) { + this.delegate = delegate; + } + + @Override + public AttributeImpl createAttributeInstance(Class attClass) { + if (attClass == NumericTermAttribute.class) + return new NumericTermAttributeImpl(ts); + if (attClass.isAssignableFrom(CharTermAttribute.class) || attClass.isAssignableFrom(TermAttribute.class)) + throw new IllegalArgumentException("NumericTokenStream does not support CharTermAttribute/TermAttribute."); + return delegate.createAttributeInstance(attClass); + } + } + + private static final class NumericTermAttributeImpl extends AttributeImpl implements NumericTermAttribute,TermToBytesRefAttribute { + private final NumericTokenStream ts; + + public NumericTermAttributeImpl(NumericTokenStream ts) { + this.ts = ts; + } + + public int toBytesRef(BytesRef bytes) { + try { + assert ts.valSize == 64 || ts.valSize == 32; + return (ts.valSize == 64) ? 
+ NumericUtils.longToPrefixCoded(ts.value, ts.shift, bytes) : + NumericUtils.intToPrefixCoded((int) ts.value, ts.shift, bytes); + } catch (IllegalArgumentException iae) { + // return empty token before first + bytes.length = 0; + return 0; + } + } + + public int getShift() { return ts.shift; } + public long getRawValue() { return ts.value; } + public int getValueSize() { return ts.valSize; } + + @Override + public void clear() { + // this attribute has no contents to clear + } + + @Override + public boolean equals(Object other) { + return other == this; + } + + @Override + public int hashCode() { + return System.identityHashCode(this); + } + + @Override + public void copyTo(AttributeImpl target) { + // this attribute has no contents to copy + } + + @Override + public Object clone() { + // cannot throw CloneNotSupportedException (checked) + throw new UnsupportedOperationException(); + } + } + /** * Creates a token stream for numeric values using the default precisionStep * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized, @@ -107,23 +194,15 @@ * before using set a value using the various set???Value() methods. */ public NumericTokenStream(final int precisionStep) { - super(); - this.precisionStep = precisionStep; - if (precisionStep < 1) - throw new IllegalArgumentException("precisionStep must be >=1"); - } + super(new NumericAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY)); + // we must do this after the super call :( + ((NumericAttributeFactory) getAttributeFactory()).ts = this; + addAttribute(NumericTermAttribute.class); - /** - * Expert: Creates a token stream for numeric values with the specified - * precisionStep using the given {@link AttributeSource}. - * The stream is not yet initialized, - * before using set a value using the various set???Value() methods. - */ - public NumericTokenStream(AttributeSource source, final int precisionStep) { - super(source); this.precisionStep = precisionStep; if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); + shift = -precisionStep; } /** @@ -134,10 +213,15 @@ * before using set a value using the various set???Value() methods. 
*/ public NumericTokenStream(AttributeFactory factory, final int precisionStep) { - super(factory); + super(new NumericAttributeFactory(factory)); + // we must do this after the super call :( + ((NumericAttributeFactory) getAttributeFactory()).ts = this; + addAttribute(NumericTermAttribute.class); + this.precisionStep = precisionStep; if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); + shift = -precisionStep; } /** @@ -149,7 +233,7 @@ public NumericTokenStream setLongValue(final long value) { this.value = value; valSize = 64; - shift = 0; + shift = -precisionStep; return this; } @@ -162,7 +246,7 @@ public NumericTokenStream setIntValue(final int value) { this.value = value; valSize = 32; - shift = 0; + shift = -precisionStep; return this; } @@ -175,7 +259,7 @@ public NumericTokenStream setDoubleValue(final double value) { this.value = NumericUtils.doubleToSortableLong(value); valSize = 64; - shift = 0; + shift = -precisionStep; return this; } @@ -188,7 +272,7 @@ public NumericTokenStream setFloatValue(final float value) { this.value = NumericUtils.floatToSortableInt(value); valSize = 32; - shift = 0; + shift = -precisionStep; return this; } @@ -196,37 +280,24 @@ public void reset() { if (valSize == 0) throw new IllegalStateException("call set???Value() before usage"); - shift = 0; + shift = -precisionStep; } @Override public boolean incrementToken() { if (valSize == 0) throw new IllegalStateException("call set???Value() before usage"); - if (shift >= valSize) + shift += precisionStep; + if (shift >= valSize) { + // reset so the attribute still works after exhausted stream + shift -= precisionStep; return false; + } clearAttributes(); - final char[] buffer; - switch (valSize) { - case 64: - buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG); - termAtt.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); - break; - - case 32: - buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_INT); - termAtt.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); - break; - - default: - // should not happen - throw new IllegalArgumentException("valSize must be 32 or 64"); - } - + // the TermToBytesRefAttribute is directly accessing shift & value. typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0); - shift += precisionStep; return true; } @@ -238,12 +309,11 @@ } // members - private final TermAttribute termAtt = addAttribute(TermAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - private int shift = 0, valSize = 0; // valSize==0 means not initialized + int shift, valSize = 0; // valSize==0 means not initialized private final int precisionStep; - private long value = 0L; + long value = 0L; } Index: src/java/org/apache/lucene/analysis/Token.java =================================================================== --- src/java/org/apache/lucene/analysis/Token.java (revision 931099) +++ src/java/org/apache/lucene/analysis/Token.java (working copy) @@ -64,14 +64,14 @@ implementing the {@link TokenStream#incrementToken()} API. Failing that, to create a new Token you should first use one of the constructors that starts with null text. To load - the token from a char[] use {@link #setTermBuffer(char[], int, int)}. 
- To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}. - Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, + the token from a char[] use {@link #copyBuffer(char[], int, int)}. + To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}. + Alternatively you can get the Token's termBuffer by calling either {@link #buffer()}, if you know that your text is shorter than the capacity of the termBuffer - or {@link #resizeTermBuffer(int)}, if there is any possibility + or {@link #resizeBuffer(int)}, if there is any possibility that you may need to grow the buffer. Fill in the characters of your term into this buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, - or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to set the length of the term text. See LUCENE-969 for details.
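A minimal sketch of the Token-reuse calls named in the updated javadoc above (copyBuffer and setEmpty().append() in place of the removed setTermBuffer variants). It is illustrative only, not part of this patch; the helper class is hypothetical and assumes Token exposes the CharTermAttribute-style methods used by the hunks below.

import org.apache.lucene.analysis.Token;

public class TokenReuseSketch {
  /** Fills a reused Token from a String, as the rewritten constructors below do. */
  public static Token fromString(Token reusable, String text, int start, int end) {
    reusable.setEmpty().append(text);         // replaces setTermBuffer(String)
    reusable.setStartOffset(start);
    reusable.setEndOffset(end);
    return reusable;
  }

  /** Fills a reused Token from a char[] slice. */
  public static Token fromBuffer(Token reusable, char[] buf, int offset, int length) {
    reusable.copyBuffer(buf, offset, length); // replaces setTermBuffer(char[], int, int)
    return reusable;
  }
}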

@@ -100,7 +100,7 @@
  • Copying from one one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):
    -    return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
    +    return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]);
       
  • @@ -115,6 +115,7 @@ @see org.apache.lucene.index.Payload */ +// TODO: change superclass to CharTermAttribute in 4.0! public class Token extends TermAttributeImpl implements TypeAttribute, PositionIncrementAttribute, FlagsAttribute, OffsetAttribute, PayloadAttribute { @@ -172,7 +173,7 @@ * @param end end offset */ public Token(String text, int start, int end) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; } @@ -187,7 +188,7 @@ * @param typ token type */ public Token(String text, int start, int end, String typ) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; type = typ; @@ -204,7 +205,7 @@ * @param flags token type bits */ public Token(String text, int start, int end, int flags) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; this.flags = flags; @@ -221,7 +222,7 @@ * @param end */ public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { - setTermBuffer(startTermBuffer, termBufferOffset, termBufferLength); + copyBuffer(startTermBuffer, termBufferOffset, termBufferLength); startOffset = start; endOffset = end; } @@ -270,7 +271,7 @@ corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be - equal to {@link #termLength}, as the term text may have been altered by a + equal to {@link #length}, as the term text may have been altered by a stemmer or some other filter. */ public final int startOffset() { return startOffset; @@ -351,7 +352,7 @@ @Override public String toString() { final StringBuilder sb = new StringBuilder(); - sb.append('(').append(term()).append(',') + sb.append('(').append(super.toString()).append(',') .append(startOffset).append(',').append(endOffset); if (!"word".equals(type)) sb.append(",type=").append(type); @@ -387,7 +388,7 @@ /** Makes a clone, but replaces the term buffer & * start/end offset in the process. This is more * efficient than doing a full clone (and then calling - * setTermBuffer) because it saves a wasted copy of the old + * {@link #copyBuffer}) because it saves a wasted copy of the old * termBuffer. 
*/ public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); @@ -442,16 +443,16 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(char[], int, int)}, + * {@link #copyBuffer(char[], int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset}, * {@link #setType} * @return this Token instance */ public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { clearNoTermBuffer(); + copyBuffer(newTermBuffer, newTermOffset, newTermLength); payload = null; positionIncrement = 1; - setTermBuffer(newTermBuffer, newTermOffset, newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -459,14 +460,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(char[], int, int)}, + * {@link #copyBuffer(char[], int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { clearNoTermBuffer(); - setTermBuffer(newTermBuffer, newTermOffset, newTermLength); + copyBuffer(newTermBuffer, newTermOffset, newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -474,14 +475,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String)}, + * {@link #append(CharSequence)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} * @return this Token instance */ public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) { - clearNoTermBuffer(); - setTermBuffer(newTerm); + clear(); + append(newTerm); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -489,14 +490,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String, int, int)}, + * {@link #append(CharSequence, int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} * @return this Token instance */ public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { - clearNoTermBuffer(); - setTermBuffer(newTerm, newTermOffset, newTermLength); + clear(); + append(newTerm, newTermOffset, newTermOffset + newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -504,14 +505,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String)}, + * {@link #append(CharSequence)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(String newTerm, int newStartOffset, int newEndOffset) { - clearNoTermBuffer(); - setTermBuffer(newTerm); + clear(); + append(newTerm); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -519,14 +520,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String, int, int)}, + * {@link #append(CharSequence, int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { - clearNoTermBuffer(); - setTermBuffer(newTerm, newTermOffset, 
newTermLength); + clear(); + append(newTerm, newTermOffset, newTermOffset + newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -538,7 +539,7 @@ * @param prototype */ public void reinit(Token prototype) { - setTermBuffer(prototype.termBuffer(), 0, prototype.termLength()); + copyBuffer(prototype.buffer(), 0, prototype.length()); positionIncrement = prototype.positionIncrement; flags = prototype.flags; startOffset = prototype.startOffset; @@ -553,7 +554,7 @@ * @param newTerm */ public void reinit(Token prototype, String newTerm) { - setTermBuffer(newTerm); + setEmpty().append(newTerm); positionIncrement = prototype.positionIncrement; flags = prototype.flags; startOffset = prototype.startOffset; @@ -570,7 +571,7 @@ * @param length */ public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) { - setTermBuffer(newTermBuffer, offset, length); + copyBuffer(newTermBuffer, offset, length); positionIncrement = prototype.positionIncrement; flags = prototype.flags; startOffset = prototype.startOffset; Index: src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java (revision 931099) +++ src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java (working copy) @@ -21,7 +21,9 @@ /** * The term text of a Token. + * @deprecated Use {@link CharTermAttribute} instead. */ +@Deprecated public interface TermAttribute extends Attribute { /** Returns the Token's term text. * Index: src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java (revision 931099) +++ src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java (working copy) @@ -17,211 +17,11 @@ * limitations under the License. */ -import java.io.Serializable; - -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.RamUsageEstimator; - /** * The term text of a Token. + * @deprecated This class is only available for AttributeSource + * to be able to load an old TermAttribute implementation class. */ -public class TermAttributeImpl extends AttributeImpl implements TermAttribute, Cloneable, Serializable { - private static int MIN_BUFFER_SIZE = 10; - - private char[] termBuffer; - private int termLength; - - /** Returns the Token's term text. - * - * This method has a performance penalty - * because the text is stored internally in a char[]. If - * possible, use {@link #termBuffer()} and {@link - * #termLength()} directly instead. If you really need a - * String, use this method, which is nothing more than - * a convenience call to new String(token.termBuffer(), 0, token.termLength()) - */ - public String term() { - initTermBuffer(); - return new String(termBuffer, 0, termLength); - } - - /** Copies the contents of buffer, starting at offset for - * length characters, into the termBuffer array. 
- * @param buffer the buffer to copy - * @param offset the index in the buffer of the first character to copy - * @param length the number of characters to copy - */ - public void setTermBuffer(char[] buffer, int offset, int length) { - growTermBuffer(length); - System.arraycopy(buffer, offset, termBuffer, 0, length); - termLength = length; - } - - /** Copies the contents of buffer into the termBuffer array. - * @param buffer the buffer to copy - */ - public void setTermBuffer(String buffer) { - int length = buffer.length(); - growTermBuffer(length); - buffer.getChars(0, length, termBuffer, 0); - termLength = length; - } - - /** Copies the contents of buffer, starting at offset and continuing - * for length characters, into the termBuffer array. - * @param buffer the buffer to copy - * @param offset the index in the buffer of the first character to copy - * @param length the number of characters to copy - */ - public void setTermBuffer(String buffer, int offset, int length) { - assert offset <= buffer.length(); - assert offset + length <= buffer.length(); - growTermBuffer(length); - buffer.getChars(offset, offset + length, termBuffer, 0); - termLength = length; - } - - /** Returns the internal termBuffer character array which - * you can then directly alter. If the array is too - * small for your token, use {@link - * #resizeTermBuffer(int)} to increase it. After - * altering the buffer be sure to call {@link - * #setTermLength} to record the number of valid - * characters that were placed into the termBuffer. */ - public char[] termBuffer() { - initTermBuffer(); - return termBuffer; - } - - /** Grows the termBuffer to at least size newSize, preserving the - * existing content. Note: If the next operation is to change - * the contents of the term buffer use - * {@link #setTermBuffer(char[], int, int)}, - * {@link #setTermBuffer(String)}, or - * {@link #setTermBuffer(String, int, int)} - * to optimally combine the resize with the setting of the termBuffer. - * @param newSize minimum size of the new termBuffer - * @return newly created termBuffer with length >= newSize - */ - public char[] resizeTermBuffer(int newSize) { - if (termBuffer == null) { - // The buffer is always at least MIN_BUFFER_SIZE - termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } else { - if(termBuffer.length < newSize){ - // Not big enough; create a new array with slight - // over allocation and preserve content - final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); - termBuffer = newCharBuffer; - } - } - return termBuffer; - } - - - /** Allocates a buffer char[] of at least newSize, without preserving the existing content. - * its always used in places that set the content - * @param newSize minimum size of the buffer - */ - private void growTermBuffer(int newSize) { - if (termBuffer == null) { - // The buffer is always at least MIN_BUFFER_SIZE - termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? 
MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } else { - if(termBuffer.length < newSize){ - // Not big enough; create a new array with slight - // over allocation: - termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; - } - } - } - - private void initTermBuffer() { - if (termBuffer == null) { - termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)]; - termLength = 0; - } - } - - /** Return number of valid characters (length of the term) - * in the termBuffer array. */ - public int termLength() { - return termLength; - } - - /** Set number of valid characters (length of the term) in - * the termBuffer array. Use this to truncate the termBuffer - * or to synchronize with external manipulation of the termBuffer. - * Note: to grow the size of the array, - * use {@link #resizeTermBuffer(int)} first. - * @param length the truncated length - */ - public void setTermLength(int length) { - initTermBuffer(); - if (length > termBuffer.length) - throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); - termLength = length; - } - - @Override - public int hashCode() { - initTermBuffer(); - int code = termLength; - code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); - return code; - } - - @Override - public void clear() { - termLength = 0; - } - - @Override - public Object clone() { - TermAttributeImpl t = (TermAttributeImpl)super.clone(); - // Do a deep clone - if (termBuffer != null) { - t.termBuffer = termBuffer.clone(); - } - return t; - } - - @Override - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof TermAttributeImpl) { - initTermBuffer(); - TermAttributeImpl o = ((TermAttributeImpl) other); - o.initTermBuffer(); - - if (termLength != o.termLength) - return false; - for(int i=0;i> DocumentsWriter.BYTE_BLOCK_SHIFT]; assert slice != null; @@ -48,6 +50,7 @@ } /** Write byte into byte slice stream */ + @Override public void writeByte(byte b) { assert slice != null; if (slice[upto] != 0) { @@ -60,6 +63,7 @@ assert upto != slice.length; } + @Override public void writeBytes(final byte[] b, int offset, final int len) { final int offsetEnd = offset + len; while(offset < offsetEnd) { @@ -78,12 +82,4 @@ public int getAddress() { return upto + (offset0 & DocumentsWriter.BYTE_BLOCK_NOT_MASK); } - - public void writeVInt(int i) { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte) i); - } -} +} \ No newline at end of file Index: src/java/org/apache/lucene/index/CharBlockPool.java =================================================================== --- src/java/org/apache/lucene/index/CharBlockPool.java (revision 931099) +++ src/java/org/apache/lucene/index/CharBlockPool.java (working copy) @@ -1,60 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; -import org.apache.lucene.util.ArrayUtil; - -final class CharBlockPool { - - public char[][] buffers = new char[10][]; - int numBuffer; - - int bufferUpto = -1; // Which buffer we are upto - public int charUpto = DocumentsWriter.CHAR_BLOCK_SIZE; // Where we are in head buffer - - public char[] buffer; // Current head buffer - public int charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; // Current head offset - final private DocumentsWriter docWriter; - - public CharBlockPool(DocumentsWriter docWriter) { - this.docWriter = docWriter; - } - - public void reset() { - docWriter.recycleCharBlocks(buffers, 1+bufferUpto); - bufferUpto = -1; - charUpto = DocumentsWriter.CHAR_BLOCK_SIZE; - charOffset = -DocumentsWriter.CHAR_BLOCK_SIZE; - } - - public void nextBuffer() { - if (1+bufferUpto == buffers.length) { - char[][] newBuffers = new char[ArrayUtil.oversize(buffers.length+1, - NUM_BYTES_OBJECT_REF)][]; - System.arraycopy(buffers, 0, newBuffers, 0, buffers.length); - buffers = newBuffers; - } - buffer = buffers[1+bufferUpto] = docWriter.getCharBlock(); - bufferUpto++; - - charUpto = 0; - charOffset += DocumentsWriter.CHAR_BLOCK_SIZE; - } -} - Index: src/java/org/apache/lucene/index/CheckIndex.java =================================================================== --- src/java/org/apache/lucene/index/CheckIndex.java (revision 931099) +++ src/java/org/apache/lucene/index/CheckIndex.java (working copy) @@ -22,6 +22,9 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.document.AbstractField; // for javadocs import org.apache.lucene.document.Document; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; import java.text.NumberFormat; import java.io.PrintStream; @@ -122,6 +125,9 @@ /** Name of the segment. */ public String name; + /** Name of codec used to read this segment. */ + public String codec; + /** Document count (does not take deletions into account). */ public int docCount; @@ -263,26 +269,6 @@ infoStream.println(msg); } - private static class MySegmentTermDocs extends SegmentTermDocs { - - int delCount; - - MySegmentTermDocs(SegmentReader p) { - super(p); - } - - @Override - public void seek(Term term) throws IOException { - super.seek(term); - delCount = 0; - } - - @Override - protected void skippingDoc() throws IOException { - delCount++; - } - } - /** Returns a {@link Status} instance detailing * the state of the index. * @@ -296,6 +282,10 @@ return checkIndex(null); } + protected Status checkIndex(List onlySegments) throws IOException { + return checkIndex(onlySegments, CodecProvider.getDefault()); + } + /** Returns a {@link Status} instance detailing * the state of the index. * @@ -308,13 +298,13 @@ *

    WARNING: make sure * you only call this when the index is not opened by any * writer. */ - public Status checkIndex(List onlySegments) throws IOException { + protected Status checkIndex(List onlySegments, CodecProvider codecs) throws IOException { NumberFormat nf = NumberFormat.getInstance(); SegmentInfos sis = new SegmentInfos(); Status result = new Status(); result.dir = dir; try { - sis.read(dir); + sis.read(dir, codecs); } catch (Throwable t) { msg("ERROR: could not read any segments file in directory"); result.missingSegments = true; @@ -371,6 +361,8 @@ sFormat = "FORMAT_USER_DATA [Lucene 2.9]"; else if (format == SegmentInfos.FORMAT_DIAGNOSTICS) sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]"; + else if (format == SegmentInfos.FORMAT_FLEX_POSTINGS) + sFormat = "FORMAT_FLEX_POSTINGS [Lucene 3.1]"; else if (format < SegmentInfos.CURRENT_FORMAT) { sFormat = "int=" + format + " [newer version of Lucene than this tool]"; skip = true; @@ -429,6 +421,9 @@ SegmentReader reader = null; try { + final String codec = info.getCodec().name; + msg(" codec=" + codec); + segInfoStat.codec = codec; msg(" compound=" + info.getUseCompoundFile()); segInfoStat.compound = info.getUseCompoundFile(); msg(" hasProx=" + info.getHasProx()); @@ -452,6 +447,7 @@ msg(" docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile()); segInfoStat.docStoreCompoundFile = info.getDocStoreIsCompoundFile(); } + final String delFileName = info.getDelFileName(); if (delFileName == null){ msg(" no deletions"); @@ -503,7 +499,7 @@ segInfoStat.fieldNormStatus = testFieldNorms(fieldNames, reader); // Test the Term Index - segInfoStat.termIndexStatus = testTermIndex(info, reader); + segInfoStat.termIndexStatus = testTermIndex(reader); // Test Stored Fields segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf); @@ -586,69 +582,129 @@ /** * Test the term index. 
*/ - private Status.TermIndexStatus testTermIndex(SegmentInfo info, SegmentReader reader) { + private Status.TermIndexStatus testTermIndex(SegmentReader reader) { final Status.TermIndexStatus status = new Status.TermIndexStatus(); + final int maxDoc = reader.maxDoc(); + final Bits delDocs = reader.getDeletedDocs(); + try { + if (infoStream != null) { infoStream.print(" test: terms, freq, prox..."); } - final TermEnum termEnum = reader.terms(); - final TermPositions termPositions = reader.termPositions(); + final Fields fields = reader.fields(); + if (fields == null) { + msg("OK [no fields/terms]"); + return status; + } + + final FieldsEnum fieldsEnum = fields.iterator(); + while(true) { + final String field = fieldsEnum.next(); + if (field == null) { + break; + } + + final TermsEnum terms = fieldsEnum.terms(); - // Used only to count up # deleted docs for this term - final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader); + DocsEnum docs = null; + DocsAndPositionsEnum postings = null; - final int maxDoc = reader.maxDoc(); + boolean hasOrd = true; + final long termCountStart = status.termCount; - while (termEnum.next()) { - status.termCount++; - final Term term = termEnum.term(); - final int docFreq = termEnum.docFreq(); - termPositions.seek(term); - int lastDoc = -1; - int freq0 = 0; - status.totFreq += docFreq; - while (termPositions.next()) { - freq0++; - final int doc = termPositions.doc(); - final int freq = termPositions.freq(); - if (doc <= lastDoc) - throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc); - if (doc >= maxDoc) - throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc); + while(true) { - lastDoc = doc; - if (freq <= 0) - throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); + final BytesRef term = terms.next(); + if (term == null) { + break; + } + + final int docFreq = terms.docFreq(); + status.totFreq += docFreq; + + docs = terms.docs(delDocs, docs); + postings = terms.docsAndPositions(delDocs, postings); + + if (hasOrd) { + long ord = -1; + try { + ord = terms.ord(); + } catch (UnsupportedOperationException uoe) { + hasOrd = false; + } + + if (hasOrd) { + final long ordExpected = status.termCount - termCountStart; + if (ord != ordExpected) { + throw new RuntimeException("ord mismatch: TermsEnum has ord=" + ord + " vs actual=" + ordExpected); + } + } + } + + status.termCount++; + + final DocsEnum docs2; + if (postings != null) { + docs2 = postings; + } else { + docs2 = docs; + } + + int lastDoc = -1; + while(true) { + final int doc = docs2.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + break; + } + final int freq = docs2.freq(); + status.totPos += freq; + + if (doc <= lastDoc) { + throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc); + } + if (doc >= maxDoc) { + throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc); + } + + lastDoc = doc; + if (freq <= 0) { + throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); + } - int lastPos = -1; - status.totPos += freq; - for(int j=0;j>>= 1; - } else { - delta = skipStream.readVInt(); - } - freqPointer[level] += skipStream.readVInt(); - proxPointer[level] += skipStream.readVInt(); - - return delta; - } -} Index: src/java/org/apache/lucene/index/DefaultSkipListWriter.java =================================================================== --- 
src/java/org/apache/lucene/index/DefaultSkipListWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/DefaultSkipListWriter.java (working copy) @@ -1,136 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.lucene.store.IndexOutput; - - -/** - * Implements the skip list writer for the default posting list format - * that stores positions and payloads. - * - */ -class DefaultSkipListWriter extends MultiLevelSkipListWriter { - private int[] lastSkipDoc; - private int[] lastSkipPayloadLength; - private long[] lastSkipFreqPointer; - private long[] lastSkipProxPointer; - - private IndexOutput freqOutput; - private IndexOutput proxOutput; - - private int curDoc; - private boolean curStorePayloads; - private int curPayloadLength; - private long curFreqPointer; - private long curProxPointer; - - DefaultSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, IndexOutput freqOutput, IndexOutput proxOutput) { - super(skipInterval, numberOfSkipLevels, docCount); - this.freqOutput = freqOutput; - this.proxOutput = proxOutput; - - lastSkipDoc = new int[numberOfSkipLevels]; - lastSkipPayloadLength = new int[numberOfSkipLevels]; - lastSkipFreqPointer = new long[numberOfSkipLevels]; - lastSkipProxPointer = new long[numberOfSkipLevels]; - } - - void setFreqOutput(IndexOutput freqOutput) { - this.freqOutput = freqOutput; - } - - void setProxOutput(IndexOutput proxOutput) { - this.proxOutput = proxOutput; - } - - /** - * Sets the values for the current skip data. - */ - void setSkipData(int doc, boolean storePayloads, int payloadLength) { - this.curDoc = doc; - this.curStorePayloads = storePayloads; - this.curPayloadLength = payloadLength; - this.curFreqPointer = freqOutput.getFilePointer(); - if (proxOutput != null) - this.curProxPointer = proxOutput.getFilePointer(); - } - - @Override - protected void resetSkip() { - super.resetSkip(); - Arrays.fill(lastSkipDoc, 0); - Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list - Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer()); - if (proxOutput != null) - Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer()); - } - - @Override - protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { - // To efficiently store payloads in the posting lists we do not store the length of - // every payload. Instead we omit the length for a payload if the previous payload had - // the same length. - // However, in order to support skipping the payload length at every skip point must be known. 
- // So we use the same length encoding that we use for the posting lists for the skip data as well: - // Case 1: current field does not store payloads - // SkipDatum --> DocSkip, FreqSkip, ProxSkip - // DocSkip,FreqSkip,ProxSkip --> VInt - // DocSkip records the document number before every SkipInterval th document in TermFreqs. - // Document numbers are represented as differences from the previous value in the sequence. - // Case 2: current field stores payloads - // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip - // DocSkip,FreqSkip,ProxSkip --> VInt - // PayloadLength --> VInt - // In this case DocSkip/2 is the difference between - // the current and the previous value. If DocSkip - // is odd, then a PayloadLength encoded as VInt follows, - // if DocSkip is even, then it is assumed that the - // current payload length equals the length at the previous - // skip point - if (curStorePayloads) { - int delta = curDoc - lastSkipDoc[level]; - if (curPayloadLength == lastSkipPayloadLength[level]) { - // the current payload length equals the length at the previous skip point, - // so we don't store the length again - skipBuffer.writeVInt(delta * 2); - } else { - // the payload length is different from the previous one. We shift the DocSkip, - // set the lowest bit and store the current payload length as VInt. - skipBuffer.writeVInt(delta * 2 + 1); - skipBuffer.writeVInt(curPayloadLength); - lastSkipPayloadLength[level] = curPayloadLength; - } - } else { - // current field does not store payloads - skipBuffer.writeVInt(curDoc - lastSkipDoc[level]); - } - skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level])); - skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level])); - - lastSkipDoc[level] = curDoc; - //System.out.println("write doc at level " + level + ": " + curDoc); - - lastSkipFreqPointer[level] = curFreqPointer; - lastSkipProxPointer[level] = curProxPointer; - } - -} Index: src/java/org/apache/lucene/index/DirectoryReader.java =================================================================== --- src/java/org/apache/lucene/index/DirectoryReader.java (revision 931099) +++ src/java/org/apache/lucene/index/DirectoryReader.java (working copy) @@ -25,7 +25,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; - +import java.util.List; import java.util.Map; import java.util.Set; @@ -35,6 +35,11 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.Lock; import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.BytesRef; + import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close /** @@ -43,13 +48,14 @@ class DirectoryReader extends IndexReader implements Cloneable { protected Directory directory; protected boolean readOnly; + + protected CodecProvider codecs; IndexWriter writer; private IndexDeletionPolicy deletionPolicy; private Lock writeLock; private SegmentInfos segmentInfos; - private SegmentInfos segmentInfosStart; private boolean stale; private final int termInfosIndexDivisor; @@ -58,34 +64,57 @@ private SegmentReader[] subReaders; private int[] starts; // 1st docno for each segment + private final Map subReaderToSlice = new HashMap(); private Map normsCache = new HashMap(); private int maxDoc = 0; private int numDocs = -1; private boolean hasDeletions = false; +// 
static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, +// final int termInfosIndexDivisor) throws CorruptIndexException, IOException { +// return open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor, null); +// } + static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, - final int termInfosIndexDivisor) throws CorruptIndexException, IOException { + final int termInfosIndexDivisor, CodecProvider codecs) throws CorruptIndexException, IOException { + final CodecProvider codecs2; + if (codecs == null) { + codecs2 = CodecProvider.getDefault(); + } else { + codecs2 = codecs; + } return (IndexReader) new SegmentInfos.FindSegmentsFile(directory) { @Override protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { SegmentInfos infos = new SegmentInfos(); - infos.read(directory, segmentFileName); + infos.read(directory, segmentFileName, codecs2); if (readOnly) - return new ReadOnlyDirectoryReader(directory, infos, deletionPolicy, termInfosIndexDivisor); + return new ReadOnlyDirectoryReader(directory, infos, deletionPolicy, termInfosIndexDivisor, codecs2); else - return new DirectoryReader(directory, infos, deletionPolicy, false, termInfosIndexDivisor); + return new DirectoryReader(directory, infos, deletionPolicy, false, termInfosIndexDivisor, codecs2); } }.run(commit); } /** Construct reading the named set of readers. */ - DirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws IOException { +// DirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws IOException { +// this(directory, sis, deletionPolicy, readOnly, termInfosIndexDivisor, null); +// } + + /** Construct reading the named set of readers. 
*/ + DirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { this.directory = directory; this.readOnly = readOnly; this.segmentInfos = sis; this.deletionPolicy = deletionPolicy; this.termInfosIndexDivisor = termInfosIndexDivisor; + if (codecs == null) { + this.codecs = CodecProvider.getDefault(); + } else { + this.codecs = codecs; + } + // To reduce the chance of hitting FileNotFound // (and having to retry), we open segments in // reverse because IndexWriter merges & deletes @@ -115,12 +144,16 @@ } // Used by near real-time search - DirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor) throws IOException { + DirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { this.directory = writer.getDirectory(); this.readOnly = true; segmentInfos = infos; - segmentInfosStart = (SegmentInfos) infos.clone(); this.termInfosIndexDivisor = termInfosIndexDivisor; + if (codecs == null) { + this.codecs = CodecProvider.getDefault(); + } else { + this.codecs = codecs; + } // IndexWriter synchronizes externally before calling // us, which ensures infos will not change; so there's @@ -166,11 +199,17 @@ /** This constructor is only used for {@link #reopen()} */ DirectoryReader(Directory directory, SegmentInfos infos, SegmentReader[] oldReaders, int[] oldStarts, - Map oldNormsCache, boolean readOnly, boolean doClone, int termInfosIndexDivisor) throws IOException { + Map oldNormsCache, boolean readOnly, boolean doClone, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { this.directory = directory; this.readOnly = readOnly; this.segmentInfos = infos; this.termInfosIndexDivisor = termInfosIndexDivisor; + if (codecs == null) { + this.codecs = CodecProvider.getDefault(); + } else { + this.codecs = codecs; + } + // we put the old SegmentReaders in a map, that allows us // to lookup a reader using its segment name @@ -296,25 +335,45 @@ buffer.append(' '); } buffer.append(subReaders[i]); + buffer.append(' '); } buffer.append(')'); return buffer.toString(); } - private void initialize(SegmentReader[] subReaders) { + private void initialize(SegmentReader[] subReaders) throws IOException { this.subReaders = subReaders; starts = new int[subReaders.length + 1]; // build starts array + + final List subFields = new ArrayList(); + final List fieldSlices = new ArrayList(); + for (int i = 0; i < subReaders.length; i++) { starts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); // compute maxDocs - if (subReaders[i].hasDeletions()) + if (subReaders[i].hasDeletions()) { hasDeletions = true; + } + + final ReaderUtil.Slice slice = new ReaderUtil.Slice(starts[i], subReaders[i].maxDoc(), i); + subReaderToSlice.put(subReaders[i], slice); + + final Fields f = subReaders[i].fields(); + if (f != null) { + subFields.add(f); + fieldSlices.add(slice); + } } starts[subReaders.length] = maxDoc; } @Override + public Bits getDeletedDocs() { + throw new UnsupportedOperationException("please use MultiFields.getDeletedDocs if you really need a top level Bits deletedDocs (NOTE that it's usually better to work per segment instead)"); + } + + @Override public final synchronized Object clone() { try { return clone(readOnly); // Preserve current readOnly @@ -435,7 +494,7 @@ @Override protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { SegmentInfos infos = new SegmentInfos(); - 
infos.read(directory, segmentFileName); + infos.read(directory, segmentFileName, codecs); return doReopen(infos, false, openReadOnly); } }.run(commit); @@ -444,9 +503,9 @@ private synchronized DirectoryReader doReopen(SegmentInfos infos, boolean doClone, boolean openReadOnly) throws CorruptIndexException, IOException { DirectoryReader reader; if (openReadOnly) { - reader = new ReadOnlyDirectoryReader(directory, infos, subReaders, starts, normsCache, doClone, termInfosIndexDivisor); + reader = new ReadOnlyDirectoryReader(directory, infos, subReaders, starts, normsCache, doClone, termInfosIndexDivisor, null); } else { - reader = new DirectoryReader(directory, infos, subReaders, starts, normsCache, false, doClone, termInfosIndexDivisor); + reader = new DirectoryReader(directory, infos, subReaders, starts, normsCache, false, doClone, termInfosIndexDivisor, null); } return reader; } @@ -640,7 +699,7 @@ // Optimize single segment case: return subReaders[0].terms(); } else { - return new MultiTermEnum(this, subReaders, starts, null); + return new MultiTermEnum(this, subReaders, starts, null); } } @@ -665,6 +724,16 @@ } @Override + public int docFreq(String field, BytesRef term) throws IOException { + ensureOpen(); + int total = 0; // sum freqs in segments + for (int i = 0; i < subReaders.length; i++) { + total += subReaders[i].docFreq(field, term); + } + return total; + } + + @Override public TermDocs termDocs() throws IOException { ensureOpen(); if (subReaders.length == 1) { @@ -687,6 +756,11 @@ } @Override + public Fields fields() throws IOException { + throw new UnsupportedOperationException("please use MultiFields.getFields if you really need a top level Fields (NOTE that it's usually better to work per segment instead)"); + } + + @Override public TermPositions termPositions() throws IOException { ensureOpen(); if (subReaders.length == 1) { @@ -731,7 +805,7 @@ // we have to check whether index has changed since this reader was opened. // if so, this reader is no longer valid for deletion - if (SegmentInfos.readCurrentVersion(directory) > segmentInfos.getVersion()) { + if (SegmentInfos.readCurrentVersion(directory, codecs) > segmentInfos.getVersion()) { stale = true; this.writeLock.release(); this.writeLock = null; @@ -751,13 +825,18 @@ */ @Override protected void doCommit(Map commitUserData) throws IOException { + // poll subreaders for changes + for (int i = 0; !hasChanges && i < subReaders.length; i++) { + hasChanges |= subReaders[i].hasChanges; + } + if (hasChanges) { segmentInfos.setUserData(commitUserData); // Default deleter (for backwards compatibility) is // KeepOnlyLastCommitDeleter: IndexFileDeleter deleter = new IndexFileDeleter(directory, deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, - segmentInfos, null, null); + segmentInfos, null, null, codecs); // Checkpoint the state we are about to change, in // case we have to roll back: @@ -828,20 +907,30 @@ } @Override + public long getUniqueTermCount() throws IOException { + throw new UnsupportedOperationException(""); + } + + @Override public Map getCommitUserData() { ensureOpen(); return segmentInfos.getUserData(); } + /** + * Check whether this IndexReader is still using the current (i.e., most recently committed) version of the index. If + * a writer has committed any changes to the index since this reader was opened, this will return false, + * in which case you must open a new IndexReader in order + * to see the changes. Use {@link IndexWriter#commit} to + * commit changes to the index. 
+ * + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ @Override public boolean isCurrent() throws CorruptIndexException, IOException { ensureOpen(); - if (writer == null || writer.isClosed()) { - // we loaded SegmentInfos from the directory - return SegmentInfos.readCurrentVersion(directory) == segmentInfos.getVersion(); - } else { - return writer.nrtIsCurrent(segmentInfosStart); - } + return SegmentInfos.readCurrentVersion(directory, codecs) == segmentInfos.getVersion(); } @Override @@ -893,6 +982,11 @@ return subReaders; } + @Override + public int getSubReaderDocBase(IndexReader subReader) { + return subReaderToSlice.get(subReader).start; + } + /** Returns the directory this index resides in. */ @Override public Directory directory() { @@ -919,12 +1013,17 @@ /** @see org.apache.lucene.index.IndexReader#listCommits */ public static Collection listCommits(Directory dir) throws IOException { + return listCommits(dir, CodecProvider.getDefault()); + } + + /** @see org.apache.lucene.index.IndexReader#listCommits */ + public static Collection listCommits(Directory dir, CodecProvider codecs) throws IOException { final String[] files = dir.listAll(); Collection commits = new ArrayList(); SegmentInfos latest = new SegmentInfos(); - latest.read(dir); + latest.read(dir, codecs); final long currentGen = latest.getGeneration(); commits.add(new ReaderCommit(latest, dir)); @@ -941,7 +1040,7 @@ try { // IOException allowed to throw there, in case // segments_N is corrupt - sis.read(dir, fileName); + sis.read(dir, fileName, codecs); } catch (FileNotFoundException fnfe) { // LUCENE-948: on NFS (and maybe others), if // you have writers switching back and forth @@ -1020,30 +1119,34 @@ return userData; } } - + + // @deprecated This is pre-flex API + // Exposes pre-flex API by doing on-the-fly merging + // pre-flex API to each segment static class MultiTermEnum extends TermEnum { IndexReader topReader; // used for matching TermEnum to TermDocs - private SegmentMergeQueue queue; + private LegacySegmentMergeQueue queue; private Term term; private int docFreq; - final SegmentMergeInfo[] matchingSegments; // null terminated array of matching segments + final LegacySegmentMergeInfo[] matchingSegments; // null terminated array of matching segments public MultiTermEnum(IndexReader topReader, IndexReader[] readers, int[] starts, Term t) throws IOException { this.topReader = topReader; - queue = new SegmentMergeQueue(readers.length); - matchingSegments = new SegmentMergeInfo[readers.length+1]; + queue = new LegacySegmentMergeQueue(readers.length); + matchingSegments = new LegacySegmentMergeInfo[readers.length+1]; for (int i = 0; i < readers.length; i++) { IndexReader reader = readers[i]; TermEnum termEnum; if (t != null) { termEnum = reader.terms(t); - } else + } else { termEnum = reader.terms(); + } - SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader); + LegacySegmentMergeInfo smi = new LegacySegmentMergeInfo(starts[i], termEnum, reader); smi.ord = i; if (t == null ? 
smi.next() : termEnum.term() != null) queue.add(smi); // initialize queue @@ -1059,7 +1162,7 @@ @Override public boolean next() throws IOException { for (int i=0; i 0) { - TermDocs docs = reader.termDocs(); try { + Fields fields = reader.fields(); + TermsEnum termsEnum = null; + + String currentField = null; + BytesRef termRef = new BytesRef(); + DocsEnum docs = null; + for (Entry entry: deletesFlushed.terms.entrySet()) { Term term = entry.getKey(); - // LUCENE-2086: we should be iterating a TreeMap, - // here, so terms better be in order: + // Since we visit terms sorted, we gain performance + // by re-using the same TermsEnum and seeking only + // forwards + if (term.field() != currentField) { + assert currentField == null || currentField.compareTo(term.field()) < 0; + currentField = term.field(); + Terms terms = fields.terms(currentField); + if (terms != null) { + termsEnum = terms.iterator(); + } else { + termsEnum = null; + } + } + + if (termsEnum == null) { + continue; + } assert checkDeleteTerm(term); - docs.seek(term); - int limit = entry.getValue().getNum(); - while (docs.next()) { - int docID = docs.doc(); - if (docIDStart+docID >= limit) - break; - reader.deleteDocument(docID); - any = true; + + termRef.copy(term.text()); + + if (termsEnum.seek(termRef, false) == TermsEnum.SeekStatus.FOUND) { + DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); + + if (docsEnum != null) { + docs = docsEnum; + int limit = entry.getValue().getNum(); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS || docIDStart+docID >= limit) { + break; + } + reader.deleteDocument(docID); + any = true; + } + } } } } finally { - docs.close(); + //docs.close(); } } - // Delete by docID for (Integer docIdInt : deletesFlushed.docIDs) { int docID = docIdInt.intValue(); @@ -1118,7 +1156,7 @@ } synchronized boolean doBalanceRAM() { - return ramBufferSize != IndexWriterConfig.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger); + return ramBufferSize != IndexWriterConfig.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize); } /** Does the synchronized work to finish/flush the @@ -1201,7 +1239,6 @@ return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed; } - long numBytesAlloc; long numBytesUsed; NumberFormat nf = NumberFormat.getInstance(); @@ -1243,6 +1280,8 @@ final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1; final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK; + final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2; + private class ByteBlockAllocator extends ByteBlockPool.Allocator { final int blockSize; @@ -1259,19 +1298,16 @@ final int size = freeByteBlocks.size(); final byte[] b; if (0 == size) { + b = new byte[blockSize]; // Always record a block allocated, even if // trackAllocations is false. This is necessary // because this block will be shared between // things that don't track allocations (term // vectors) and things that do (freq/prox // postings). 
- numBytesAlloc += blockSize; - b = new byte[blockSize]; + numBytesUsed += blockSize; } else b = freeByteBlocks.remove(size-1); - if (trackAllocations) - numBytesUsed += blockSize; - assert numBytesUsed <= numBytesAlloc; return b; } } @@ -1291,7 +1327,7 @@ final int size = blocks.size(); for(int i=0;i freeCharBlocks = new ArrayList(); - - /* Allocate another char[] from the shared pool */ - synchronized char[] getCharBlock() { - final int size = freeCharBlocks.size(); - final char[] c; - if (0 == size) { - numBytesAlloc += CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; - c = new char[CHAR_BLOCK_SIZE]; - } else - c = freeCharBlocks.remove(size-1); - // We always track allocations of char blocks, for now, - // because nothing that skips allocation tracking - // (currently only term vectors) uses its own char - // blocks. - numBytesUsed += CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; - assert numBytesUsed <= numBytesAlloc; - return c; - } - - /* Return char[]s to the pool */ - synchronized void recycleCharBlocks(char[][] blocks, int numBlocks) { - for(int i=0;i freeTrigger) { + if (numBytesUsed+deletesRAMUsed > ramBufferSize) { if (infoStream != null) message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) + - " vs trigger=" + toMB(flushTrigger) + - " allocMB=" + toMB(numBytesAlloc) + + " vs trigger=" + toMB(ramBufferSize) + " deletesMB=" + toMB(deletesRAMUsed) + - " vs trigger=" + toMB(freeTrigger) + " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) + - " perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE) + - " charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE)); + " perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE)); - final long startBytesAlloc = numBytesAlloc + deletesRAMUsed; + final long startBytesUsed = numBytesUsed + deletesRAMUsed; int iter = 0; @@ -1427,46 +1410,38 @@ boolean any = true; - while(numBytesAlloc+deletesRAMUsed > freeLevel) { + while(numBytesUsed+deletesRAMUsed > freeLevel) { synchronized(this) { - if (0 == perDocAllocator.freeByteBlocks.size() - && 0 == byteBlockAllocator.freeByteBlocks.size() - && 0 == freeCharBlocks.size() - && 0 == freeIntBlocks.size() - && !any) { + if (0 == perDocAllocator.freeByteBlocks.size() && + 0 == byteBlockAllocator.freeByteBlocks.size() && + 0 == freeIntBlocks.size() && !any) { // Nothing else to free -- must flush now. 
- bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger; + bufferIsFull = numBytesUsed+deletesRAMUsed > ramBufferSize; if (infoStream != null) { - if (numBytesUsed > flushTrigger) + if (numBytesUsed+deletesRAMUsed > ramBufferSize) message(" nothing to free; now set bufferIsFull"); else message(" nothing to free"); } - assert numBytesUsed <= numBytesAlloc; break; } - if ((0 == iter % 5) && byteBlockAllocator.freeByteBlocks.size() > 0) { + if ((0 == iter % 4) && byteBlockAllocator.freeByteBlocks.size() > 0) { byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1); - numBytesAlloc -= BYTE_BLOCK_SIZE; + numBytesUsed -= BYTE_BLOCK_SIZE; } - if ((1 == iter % 5) && freeCharBlocks.size() > 0) { - freeCharBlocks.remove(freeCharBlocks.size()-1); - numBytesAlloc -= CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; - } - - if ((2 == iter % 5) && freeIntBlocks.size() > 0) { + if ((1 == iter % 4) && freeIntBlocks.size() > 0) { freeIntBlocks.remove(freeIntBlocks.size()-1); - numBytesAlloc -= INT_BLOCK_SIZE * INT_NUM_BYTE; + numBytesUsed -= INT_BLOCK_SIZE * INT_NUM_BYTE; } - if ((3 == iter % 5) && perDocAllocator.freeByteBlocks.size() > 0) { + if ((2 == iter % 4) && perDocAllocator.freeByteBlocks.size() > 0) { // Remove upwards of 32 blocks (each block is 1K) for (int i = 0; i < 32; ++i) { perDocAllocator.freeByteBlocks.remove(perDocAllocator.freeByteBlocks.size() - 1); - numBytesAlloc -= PER_DOC_BLOCK_SIZE; + numBytesUsed -= PER_DOC_BLOCK_SIZE; if (perDocAllocator.freeByteBlocks.size() == 0) { break; } @@ -1474,7 +1449,7 @@ } } - if ((4 == iter % 5) && any) + if ((3 == iter % 4) && any) // Ask consumer to free any recycled state any = consumer.freeRAM(); @@ -1482,26 +1457,7 @@ } if (infoStream != null) - message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.)); - - } else { - // If we have not crossed the 100% mark, but have - // crossed the 95% mark of RAM we are actually - // using, go ahead and flush. This prevents - // over-allocating and then freeing, with every - // flush. - synchronized(this) { - - if (numBytesUsed+deletesRAMUsed > flushTrigger) { - if (infoStream != null) - message(" RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) + - " allocMB=" + nf.format(numBytesAlloc/1024./1024.) + - " deletesMB=" + nf.format(deletesRAMUsed/1024./1024.) + - " triggerMB=" + nf.format(flushTrigger/1024./1024.)); - - bufferIsFull = true; - } - } + message(" after free: freedMB=" + nf.format((startBytesUsed-numBytesUsed-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.)); } } Index: src/java/org/apache/lucene/index/FieldInfo.java =================================================================== --- src/java/org/apache/lucene/index/FieldInfo.java (revision 931099) +++ src/java/org/apache/lucene/index/FieldInfo.java (working copy) @@ -17,21 +17,22 @@ * limitations under the License. 
*/ -final class FieldInfo { - String name; - boolean isIndexed; - int number; +/** @lucene.experimental */ +public final class FieldInfo { + public String name; + public boolean isIndexed; + public int number; // true if term vector for this field should be stored boolean storeTermVector; boolean storeOffsetWithTermVector; boolean storePositionWithTermVector; - boolean omitNorms; // omit norms associated with indexed fields - boolean omitTermFreqAndPositions; - - boolean storePayloads; // whether this field stores payloads together with term positions + public boolean omitNorms; // omit norms associated with indexed fields + public boolean omitTermFreqAndPositions; + public boolean storePayloads; // whether this field stores payloads together with term positions + FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { Index: src/java/org/apache/lucene/index/FieldInfos.java =================================================================== --- src/java/org/apache/lucene/index/FieldInfos.java (revision 931099) +++ src/java/org/apache/lucene/index/FieldInfos.java (working copy) @@ -32,8 +32,9 @@ * of this class are thread-safe for multiple readers, but only one thread can * be adding documents at a time, with no other reader or writer threads * accessing this object. + * @lucene.experimental */ -final class FieldInfos { +public final class FieldInfos { // Used internally (ie not written to *.fnm files) for pre-2.9 files public static final int FORMAT_PRE = -1; @@ -120,7 +121,7 @@ } /** Returns true if any fields do not omitTermFreqAndPositions */ - boolean hasProx() { + public boolean hasProx() { final int numFields = byNumber.size(); for(int i=0;i 0 && delta <= 0)) - throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); - - if ((++df % skipInterval) == 0) { - // TODO: abstraction violation - skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength); - skipListWriter.bufferSkip(df); - } - - assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; - - lastDocID = docID; - if (omitTermFreqAndPositions) - out.writeVInt(delta); - else if (1 == termDocFreq) - out.writeVInt((delta<<1) | 1); - else { - out.writeVInt(delta<<1); - out.writeVInt(termDocFreq); - } - - return posWriter; - } - - private final TermInfo termInfo = new TermInfo(); // minimize consing - final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); - - /** Called when we are done adding docs to this term */ - @Override - void finish() throws IOException { - long skipPointer = skipListWriter.writeSkip(out); - - // TODO: this is abstraction violation -- we should not - // peek up into parents terms encoding format - termInfo.set(df, parent.freqStart, parent.proxStart, (int) (skipPointer - parent.freqStart)); - - // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8); - - if (df > 0) { - parent.termsOut.add(fieldInfo.number, - utf8.result, - utf8.length, - termInfo); - } - - lastDocID = 0; - df = 0; - } - - void close() throws IOException { - out.close(); - posWriter.close(); - } -} Index: src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (revision 931099) +++ 
src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (working copy) @@ -1,36 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -/** Abstract API that consumes terms, doc, freq, prox and - * payloads postings. Concrete implementations of this - * actually do "something" with the postings (write it into - * the index in a specific format). - * - * @lucene.experimental - */ -abstract class FormatPostingsFieldsConsumer { - - /** Add a new field */ - abstract FormatPostingsTermsConsumer addField(FieldInfo field) throws IOException; - - /** Called when we are done adding everything. */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java (working copy) @@ -1,75 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.store.Directory; - -final class FormatPostingsFieldsWriter extends FormatPostingsFieldsConsumer { - - final Directory dir; - final String segment; - final TermInfosWriter termsOut; - final FieldInfos fieldInfos; - final FormatPostingsTermsWriter termsWriter; - final DefaultSkipListWriter skipListWriter; - final int totalNumDocs; - - public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) throws IOException { - super(); - - dir = state.directory; - segment = state.segmentName; - totalNumDocs = state.numDocs; - this.fieldInfos = fieldInfos; - termsOut = new TermInfosWriter(dir, - segment, - fieldInfos, - state.termIndexInterval); - - // TODO: this is a nasty abstraction violation (that we - // peek down to find freqOut/proxOut) -- we need a - // better abstraction here whereby these child consumers - // can provide skip data or not - skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, - termsOut.maxSkipLevels, - totalNumDocs, - null, - null); - - state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION)); - state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)); - - termsWriter = new FormatPostingsTermsWriter(state, this); - } - - /** Add a new field */ - @Override - FormatPostingsTermsConsumer addField(FieldInfo field) { - termsWriter.setField(field); - return termsWriter; - } - - /** Called when we are done adding everything. */ - @Override - void finish() throws IOException { - termsOut.close(); - termsWriter.close(); - } -} Index: src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (revision 931099) +++ src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (working copy) @@ -1,31 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - - -abstract class FormatPostingsPositionsConsumer { - - /** Add a new position & payload. If payloadLength > 0 - * you must read those bytes from the IndexInput. 
*/ - abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; - - /** Called when we are done adding positions & payloads */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java (working copy) @@ -1,89 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.store.IndexOutput; - - -import java.io.IOException; - -final class FormatPostingsPositionsWriter extends FormatPostingsPositionsConsumer { - - final FormatPostingsDocsWriter parent; - final IndexOutput out; - - boolean omitTermFreqAndPositions; - boolean storePayloads; - int lastPayloadLength = -1; - - FormatPostingsPositionsWriter(SegmentWriteState state, FormatPostingsDocsWriter parent) throws IOException { - this.parent = parent; - omitTermFreqAndPositions = parent.omitTermFreqAndPositions; - if (parent.parent.parent.fieldInfos.hasProx()) { - // At least one field does not omit TF, so create the - // prox file - final String fileName = IndexFileNames.segmentFileName(parent.parent.parent.segment, IndexFileNames.PROX_EXTENSION); - state.flushedFiles.add(fileName); - out = parent.parent.parent.dir.createOutput(fileName); - parent.skipListWriter.setProxOutput(out); - } else - // Every field omits TF so we will write no prox file - out = null; - } - - int lastPosition; - - /** Add a new position & payload */ - @Override - void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException { - assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true"; - assert out != null; - - final int delta = position - lastPosition; - lastPosition = position; - - if (storePayloads) { - if (payloadLength != lastPayloadLength) { - lastPayloadLength = payloadLength; - out.writeVInt((delta<<1)|1); - out.writeVInt(payloadLength); - } else - out.writeVInt(delta << 1); - if (payloadLength > 0) - out.writeBytes(payload, payloadLength); - } else - out.writeVInt(delta); - } - - void setField(FieldInfo fieldInfo) { - omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; - storePayloads = omitTermFreqAndPositions ? 
false : fieldInfo.storePayloads; - } - - /** Called when we are done adding positions & payloads */ - @Override - void finish() { - lastPosition = 0; - lastPayloadLength = -1; - } - - void close() throws IOException { - if (out != null) - out.close(); - } -} Index: src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java (revision 931099) +++ src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java (working copy) @@ -1,47 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; - -/** - * @lucene.experimental - */ - -abstract class FormatPostingsTermsConsumer { - - /** Adds a new term in this field; term ends with U+FFFF - * char */ - abstract FormatPostingsDocsConsumer addTerm(char[] text, int start) throws IOException; - - char[] termBuffer; - FormatPostingsDocsConsumer addTerm(String text) throws IOException { - final int len = text.length(); - if (termBuffer == null || termBuffer.length < 1+len) - termBuffer = new char[ArrayUtil.oversize(1+len, RamUsageEstimator.NUM_BYTES_CHAR)]; - text.getChars(0, len, termBuffer, 0); - termBuffer[len] = 0xffff; - return addTerm(termBuffer, 0); - } - - /** Called when we are done adding terms to this field */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (working copy) @@ -1,73 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -final class FormatPostingsTermsWriter extends FormatPostingsTermsConsumer { - - final FormatPostingsFieldsWriter parent; - final FormatPostingsDocsWriter docsWriter; - final TermInfosWriter termsOut; - FieldInfo fieldInfo; - - FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) throws IOException { - super(); - this.parent = parent; - termsOut = parent.termsOut; - docsWriter = new FormatPostingsDocsWriter(state, this); - } - - void setField(FieldInfo fieldInfo) { - this.fieldInfo = fieldInfo; - docsWriter.setField(fieldInfo); - } - - char[] currentTerm; - int currentTermStart; - - long freqStart; - long proxStart; - - /** Adds a new term in this field */ - @Override - FormatPostingsDocsConsumer addTerm(char[] text, int start) { - currentTerm = text; - currentTermStart = start; - - // TODO: this is abstraction violation -- ideally this - // terms writer is not so "invasive", looking for file - // pointers in its child consumers. - freqStart = docsWriter.out.getFilePointer(); - if (docsWriter.posWriter.out != null) - proxStart = docsWriter.posWriter.out.getFilePointer(); - - parent.skipListWriter.resetSkip(); - - return docsWriter; - } - - /** Called when we are done adding terms to this field */ - @Override - void finish() { - } - - void close() throws IOException { - docsWriter.close(); - } -} Index: src/java/org/apache/lucene/index/FreqProxFieldMergeState.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (revision 931099) +++ src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (working copy) @@ -18,6 +18,8 @@ */ import java.io.IOException; +import java.util.Comparator; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray; @@ -31,13 +33,12 @@ final FreqProxTermsWriterPerField field; final int numPostings; - final CharBlockPool charPool; + private final ByteBlockPool bytePool; final int[] termIDs; final FreqProxPostingsArray postings; int currentTermID; - char[] text; - int textOffset; + final BytesRef text = new BytesRef(); private int postingUpto = -1; @@ -47,29 +48,31 @@ int docID; int termFreq; - public FreqProxFieldMergeState(FreqProxTermsWriterPerField field) { + public FreqProxFieldMergeState(FreqProxTermsWriterPerField field, Comparator termComp) { this.field = field; - this.charPool = field.perThread.termsHashPerThread.charPool; this.numPostings = field.termsHashPerField.numPostings; - this.termIDs = field.termsHashPerField.sortPostings(); + this.bytePool = field.perThread.termsHashPerThread.bytePool; + this.termIDs = field.termsHashPerField.sortPostings(termComp); this.postings = (FreqProxPostingsArray) field.termsHashPerField.postingsArray; } boolean nextTerm() throws IOException { postingUpto++; - if (postingUpto == numPostings) + if (postingUpto == numPostings) { return false; + } currentTermID = termIDs[postingUpto]; docID = 0; + // Get BytesRef final int textStart = postings.textStarts[currentTermID]; - text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - textOffset = textStart & DocumentsWriter.CHAR_BLOCK_MASK; + bytePool.setBytesRef(text, textStart); field.termsHashPerField.initReader(freq, currentTermID, 0); - if (!field.fieldInfo.omitTermFreqAndPositions) + if (!field.fieldInfo.omitTermFreqAndPositions) { field.termsHashPerField.initReader(prox, currentTermID, 1); + } // Should always be true boolean result = 
nextDoc(); Index: src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/FreqProxTermsWriter.java (working copy) @@ -17,15 +17,20 @@ * limitations under the License. */ -import org.apache.lucene.util.UnicodeUtil; - import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Iterator; +import java.util.List; import java.util.Map; -import java.util.ArrayList; -import java.util.List; +import java.util.Comparator; +import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.util.BytesRef; + final class FreqProxTermsWriter extends TermsHashConsumer { @Override @@ -33,27 +38,13 @@ return new FreqProxTermsWriterPerThread(perThread); } - private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) { - while(true) { - final char c1 = text1[pos1++]; - final char c2 = text2[pos2++]; - if (c1 != c2) { - if (0xffff == c2) - return 1; - else if (0xffff == c1) - return -1; - else - return c1-c2; - } else if (0xffff == c1) - return 0; - } - } - @Override void closeDocStore(SegmentWriteState state) {} + @Override void abort() {} + private int flushedDocCount; // TODO: would be nice to factor out more of this, eg the // FreqProxFieldMergeState, and code to visit all Fields @@ -66,34 +57,42 @@ // Gather all FieldData's that have postings, across all // ThreadStates List allFields = new ArrayList(); + + flushedDocCount = state.numDocs; - for (Map.Entry> entry : threadsAndFields.entrySet()) { + Iterator>> it = threadsAndFields.entrySet().iterator(); + while(it.hasNext()) { + Map.Entry> entry = it.next(); + Collection fields = entry.getValue(); + Iterator fieldsIt = fields.iterator(); - for (final TermsHashConsumerPerField i : fields) { - final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i; + while(fieldsIt.hasNext()) { + FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) fieldsIt.next(); if (perField.termsHashPerField.numPostings > 0) allFields.add(perField); } } + final int numAllFields = allFields.size(); + // Sort by field name Collections.sort(allFields); - final int numAllFields = allFields.size(); - // TODO: allow Lucene user to customize this consumer: - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); + // TODO: allow Lucene user to customize this codec: + final FieldsConsumer consumer = state.codec.fieldsConsumer(state); + /* Current writer chain: - FormatPostingsFieldsConsumer - -> IMPL: FormatPostingsFieldsWriter - -> FormatPostingsTermsConsumer - -> IMPL: FormatPostingsTermsWriter - -> FormatPostingsDocConsumer - -> IMPL: FormatPostingsDocWriter - -> FormatPostingsPositionsConsumer + FieldsConsumer + -> IMPL: FormatPostingsTermsDictWriter + -> TermsConsumer + -> IMPL: FormatPostingsTermsDictWriter.TermsWriter + -> DocsConsumer + -> IMPL: FormatPostingsDocsWriter + -> PositionsConsumer -> IMPL: FormatPostingsPositionsWriter */ @@ -134,25 +133,29 @@ FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey(); perThread.termsHashPerThread.reset(true); } - - consumer.finish(); + consumer.close(); } - private byte[] payloadBuffer; + BytesRef payload; /* Walk through all 
unique text tokens (Posting * instances) found in this field and serialize them * into a single RAM segment. */ void appendPostings(FreqProxTermsWriterPerField[] fields, - FormatPostingsFieldsConsumer consumer) + FieldsConsumer consumer) throws CorruptIndexException, IOException { int numFields = fields.length; + final BytesRef text = new BytesRef(); + final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields]; + final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo); + final Comparator termComp = termsConsumer.getComparator(); + for(int i=0;iNOTE: extensions used by codecs are not + * listed here. You must interact with the {@link Codec} + * directly. + * * @lucene.internal */ + public final class IndexFileNames { /** Name of the index segment file */ public static final String SEGMENTS = "segments"; + /** Extension of gen file */ + public static final String GEN_EXTENSION = "gen"; + /** Name of the generation reference file name */ - public static final String SEGMENTS_GEN = "segments.gen"; + public static final String SEGMENTS_GEN = "segments." + GEN_EXTENSION; /** Name of the index deletable file (only used in * pre-lockless indices) */ @@ -43,18 +53,6 @@ /** Extension of norms file */ public static final String NORMS_EXTENSION = "nrm"; - /** Extension of freq postings file */ - public static final String FREQ_EXTENSION = "frq"; - - /** Extension of prox postings file */ - public static final String PROX_EXTENSION = "prx"; - - /** Extension of terms file */ - public static final String TERMS_EXTENSION = "tis"; - - /** Extension of terms index file */ - public static final String TERMS_INDEX_EXTENSION = "tii"; - /** Extension of stored fields index file */ public static final String FIELDS_INDEX_EXTENSION = "fdx"; @@ -88,9 +86,6 @@ /** Extension of separate norms */ public static final String SEPARATE_NORMS_EXTENSION = "s"; - /** Extension of gen file */ - public static final String GEN_EXTENSION = "gen"; - /** * This array contains all filename extensions used by * Lucene's index files, with two exceptions, namely the @@ -104,10 +99,6 @@ FIELD_INFOS_EXTENSION, FIELDS_INDEX_EXTENSION, FIELDS_EXTENSION, - TERMS_INDEX_EXTENSION, - TERMS_EXTENSION, - FREQ_EXTENSION, - PROX_EXTENSION, DELETES_EXTENSION, VECTORS_INDEX_EXTENSION, VECTORS_DOCUMENTS_EXTENSION, @@ -117,22 +108,6 @@ COMPOUND_FILE_STORE_EXTENSION, }; - /** File extensions that are added to a compound file - * (same as above, minus "del", "gen", "cfs"). 
*/ - public static final String[] INDEX_EXTENSIONS_IN_COMPOUND_FILE = new String[] { - FIELD_INFOS_EXTENSION, - FIELDS_INDEX_EXTENSION, - FIELDS_EXTENSION, - TERMS_INDEX_EXTENSION, - TERMS_EXTENSION, - FREQ_EXTENSION, - PROX_EXTENSION, - VECTORS_INDEX_EXTENSION, - VECTORS_DOCUMENTS_EXTENSION, - VECTORS_FIELDS_EXTENSION, - NORMS_EXTENSION - }; - public static final String[] STORE_INDEX_EXTENSIONS = new String[] { VECTORS_INDEX_EXTENSION, VECTORS_FIELDS_EXTENSION, @@ -143,22 +118,13 @@ public static final String[] NON_STORE_INDEX_EXTENSIONS = new String[] { FIELD_INFOS_EXTENSION, - FREQ_EXTENSION, - PROX_EXTENSION, - TERMS_EXTENSION, - TERMS_INDEX_EXTENSION, NORMS_EXTENSION }; - /** File extensions of old-style index files */ - public static final String COMPOUND_EXTENSIONS[] = new String[] { + static final String COMPOUND_EXTENSIONS_NOT_CODEC[] = new String[] { FIELD_INFOS_EXTENSION, - FREQ_EXTENSION, - PROX_EXTENSION, FIELDS_INDEX_EXTENSION, FIELDS_EXTENSION, - TERMS_INDEX_EXTENSION, - TERMS_EXTENSION }; /** File extensions for term vector support */ @@ -222,6 +188,7 @@ */ public static final String segmentFileName(String segmentName, String ext) { if (ext.length() > 0) { + assert !ext.startsWith("."); return new StringBuilder(segmentName.length() + 1 + ext.length()).append( segmentName).append('.').append(ext).toString(); } else { Index: src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- src/java/org/apache/lucene/index/IndexReader.java (revision 931099) +++ src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -20,7 +20,11 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.Similarity; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.store.*; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ReaderUtil; // for javadocs import java.io.File; import java.io.FileOutputStream; @@ -213,7 +217,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory) throws CorruptIndexException, IOException { - return open(directory, null, null, true, DEFAULT_TERMS_INDEX_DIVISOR); + return open(directory, null, null, true, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Returns an IndexReader reading the index in the given @@ -227,7 +231,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, boolean readOnly) throws CorruptIndexException, IOException { - return open(directory, null, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); + return open(directory, null, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Expert: returns an IndexReader reading the index in the given @@ -241,7 +245,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, boolean readOnly) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), null, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); + return open(commit.getDirectory(), null, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Expert: returns an IndexReader reading the index in @@ -259,7 +263,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException { - 
return open(directory, deletionPolicy, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); + return open(directory, deletionPolicy, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Expert: returns an IndexReader reading the index in @@ -287,7 +291,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { - return open(directory, deletionPolicy, null, readOnly, termInfosIndexDivisor); + return open(directory, deletionPolicy, null, readOnly, termInfosIndexDivisor, null); } /** Expert: returns an IndexReader reading the index in @@ -307,7 +311,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), deletionPolicy, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); + return open(commit.getDirectory(), deletionPolicy, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR, null); } /** Expert: returns an IndexReader reading the index in @@ -337,13 +341,80 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), deletionPolicy, commit, readOnly, termInfosIndexDivisor); + return open(commit.getDirectory(), deletionPolicy, commit, readOnly, termInfosIndexDivisor, null); } - private static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { - return DirectoryReader.open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor); + /** Expert: returns an IndexReader reading the index in + * the given Directory, with a custom {@link + * IndexDeletionPolicy}, and specified {@link CodecProvider}. + * You should pass readOnly=true, since it gives much + * better concurrent performance, unless you intend to do + * write operations (delete documents or change norms) + * with the reader. + * @param directory the index directory + * @param deletionPolicy a custom deletion policy (only used + * if you use this reader to perform deletes or to set + * norms); see {@link IndexWriter} for details. + * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader + * @param termInfosIndexDivisor Subsamples which indexed + * terms are loaded into RAM. This has the same effect as {@link + * IndexWriter#setTermIndexInterval} except that setting + * must be done at indexing time while this setting can be + * set per reader. When set to N, then one in every + * N*termIndexInterval terms in the index is loaded into + * memory. By setting this to a value > 1 you can reduce + * memory usage, at the expense of higher latency when + * loading a TermInfo. The default value is 1. Set this + * to -1 to skip loading the terms index entirely. 
+ * @param codecs CodecProvider to use when opening index + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor, CodecProvider codecs) throws CorruptIndexException, IOException { + return open(directory, deletionPolicy, null, readOnly, termInfosIndexDivisor, codecs); } + /** Expert: returns an IndexReader reading the index in + * the given Directory, using a specific commit and with + * a custom {@link IndexDeletionPolicy} and specified + * {@link CodecProvider}. You should pass readOnly=true, since + * it gives much better concurrent performance, unless + * you intend to do write operations (delete documents or + * change norms) with the reader. + + * @param commit the specific {@link IndexCommit} to open; + * see {@link IndexReader#listCommits} to list all commits + * in a directory + * @param deletionPolicy a custom deletion policy (only used + * if you use this reader to perform deletes or to set + * norms); see {@link IndexWriter} for details. + * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader + * @param termInfosIndexDivisor Subsamples which indexed + * terms are loaded into RAM. This has the same effect as {@link + * IndexWriter#setTermIndexInterval} except that setting + * must be done at indexing time while this setting can be + * set per reader. When set to N, then one in every + * N*termIndexInterval terms in the index is loaded into + * memory. By setting this to a value > 1 you can reduce + * memory usage, at the expense of higher latency when + * loading a TermInfo. The default value is 1. Set this + * to -1 to skip loading the terms index entirely. + * @param codecs CodecProvider to use when opening index + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor, CodecProvider codecs) throws CorruptIndexException, IOException { + return open(commit.getDirectory(), deletionPolicy, commit, readOnly, termInfosIndexDivisor, codecs); + } + + private static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, int termInfosIndexDivisor, + CodecProvider codecs) throws CorruptIndexException, IOException { + if (codecs == null) { + codecs = CodecProvider.getDefault(); + } + return DirectoryReader.open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor, codecs); + } + /** * Refreshes an IndexReader if the index has changed since this instance * was (re)opened. 
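Not part of the patch itself: a minimal usage sketch of the new open overload added above. The index path, the read-only/divisor arguments, and the explicit CodecProvider.getDefault() are illustrative assumptions; passing null for the provider preserves the old behaviour, as the private open above shows.

import java.io.File;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

class OpenWithCodecProviderSketch {
  static IndexReader openReader(File indexDir) throws IOException {
    Directory dir = FSDirectory.open(indexDir);
    // Any CodecProvider may be passed; getDefault() mirrors what the null case falls back to.
    CodecProvider codecs = CodecProvider.getDefault();
    // readOnly=true, no custom deletion policy, terms index divisor 1 (the default).
    return IndexReader.open(dir, null, true, 1, codecs);
  }
}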
@@ -483,7 +554,7 @@ * @throws IOException if there is a low-level IO error */ public static long getCurrentVersion(Directory directory) throws CorruptIndexException, IOException { - return SegmentInfos.readCurrentVersion(directory); + return SegmentInfos.readCurrentVersion(directory, CodecProvider.getDefault()); } /** @@ -501,7 +572,7 @@ * @see #getCommitUserData() */ public static Map getCommitUserData(Directory directory) throws CorruptIndexException, IOException { - return SegmentInfos.readCurrentUserData(directory); + return SegmentInfos.readCurrentUserData(directory, CodecProvider.getDefault()); } /** @@ -803,24 +874,63 @@ * calling terms(), {@link TermEnum#next()} must be called * on the resulting enumeration before calling other methods such as * {@link TermEnum#term()}. + * @deprecated Use the new flex API ({@link #fields()}) instead. * @throws IOException if there is a low-level IO error */ + @Deprecated public abstract TermEnum terms() throws IOException; + /** Flex API: returns {@link Fields} for this reader. + * This method may return null if the reader has no + * postings. + * + *
    NOTE: if this is a multi reader ({@link + * #getSequentialSubReaders} is not null) then this + * method will throw UnsupportedOperationException. If + * you really need a {@link Fields} for such a reader, + * use {@link MultiFields#getFields}. However, for + * performance reasons, it's best to get all sub-readers + * using {@link ReaderUtil#gatherSubReaders} and iterate + * through them yourself. */ + public Fields fields() throws IOException { + return new LegacyFields(this); + } + /** Returns an enumeration of all terms starting at a given term. If * the given term does not exist, the enumeration is positioned at the * first term greater than the supplied term. The enumeration is * ordered by Term.compareTo(). Each term is greater than all that * precede it in the enumeration. + * @deprecated Use the new flex API ({@link #fields()}) instead. * @throws IOException if there is a low-level IO error */ + @Deprecated public abstract TermEnum terms(Term t) throws IOException; /** Returns the number of documents containing the term t. * @throws IOException if there is a low-level IO error + * @deprecated Use {@link #docFreq(String,BytesRef)} instead. */ + @Deprecated public abstract int docFreq(Term t) throws IOException; + /** Returns the number of documents containing the term + * t. This method returns 0 if the term or + * field does not exists. This method does not take into + * account deleted documents that have not yet been merged + * away. */ + public int docFreq(String field, BytesRef term) throws IOException { + final Fields fields = fields(); + if (fields == null) { + return 0; + } + final Terms terms = fields.terms(field); + if (terms == null) { + return 0; + } + return terms.docFreq(term); + } + /** Returns an enumeration of all the documents which contain * term. For each document, the document number, the frequency of * the term in that document is also provided, for use in @@ -832,8 +942,10 @@ * *
    The enumeration is ordered by document number. Each document number * is greater than all that precede it in the enumeration. + * @deprecated Use the new flex API ({@link #termDocsEnum}) instead. * @throws IOException if there is a low-level IO error */ + @Deprecated public TermDocs termDocs(Term term) throws IOException { ensureOpen(); TermDocs termDocs = termDocs(); @@ -841,9 +953,57 @@ return termDocs; } + /** This may return null if the field does not exist.*/ + public Terms terms(String field) throws IOException { + final Fields fields = fields(); + if (fields == null) { + return null; + } + return fields.terms(field); + } + + /** Returns {@link DocsEnum} for the specified field & + * term. This may return null, if either the field or + * term does not exist. */ + public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term) throws IOException { + assert field != null; + assert term != null; + final Fields fields = fields(); + if (fields == null) { + return null; + } + final Terms terms = fields.terms(field); + if (terms != null) { + return terms.docs(skipDocs, term, null); + } else { + return null; + } + } + + /** Returns {@link DocsAndPositionsEnum} for the specified + * field & term. This may return null, if either the + * field or term does not exist, or, positions were not + * stored for this term. */ + public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term) throws IOException { + assert field != null; + assert term != null; + final Fields fields = fields(); + if (fields == null) { + return null; + } + final Terms terms = fields.terms(field); + if (terms != null) { + return terms.docsAndPositions(skipDocs, term, null); + } else { + return null; + } + } + /** Returns an unpositioned {@link TermDocs} enumerator. + * @deprecated Use the new flex API ({@link #fields()}) instead. * @throws IOException if there is a low-level IO error */ + @Deprecated public abstract TermDocs termDocs() throws IOException; /** Returns an enumeration of all the documents which contain @@ -861,8 +1021,11 @@ *
    This positional information facilitates phrase and proximity searching. *
    The enumeration is ordered by document number. Each document number is * greater than all that precede it in the enumeration. + * @deprecated Please switch the flex API ({@link + * #termDocsEnum}) instead * @throws IOException if there is a low-level IO error */ + @Deprecated public TermPositions termPositions(Term term) throws IOException { ensureOpen(); TermPositions termPositions = termPositions(); @@ -871,14 +1034,17 @@ } /** Returns an unpositioned {@link TermPositions} enumerator. + * @deprecated Please switch the flex API ({@link + * #termDocsEnum}) instead * @throws IOException if there is a low-level IO error */ + @Deprecated public abstract TermPositions termPositions() throws IOException; /** Deletes the document numbered docNum. Once a document is - * deleted it will not appear in TermDocs or TermPostitions enumerations. + * deleted it will not appear in TermDocs or TermPositions enumerations. * Attempts to read its field with the {@link #document} * method will result in an error. The presence of this document may still be * reflected in the {@link #docFreq} statistic, though @@ -1009,9 +1175,7 @@ * @throws IOException if there is a low-level IO error */ public final synchronized void commit(Map commitUserData) throws IOException { - if (hasChanges) { - doCommit(commitUserData); - } + doCommit(commitUserData); hasChanges = false; } @@ -1044,7 +1208,49 @@ */ public abstract Collection getFieldNames(FieldOption fldOption); + // Only used by external subclasses of IndexReader; all + // internal classes should implement Bits more + // efficiently: + private final class DeletedDocsBits implements Bits { + public boolean get(int docID) { + return isDeleted(docID); + } + public int length() { + return maxDoc(); + } + } + + private Bits deletedDocsBits; + + /** Returns the {@link Bits} representing deleted docs. A + * set bit indicates the doc ID has been deleted. This + * method should return null when there are no deleted + * docs. + * + * @lucene.experimental */ + public Bits getDeletedDocs() throws IOException { + if (deletedDocsBits == null) { + deletedDocsBits = new DeletedDocsBits(); + } + return deletedDocsBits; + } + /** + * Forcibly unlocks the index in the named directory. + *
    + * Caution: this should only be used by failure recovery code, + * when it is known that no other process nor thread is in fact + * currently accessing this index. + * @deprecated Please use {@link IndexWriter#unlock(Directory)} instead. + * This method will be removed in the 3.0 release. + * + */ + @Deprecated + public static void unlock(Directory directory) throws IOException { + directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release(); + } + + /** * Expert: return the IndexCommit that this reader has * opened. This method is only implemented by those * readers that correspond to a Directory with its own @@ -1169,6 +1375,12 @@ return null; } + + /** Expert: returns the docID base for this subReader. */ + public int getSubReaderDocBase(IndexReader subReader) { + throw new UnsupportedOperationException(); + } + /** Expert */ public Object getFieldCacheKey() { return this; @@ -1177,17 +1389,26 @@ /** Returns the number of unique terms (across all fields) * in this reader. * - * This method returns long, even though internally - * Lucene cannot handle more than 2^31 unique terms, for - * a possible future when this limitation is removed. - * * @throws UnsupportedOperationException if this count * cannot be easily determined (eg Multi*Readers). * Instead, you should call {@link * #getSequentialSubReaders} and ask each sub reader for * its unique term count. */ public long getUniqueTermCount() throws IOException { - throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()"); + long numTerms = 0; + final Fields fields = fields(); + if (fields == null) { + return 0; + } + FieldsEnum it = fields.iterator(); + while(true) { + String field = it.next(); + if (field == null) { + break; + } + numTerms += fields.terms(field).getUniqueTermCount(); + } + return numTerms; } /** For IndexReader implementations that use @@ -1198,4 +1419,29 @@ public int getTermInfosIndexDivisor() { throw new UnsupportedOperationException("This reader does not support this method."); } + + + private Fields fields; + + /** lucene.internal */ + void storeFields(Fields fields) { + this.fields = fields; + } + + /** lucene.internal */ + Fields retrieveFields() { + return fields; + } + + private Bits storedDelDocs; + + /** lucene.internal */ + void storeDelDocs(Bits delDocs) { + this.storedDelDocs = delDocs; + } + + /** lucene.internal */ + Bits retrieveDelDocs() { + return storedDelDocs; + } } Index: src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- src/java/org/apache/lucene/index/IndexWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -28,6 +28,7 @@ import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.util.Constants; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.Version; @@ -232,12 +233,13 @@ public final static int DEFAULT_TERM_INDEX_INTERVAL = IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL; /** - * Absolute hard maximum length for a term. If a term - * arrives from the analyzer longer than this length, it - * is skipped and a message is printed to infoStream, if - * set (see {@link #setInfoStream}). + * Absolute hard maximum length for a term, in bytes once + * encoded as UTF8. 
If a term arrives from the analyzer + * longer than this length, it is skipped and a message is + * printed to infoStream, if set (see {@link + * #setInfoStream}). */ - public final static int MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH; + public final static int MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH_UTF8; // The normal read buffer size defaults to 1024, but // increasing this during merging seems to yield @@ -334,7 +336,7 @@ * *
    Note that this is functionally equivalent to calling * {#commit} and then using {@link IndexReader#open} to - * open a new reader. But the turarnound time of this + * open a new reader. But the turnaround time of this * method should be faster since it avoids the potentially * costly {@link #commit}.
    * @@ -420,7 +422,7 @@ // just like we do when loading segments_N synchronized(this) { applyDeletes(); - final IndexReader r = new ReadOnlyDirectoryReader(this, segmentInfos, termInfosIndexDivisor); + final IndexReader r = new ReadOnlyDirectoryReader(this, segmentInfos, termInfosIndexDivisor, codecs); if (infoStream != null) { message("return reader version=" + r.getVersion() + " reader=" + r); } @@ -629,7 +631,7 @@ // TODO: we may want to avoid doing this while // synchronized // Returns a ref, which we xfer to readerMap: - sr = SegmentReader.get(false, info.dir, info, readBufferSize, doOpenStores, termsIndexDivisor); + sr = SegmentReader.get(false, info.dir, info, readBufferSize, doOpenStores, termsIndexDivisor, codecs); if (info.dir == directory) { // Only pool if reader is not external @@ -639,7 +641,7 @@ if (doOpenStores) { sr.openDocStores(); } - if (termsIndexDivisor != -1 && !sr.termsIndexLoaded()) { + if (termsIndexDivisor != -1) { // If this reader was originally opened because we // needed to merge it, we didn't load the terms // index. But now, if the caller wants the terms @@ -1038,6 +1040,8 @@ .setOpenMode(OpenMode.APPEND).setMaxFieldLength(mfl.getLimit()) .setIndexDeletionPolicy(deletionPolicy).setIndexCommit(commit)); } + + CodecProvider codecs; /** * Constructs a new IndexWriter per the settings given in conf. @@ -1081,6 +1085,8 @@ mergePolicy.setIndexWriter(this); mergeScheduler = conf.getMergeScheduler(); mergedSegmentWarmer = conf.getMergedSegmentWarmer(); + codecs = conf.getCodecProvider(); + poolReaders = conf.getReaderPooling(); OpenMode mode = conf.getOpenMode(); @@ -1111,7 +1117,7 @@ // segments_N file with no segments: boolean doCommit; try { - segmentInfos.read(directory); + segmentInfos.read(directory, codecs); segmentInfos.clear(); doCommit = false; } catch (IOException e) { @@ -1129,7 +1135,7 @@ changeCount++; } } else { - segmentInfos.read(directory); + segmentInfos.read(directory, codecs); IndexCommit commit = conf.getIndexCommit(); if (commit != null) { @@ -1141,7 +1147,7 @@ if (commit.getDirectory() != directory) throw new IllegalArgumentException("IndexCommit's directory doesn't match my directory"); SegmentInfos oldInfos = new SegmentInfos(); - oldInfos.read(directory, commit.getSegmentsFileName()); + oldInfos.read(directory, commit.getSegmentsFileName(), codecs); segmentInfos.replace(oldInfos); changeCount++; if (infoStream != null) @@ -1159,7 +1165,7 @@ // KeepOnlyLastCommitDeleter: deleter = new IndexFileDeleter(directory, conf.getIndexDeletionPolicy(), - segmentInfos, infoStream, docWriter); + segmentInfos, infoStream, docWriter, this.codecs); if (deleter.startingCommitDeleted) // Deletion policy deleted the "head" commit point. 
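Not part of the patch itself: a minimal sketch of wiring a CodecProvider into IndexWriter through IndexWriterConfig, which the constructor above now reads via conf.getCodecProvider(). The setCodecProvider setter is added to IndexWriterConfig further down in this patch; the wrapper class, the Version.LUCENE_CURRENT constant, and the analyzer parameter are illustrative assumptions, not prescribed by the change.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

class WriterWithCodecProviderSketch {
  static IndexWriter newWriter(Directory dir, Analyzer analyzer, CodecProvider codecs) throws IOException {
    // The writer picks the provider up from the config (codecs = conf.getCodecProvider()).
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        .setCodecProvider(codecs);
    return new IndexWriter(dir, conf);
  }
}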
@@ -1174,6 +1180,7 @@ pushMaxBufferedDocs(); if (infoStream != null) { + message("init: create=" + create); messageState(); } @@ -3135,7 +3142,7 @@ } SegmentInfos sis = new SegmentInfos(); // read infos from dir - sis.read(dirs[i]); + sis.read(dirs[i], codecs); for (int j = 0; j < sis.size(); j++) { SegmentInfo info = sis.info(j); assert !segmentInfos.contains(info): "dup info dir=" + info.dir + " name=" + info.name; @@ -3321,7 +3328,7 @@ try { mergedName = newSegmentName(); - merger = new SegmentMerger(this, mergedName, null); + merger = new SegmentMerger(directory, termIndexInterval, mergedName, null, codecs); SegmentReader sReader = null; synchronized(this) { @@ -3344,7 +3351,7 @@ synchronized(this) { segmentInfos.clear(); // pop old infos & add new info = new SegmentInfo(mergedName, docCount, directory, false, true, - -1, null, false, merger.hasProx()); + -1, null, false, merger.hasProx(), merger.getCodec()); setDiagnostics(info, "addIndexes(IndexReader...)"); segmentInfos.add(info); } @@ -3391,7 +3398,7 @@ startTransaction(false); try { - merger.createCompoundFile(mergedName + ".cfs"); + merger.createCompoundFile(mergedName + ".cfs", info); synchronized(this) { info.setUseCompoundFile(true); } @@ -3742,7 +3749,9 @@ directory, false, true, docStoreOffset, docStoreSegment, docStoreIsCompoundFile, - docWriter.hasProx()); + docWriter.hasProx(), + docWriter.getCodec()); + setDiagnostics(newSegment, "flush"); } @@ -3956,7 +3965,7 @@ } } } - + merge.info.setHasProx(merger.hasProx()); segmentInfos.subList(start, start + merge.segments.size()).clear(); @@ -4032,7 +4041,7 @@ mergeInit(merge); if (infoStream != null) - message("now merge\n merge=" + merge.segString(directory) + "\n merge=" + merge + "\n index=" + segString()); + message("now merge\n merge=" + merge.segString(directory) + "\n index=" + segString()); mergeMiddle(merge); mergeSuccess(merge); @@ -4258,7 +4267,8 @@ docStoreOffset, docStoreSegment, docStoreIsCompoundFile, - false); + false, + null); Map details = new HashMap(); @@ -4338,7 +4348,7 @@ if (infoStream != null) message("merging " + merge.segString(directory)); - merger = new SegmentMerger(this, mergedName, merge); + merger = new SegmentMerger(directory, termIndexInterval, mergedName, merge, codecs); merge.readers = new SegmentReader[numSegments]; merge.readersClone = new SegmentReader[numSegments]; @@ -4411,8 +4421,17 @@ // This is where all the work happens: mergedDocCount = merge.info.docCount = merger.merge(merge.mergeDocStores); + // Record which codec was used to write the segment + merge.info.setCodec(merger.getCodec()); + assert mergedDocCount == totDocCount; + // Very important to do this before opening the reader + // because codec must know if prox was written for + // this segment: + //System.out.println("merger set hasProx=" + merger.hasProx() + " seg=" + merge.info.name); + merge.info.setHasProx(merger.hasProx()); + // TODO: in the non-realtime case, we may want to only // keep deletes (it's costly to open entire reader // when we just need deletes) @@ -4450,8 +4469,9 @@ merge.readersClone[i].close(); } catch (Throwable t) { } - // This was a private clone and we had the only reference - assert merge.readersClone[i].getRefCount() == 0; + // This was a private clone and we had the + // only reference + assert merge.readersClone[i].getRefCount() == 0: "refCount should be 0 but is " + merge.readersClone[i].getRefCount(); } } } else { @@ -4484,7 +4504,7 @@ final String compoundFileName = IndexFileNames.segmentFileName(mergedName, 
IndexFileNames.COMPOUND_FILE_EXTENSION); try { - merger.createCompoundFile(compoundFileName); + merger.createCompoundFile(compoundFileName, merge.info); success = true; } catch (IOException ioe) { synchronized(this) { Index: src/java/org/apache/lucene/index/IndexWriterConfig.java =================================================================== --- src/java/org/apache/lucene/index/IndexWriterConfig.java (revision 931099) +++ src/java/org/apache/lucene/index/IndexWriterConfig.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.DocumentsWriter.IndexingChain; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.Similarity; import org.apache.lucene.util.Version; @@ -78,6 +79,9 @@ */ public static long WRITE_LOCK_TIMEOUT = 1000; + /** Default {@link CodecProvider}. */ + public final static CodecProvider DEFAULT_CODEC_PROVIDER = CodecProvider.getDefault(); + /** The maximum number of simultaneous threads that may be * indexing documents at once in IndexWriter; if more * than this many threads arrive they will wait for @@ -119,6 +123,7 @@ private int maxBufferedDocs; private IndexingChain indexingChain; private IndexReaderWarmer mergedSegmentWarmer; + private CodecProvider codecProvider; private MergePolicy mergePolicy; private int maxThreadStates; private boolean readerPooling; @@ -149,6 +154,7 @@ maxBufferedDocs = DEFAULT_MAX_BUFFERED_DOCS; indexingChain = DocumentsWriter.defaultIndexingChain; mergedSegmentWarmer = null; + codecProvider = DEFAULT_CODEC_PROVIDER; mergePolicy = new LogByteSizeMergePolicy(); maxThreadStates = DEFAULT_MAX_THREAD_STATES; readerPooling = DEFAULT_READER_POOLING; @@ -509,6 +515,18 @@ this.mergePolicy = mergePolicy == null ? new LogByteSizeMergePolicy() : mergePolicy; return this; } + + /** Set the CodecProvider. See {@link CodecProvider}. */ + public IndexWriterConfig setCodecProvider(CodecProvider codecProvider) { + this.codecProvider = codecProvider; + return this; + } + + /** Returns the current merged segment warmer. See {@link IndexReaderWarmer}. */ + public CodecProvider getCodecProvider() { + return codecProvider; + } + /** * Returns the current MergePolicy in use by this writer. @@ -584,6 +602,7 @@ sb.append("ramBufferSizeMB=").append(ramBufferSizeMB).append("\n"); sb.append("maxBufferedDocs=").append(maxBufferedDocs).append("\n"); sb.append("mergedSegmentWarmer=").append(mergedSegmentWarmer).append("\n"); + sb.append("codecProvider=").append(codecProvider).append("\n"); sb.append("mergePolicy=").append(mergePolicy).append("\n"); sb.append("maxThreadStates=").append(maxThreadStates).append("\n"); sb.append("readerPooling=").append(readerPooling).append("\n"); Index: src/java/org/apache/lucene/index/MultipleTermPositions.java =================================================================== --- src/java/org/apache/lucene/index/MultipleTermPositions.java (revision 931099) +++ src/java/org/apache/lucene/index/MultipleTermPositions.java (working copy) @@ -28,8 +28,10 @@ /** * Allows you to iterate over the {@link TermPositions} for multiple {@link Term}s as * a single {@link TermPositions}. - * + * @deprecated This class is being replaced by the package + * private MultiDocsEnum on org.apache.lucene.search. 
*/ +@Deprecated public class MultipleTermPositions implements TermPositions { private static final class TermPositionsQueue extends PriorityQueue { Index: src/java/org/apache/lucene/index/MultiReader.java =================================================================== --- src/java/org/apache/lucene/index/MultiReader.java (revision 931099) +++ src/java/org/apache/lucene/index/MultiReader.java (working copy) @@ -25,17 +25,21 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; -import org.apache.lucene.index.DirectoryReader.MultiTermDocs; -import org.apache.lucene.index.DirectoryReader.MultiTermEnum; -import org.apache.lucene.index.DirectoryReader.MultiTermPositions; +import org.apache.lucene.index.DirectoryReader.MultiTermDocs; // deprecated +import org.apache.lucene.index.DirectoryReader.MultiTermEnum; // deprecated +import org.apache.lucene.index.DirectoryReader.MultiTermPositions; // deprecated import org.apache.lucene.search.Similarity; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ReaderUtil; /** An IndexReader which reads multiple indexes, appending - * their content. */ + * their content. */ public class MultiReader extends IndexReader implements Cloneable { protected IndexReader[] subReaders; private int[] starts; // 1st docno for each segment + private final Map subReaderToSlice = new HashMap(); private boolean[] decrefOnClose; // remember which subreaders to decRef on close private Map normsCache = new HashMap(); private int maxDoc = 0; @@ -49,7 +53,7 @@ *
    Note that all subreaders are closed if this Multireader is closed.
    * @param subReaders set of (sub)readers */ - public MultiReader(IndexReader... subReaders) { + public MultiReader(IndexReader... subReaders) throws IOException { initialize(subReaders, true); } @@ -61,14 +65,15 @@ * when this MultiReader is closed * @param subReaders set of (sub)readers */ - public MultiReader(IndexReader[] subReaders, boolean closeSubReaders) { + public MultiReader(IndexReader[] subReaders, boolean closeSubReaders) throws IOException { initialize(subReaders, closeSubReaders); } - private void initialize(IndexReader[] subReaders, boolean closeSubReaders) { + private void initialize(IndexReader[] subReaders, boolean closeSubReaders) throws IOException { this.subReaders = subReaders.clone(); starts = new int[subReaders.length + 1]; // build starts array decrefOnClose = new boolean[subReaders.length]; + for (int i = 0; i < subReaders.length; i++) { starts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); // compute maxDocs @@ -80,12 +85,34 @@ decrefOnClose[i] = false; } - if (subReaders[i].hasDeletions()) + if (subReaders[i].hasDeletions()) { hasDeletions = true; + } + + final ReaderUtil.Slice slice = new ReaderUtil.Slice(starts[i], + subReaders[i].maxDoc(), + i); + subReaderToSlice.put(subReaders[i], slice); } + starts[subReaders.length] = maxDoc; } - + + @Override + public long getUniqueTermCount() throws IOException { + throw new UnsupportedOperationException(""); + } + + @Override + public int getSubReaderDocBase(IndexReader subReader) { + return subReaderToSlice.get(subReader).start; + } + + @Override + public Fields fields() throws IOException { + throw new UnsupportedOperationException("please use MultiFields.getFields if you really need a top level Fields (NOTE that it's usually better to work per segment instead)"); + } + /** * Tries to reopen the subreaders. *
    @@ -128,6 +155,11 @@ } } + @Override + public Bits getDeletedDocs() throws IOException { + throw new UnsupportedOperationException("please use MultiFields.getDeletedDocs if you really need a top level Bits deletedDocs (NOTE that it's usually better to work per segment instead)"); + } + /** * If clone is true then we clone each of the subreaders * @param doClone @@ -367,8 +399,18 @@ total += subReaders[i].docFreq(t); return total; } - + @Override + public int docFreq(String field, BytesRef t) throws IOException { + ensureOpen(); + int total = 0; // sum freqs in segments + for (int i = 0; i < subReaders.length; i++) { + total += subReaders[i].docFreq(field, t); + } + return total; + } + + @Override public TermDocs termDocs() throws IOException { ensureOpen(); if (subReaders.length == 1) { Index: src/java/org/apache/lucene/index/NormsWriter.java =================================================================== --- src/java/org/apache/lucene/index/NormsWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/NormsWriter.java (working copy) @@ -35,6 +35,8 @@ * merges all of these together into a single _X.nrm file. */ +// TODO: Fix the unchecked collections, I do not understand the whole code here -- Uwe +@SuppressWarnings("unchecked") final class NormsWriter extends InvertedDocEndConsumer { private static final byte defaultNorm = Similarity.getDefault().encodeNormValue(1.0f); Index: src/java/org/apache/lucene/index/ParallelPostingsArray.java =================================================================== --- src/java/org/apache/lucene/index/ParallelPostingsArray.java (revision 931099) +++ src/java/org/apache/lucene/index/ParallelPostingsArray.java (working copy) @@ -1,5 +1,7 @@ package org.apache.lucene.index; +import org.apache.lucene.util.ArrayUtil; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -21,25 +23,49 @@ class ParallelPostingsArray { final static int BYTES_PER_POSTING = 3 * DocumentsWriter.INT_NUM_BYTE; + final int size; final int[] textStarts; final int[] intStarts; final int[] byteStarts; - - public ParallelPostingsArray(final int size) { + + ParallelPostingsArray(final int size) { + this.size = size; textStarts = new int[size]; intStarts = new int[size]; byteStarts = new int[size]; } - - ParallelPostingsArray resize(int newSize) { - ParallelPostingsArray newArray = new ParallelPostingsArray(newSize); - copy(this, newArray); + + int bytesPerPosting() { + return BYTES_PER_POSTING; + } + + ParallelPostingsArray newInstance(int size) { + return new ParallelPostingsArray(size); + } + + final ParallelPostingsArray grow() { + int newSize = ArrayUtil.oversize(size + 1, bytesPerPosting()); + ParallelPostingsArray newArray = newInstance(newSize); + copyTo(newArray, size); return newArray; } - - void copy(ParallelPostingsArray fromArray, ParallelPostingsArray toArray) { - System.arraycopy(fromArray.textStarts, 0, toArray.textStarts, 0, fromArray.textStarts.length); - System.arraycopy(fromArray.intStarts, 0, toArray.intStarts, 0, fromArray.intStarts.length); - System.arraycopy(fromArray.byteStarts, 0, toArray.byteStarts, 0, fromArray.byteStarts.length); + + final ParallelPostingsArray shrink(int targetSize, boolean doCopy) { + int shrinkSize = ArrayUtil.getShrinkSize(size, targetSize, bytesPerPosting()); + if (shrinkSize != size) { + ParallelPostingsArray newArray = newInstance(targetSize); + if (doCopy) { + copyTo(newArray, targetSize); + } + return newArray; + } else { + return this; + } } + + void copyTo(ParallelPostingsArray toArray, int numToCopy) { + System.arraycopy(textStarts, 0, toArray.textStarts, 0, numToCopy); + System.arraycopy(intStarts, 0, toArray.intStarts, 0, numToCopy); + System.arraycopy(byteStarts, 0, toArray.byteStarts, 0, numToCopy); + } } Index: src/java/org/apache/lucene/index/ParallelReader.java =================================================================== --- src/java/org/apache/lucene/index/ParallelReader.java (revision 931099) +++ src/java/org/apache/lucene/index/ParallelReader.java (working copy) @@ -21,7 +21,9 @@ import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.util.Bits; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close +import org.apache.lucene.util.BytesRef; import java.io.IOException; import java.util.*; @@ -56,6 +58,8 @@ private int numDocs; private boolean hasDeletions; + private ParallelFields fields = new ParallelFields(); + /** Construct a ParallelReader. *
    Note that all subreaders are closed if this ParallelReader is closed.
    */ @@ -122,9 +126,11 @@ Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); readerToFields.put(reader, fields); - for (final String field : fields) { // update fieldToReader map - if (fieldToReader.get(field) == null) + for (final String field : fields) { // update fieldToReader map + if (fieldToReader.get(field) == null) { fieldToReader.put(field, reader); + } + this.fields.addField(field, reader); } if (!ignoreStoredFields) @@ -136,6 +142,67 @@ } decrefOnClose.add(Boolean.valueOf(incRefReaders)); } + + private class ParallelFieldsEnum extends FieldsEnum { + String currentField; + IndexReader currentReader; + Iterator keys; + + ParallelFieldsEnum() { + keys = fieldToReader.keySet().iterator(); + } + + @Override + public String next() throws IOException { + if (keys.hasNext()) { + currentField = (String) keys.next(); + currentReader = (IndexReader) fieldToReader.get(currentField); + } else { + currentField = null; + currentReader = null; + } + return currentField; + } + + @Override + public TermsEnum terms() throws IOException { + assert currentReader != null; + Terms terms = MultiFields.getTerms(currentReader, currentField); + if (terms != null) { + return terms.iterator(); + } else { + return TermsEnum.EMPTY; + } + } + } + + // Single instance of this, per ParallelReader instance + private class ParallelFields extends Fields { + final HashMap fields = new HashMap(); + + public void addField(String field, IndexReader r) throws IOException { + fields.put(field, MultiFields.getFields(r).terms(field)); + } + + @Override + public FieldsEnum iterator() throws IOException { + return new ParallelFieldsEnum(); + } + @Override + public Terms terms(String field) throws IOException { + return fields.get(field); + } + } + + @Override + public Bits getDeletedDocs() throws IOException { + return MultiFields.getDeletedDocs(readers.get(0)); + } + + @Override + public Fields fields() { + return fields; + } @Override public synchronized Object clone() { @@ -404,6 +471,13 @@ } @Override + public int docFreq(String field, BytesRef term) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader == null? 
0 : reader.docFreq(field, term); + } + + @Override public TermDocs termDocs(Term term) throws IOException { ensureOpen(); return new ParallelTermDocs(term); @@ -501,6 +575,7 @@ return fieldSet; } + @Deprecated private class ParallelTermEnum extends TermEnum { private String field; private Iterator fieldIterator; Index: src/java/org/apache/lucene/index/ReadOnlyDirectoryReader.java =================================================================== --- src/java/org/apache/lucene/index/ReadOnlyDirectoryReader.java (revision 931099) +++ src/java/org/apache/lucene/index/ReadOnlyDirectoryReader.java (working copy) @@ -18,22 +18,23 @@ */ import org.apache.lucene.store.Directory; +import org.apache.lucene.index.codecs.CodecProvider; import java.io.IOException; import java.util.Map; class ReadOnlyDirectoryReader extends DirectoryReader { - ReadOnlyDirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, int termInfosIndexDivisor) throws IOException { - super(directory, sis, deletionPolicy, true, termInfosIndexDivisor); + ReadOnlyDirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { + super(directory, sis, deletionPolicy, true, termInfosIndexDivisor, codecs); } ReadOnlyDirectoryReader(Directory directory, SegmentInfos infos, SegmentReader[] oldReaders, int[] oldStarts, Map oldNormsCache, boolean doClone, - int termInfosIndexDivisor) throws IOException { - super(directory, infos, oldReaders, oldStarts, oldNormsCache, true, doClone, termInfosIndexDivisor); + int termInfosIndexDivisor, CodecProvider codecs) throws IOException { + super(directory, infos, oldReaders, oldStarts, oldNormsCache, true, doClone, termInfosIndexDivisor, codecs); } - ReadOnlyDirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor) throws IOException { - super(writer, infos, termInfosIndexDivisor); + ReadOnlyDirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor, CodecProvider codecs) throws IOException { + super(writer, infos, termInfosIndexDivisor, codecs); } @Override Index: src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfo.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -21,9 +21,13 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BitVector; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.HashSet; import java.util.HashMap; import java.util.ArrayList; import java.util.Collections; @@ -87,10 +91,13 @@ // (if it's an older index) private boolean hasProx; // True if this segment has any fields with omitTermFreqAndPositions==false + + private Codec codec; + private Map diagnostics; - public SegmentInfo(String name, int docCount, Directory dir) { + public SegmentInfo(String name, int docCount, Directory dir, Codec codec) { this.name = name; this.docCount = docCount; this.dir = dir; @@ -103,15 +110,13 @@ docStoreIsCompoundFile = false; delCount = 0; hasProx = true; + this.codec = codec; } - public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) { - this(name, docCount, dir, isCompoundFile, 
hasSingleNormFile, -1, null, false, true); - } - - public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, - int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx) { - this(name, docCount, dir); + public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, + int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx, + Codec codec) { + this(name, docCount, dir, codec); this.isCompoundFile = (byte) (isCompoundFile ? YES : NO); this.hasSingleNormFile = hasSingleNormFile; preLockless = false; @@ -119,6 +124,7 @@ this.docStoreSegment = docStoreSegment; this.docStoreIsCompoundFile = docStoreIsCompoundFile; this.hasProx = hasProx; + this.codec = codec; delCount = 0; assert docStoreOffset == -1 || docStoreSegment != null: "dso=" + docStoreOffset + " dss=" + docStoreSegment + " docCount=" + docCount; } @@ -144,6 +150,7 @@ isCompoundFile = src.isCompoundFile; hasSingleNormFile = src.hasSingleNormFile; delCount = src.delCount; + codec = src.codec; } void setDiagnostics(Map diagnostics) { @@ -162,10 +169,11 @@ * @param format format of the segments info file * @param input input handle to read segment info from */ - SegmentInfo(Directory dir, int format, IndexInput input) throws IOException { + SegmentInfo(Directory dir, int format, IndexInput input, CodecProvider codecs) throws IOException { this.dir = dir; name = input.readString(); docCount = input.readInt(); + final String codecName; if (format <= SegmentInfos.FORMAT_LOCKLESS) { delGen = input.readLong(); if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) { @@ -208,6 +216,13 @@ else hasProx = true; + // System.out.println(Thread.currentThread().getName() + ": si.read hasProx=" + hasProx + " seg=" + name); + + if (format <= SegmentInfos.FORMAT_FLEX_POSTINGS) + codecName = input.readString(); + else + codecName = "PreFlex"; + if (format <= SegmentInfos.FORMAT_DIAGNOSTICS) { diagnostics = input.readStringStringMap(); } else { @@ -224,8 +239,10 @@ docStoreSegment = null; delCount = -1; hasProx = true; + codecName = "PreFlex"; diagnostics = Collections.emptyMap(); } + codec = codecs.lookup(codecName); } void setNumFields(int numFields) { @@ -309,7 +326,7 @@ @Override public Object clone () { - SegmentInfo si = new SegmentInfo(name, docCount, dir); + SegmentInfo si = new SegmentInfo(name, docCount, dir, codec); si.isCompoundFile = isCompoundFile; si.delGen = delGen; si.delCount = delCount; @@ -323,6 +340,7 @@ si.docStoreOffset = docStoreOffset; si.docStoreSegment = docStoreSegment; si.docStoreIsCompoundFile = docStoreIsCompoundFile; + si.codec = codec; return si; } @@ -373,14 +391,12 @@ if (result == null) throw new IOException("cannot read directory " + dir + ": listAll() returned null"); - final IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); - String pattern; - pattern = name + ".s"; - int patternLength = pattern.length(); + final String pattern = name + ".s\\d+"; for(int i = 0; i < result.length; i++){ String fileName = result[i]; - if (filter.accept(null, fileName) && fileName.startsWith(pattern) && Character.isDigit(fileName.charAt(patternLength))) - return true; + if (fileName.matches(pattern)) { + return true; + } } return false; } @@ -550,6 +566,7 @@ output.writeByte(isCompoundFile); output.writeInt(delCount); output.writeByte((byte) (hasProx ? 
1:0)); + output.writeString(codec.name); output.writeStringStringMap(diagnostics); } @@ -562,7 +579,20 @@ return hasProx; } - private void addIfExists(List files, String fileName) throws IOException { + /** Can only be called once. */ + public void setCodec(Codec codec) { + assert this.codec == null; + if (codec == null) { + throw new IllegalArgumentException("codec must be non-null"); + } + this.codec = codec; + } + + Codec getCodec() { + return codec; + } + + private void addIfExists(Set files, String fileName) throws IOException { if (dir.fileExists(fileName)) files.add(fileName); } @@ -580,15 +610,17 @@ return files; } - files = new ArrayList(); + Set fileSet = new HashSet(); boolean useCompoundFile = getUseCompoundFile(); if (useCompoundFile) { - files.add(IndexFileNames.segmentFileName(name, IndexFileNames.COMPOUND_FILE_EXTENSION)); + fileSet.add(IndexFileNames.segmentFileName(name, IndexFileNames.COMPOUND_FILE_EXTENSION)); } else { - for (String ext : IndexFileNames.NON_STORE_INDEX_EXTENSIONS) - addIfExists(files, IndexFileNames.segmentFileName(name, ext)); + for(String ext : IndexFileNames.NON_STORE_INDEX_EXTENSIONS) { + addIfExists(fileSet, IndexFileNames.segmentFileName(name, ext)); + } + codec.files(dir, this, fileSet); } if (docStoreOffset != -1) { @@ -596,19 +628,19 @@ // vectors) with other segments assert docStoreSegment != null; if (docStoreIsCompoundFile) { - files.add(IndexFileNames.segmentFileName(docStoreSegment, IndexFileNames.COMPOUND_FILE_STORE_EXTENSION)); + fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, IndexFileNames.COMPOUND_FILE_STORE_EXTENSION)); } else { for (String ext : IndexFileNames.STORE_INDEX_EXTENSIONS) - addIfExists(files, IndexFileNames.segmentFileName(docStoreSegment, ext)); + addIfExists(fileSet, IndexFileNames.segmentFileName(docStoreSegment, ext)); } } else if (!useCompoundFile) { for (String ext : IndexFileNames.STORE_INDEX_EXTENSIONS) - addIfExists(files, IndexFileNames.segmentFileName(name, ext)); + addIfExists(fileSet, IndexFileNames.segmentFileName(name, ext)); } String delFileName = IndexFileNames.fileNameFromGeneration(name, IndexFileNames.DELETES_EXTENSION, delGen); if (delFileName != null && (delGen >= YES || dir.fileExists(delFileName))) { - files.add(delFileName); + fileSet.add(delFileName); } // Careful logic for norms files @@ -617,14 +649,14 @@ long gen = normGen[i]; if (gen >= YES) { // Definitely a separate norm file, with generation: - files.add(IndexFileNames.fileNameFromGeneration(name, IndexFileNames.SEPARATE_NORMS_EXTENSION + i, gen)); + fileSet.add(IndexFileNames.fileNameFromGeneration(name, IndexFileNames.SEPARATE_NORMS_EXTENSION + i, gen)); } else if (NO == gen) { // No separate norms but maybe plain norms // in the non compound file case: if (!hasSingleNormFile && !useCompoundFile) { String fileName = IndexFileNames.segmentFileName(name, IndexFileNames.PLAIN_NORMS_EXTENSION + i); if (dir.fileExists(fileName)) { - files.add(fileName); + fileSet.add(fileName); } } } else if (CHECK_DIR == gen) { @@ -636,7 +668,7 @@ fileName = IndexFileNames.segmentFileName(name, IndexFileNames.PLAIN_NORMS_EXTENSION + i); } if (fileName != null && dir.fileExists(fileName)) { - files.add(fileName); + fileSet.add(fileName); } } } @@ -644,20 +676,24 @@ // Pre-2.1: we have to scan the dir to find all // matching _X.sN/_X.fN files for our segment: String prefix; - if (useCompoundFile) + if (useCompoundFile) { prefix = IndexFileNames.segmentFileName(name, IndexFileNames.SEPARATE_NORMS_EXTENSION); - else + } else { prefix = 
IndexFileNames.segmentFileName(name, IndexFileNames.PLAIN_NORMS_EXTENSION); - int prefixLength = prefix.length(); + } + final String pattern = prefix + "\\d+"; + String[] allFiles = dir.listAll(); - final IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); for(int i=0;i prefixLength && Character.isDigit(fileName.charAt(prefixLength)) && fileName.startsWith(prefix)) { - files.add(fileName); + if (fileName.matches(pattern)) { + fileSet.add(fileName); } } } + + files = new ArrayList(fileSet); + return files; } Index: src/java/org/apache/lucene/index/SegmentInfos.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfos.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentInfos.java (working copy) @@ -23,6 +23,7 @@ import org.apache.lucene.store.ChecksumIndexOutput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.NoSuchDirectoryException; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.util.ThreadInterruptedException; import java.io.FileNotFoundException; @@ -88,9 +89,13 @@ /** This format adds optional per-segment String * diagnostics storage, and switches userData to Map */ public static final int FORMAT_DIAGNOSTICS = -9; + + /** Each segment records whether its postings are written + * in the new flex format */ + public static final int FORMAT_FLEX_POSTINGS = -10; /* This must always point to the most recent file format. */ - static final int CURRENT_FORMAT = FORMAT_DIAGNOSTICS; + static final int CURRENT_FORMAT = FORMAT_FLEX_POSTINGS; public int counter = 0; // used to name new segments /** @@ -228,7 +233,8 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public final void read(Directory directory, String segmentFileName) throws CorruptIndexException, IOException { + public final void read(Directory directory, String segmentFileName, + CodecProvider codecs) throws CorruptIndexException, IOException { boolean success = false; // Clear any previous segments: @@ -254,7 +260,7 @@ } for (int i = input.readInt(); i > 0; i--) { // read segmentInfos - add(new SegmentInfo(directory, format, input)); + add(new SegmentInfo(directory, format, input, codecs)); } if(format >= 0){ // in old format the version number may be at the end of the file @@ -301,14 +307,17 @@ * @throws IOException if there is a low-level IO error */ public final void read(Directory directory) throws CorruptIndexException, IOException { - + read(directory, CodecProvider.getDefault()); + } + + public final void read(Directory directory, final CodecProvider codecs) throws CorruptIndexException, IOException { generation = lastGeneration = -1; new FindSegmentsFile(directory) { @Override protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { - read(directory, segmentFileName); + read(directory, segmentFileName, codecs); return null; } }.run(); @@ -375,9 +384,11 @@ public Object clone() { SegmentInfos sis = (SegmentInfos) super.clone(); for(int i=0;i(userData); + sis.userData = new HashMap(userData); return sis; } @@ -399,7 +410,7 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public static long readCurrentVersion(Directory directory) + public static long readCurrentVersion(Directory directory, final CodecProvider codecs) throws CorruptIndexException, IOException { // Fully read the segments file: this ensures 
that it's @@ -417,10 +428,10 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public static Map readCurrentUserData(Directory directory) + public static Map readCurrentUserData(Directory directory, CodecProvider codecs) throws CorruptIndexException, IOException { SegmentInfos sis = new SegmentInfos(); - sis.read(directory); + sis.read(directory, codecs); return sis.getUserData(); } Index: src/java/org/apache/lucene/index/SegmentMergeInfo.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMergeInfo.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentMergeInfo.java (working copy) @@ -1,85 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -final class SegmentMergeInfo { - Term term; - int base; - int ord; // the position of the segment in a MultiReader - TermEnum termEnum; - IndexReader reader; - int delCount; - private TermPositions postings; // use getPositions() - private int[] docMap; // use getDocMap() - - SegmentMergeInfo(int b, TermEnum te, IndexReader r) - throws IOException { - base = b; - reader = r; - termEnum = te; - term = te.term(); - } - - // maps around deleted docs - int[] getDocMap() { - if (docMap == null) { - delCount = 0; - // build array which maps document numbers around deletions - if (reader.hasDeletions()) { - int maxDoc = reader.maxDoc(); - docMap = new int[maxDoc]; - int j = 0; - for (int i = 0; i < maxDoc; i++) { - if (reader.isDeleted(i)) { - delCount++; - docMap[i] = -1; - } else - docMap[i] = j++; - } - } - } - return docMap; - } - - TermPositions getPositions() throws IOException { - if (postings == null) { - postings = reader.termPositions(); - } - return postings; - } - - final boolean next() throws IOException { - if (termEnum.next()) { - term = termEnum.term(); - return true; - } else { - term = null; - return false; - } - } - - final void close() throws IOException { - termEnum.close(); - if (postings != null) { - postings.close(); - } -} -} - Index: src/java/org/apache/lucene/index/SegmentMergeQueue.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMergeQueue.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentMergeQueue.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import org.apache.lucene.util.PriorityQueue; - -final class SegmentMergeQueue extends PriorityQueue { - SegmentMergeQueue(int size) { - initialize(size); - } - - @Override - protected final boolean lessThan(SegmentMergeInfo stiA, SegmentMergeInfo stiB) { - int comparison = stiA.term.compareTo(stiB.term); - if (comparison == 0) - return stiA.base < stiB.base; - else - return comparison < 0; - } - - final void close() throws IOException { - while (top() != null) - pop().close(); - } - -} Index: src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMerger.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -20,15 +20,23 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; - +import java.util.Set; +import java.util.HashSet; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.index.MergePolicy.MergeAbortedException; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.MergeState; +import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.MultiBits; /** * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}, @@ -66,26 +74,15 @@ /** Maximum number of contiguous documents to bulk-copy when merging stored fields */ private final static int MAX_RAW_MERGE_DOCS = 4192; + + private final CodecProvider codecs; + private Codec codec; + private SegmentWriteState segmentWriteState; - /** This ctor used only by test code. 
- * - * @param dir The Directory to merge the other segments into - * @param name The name of the new segment - */ - SegmentMerger(Directory dir, String name) { + SegmentMerger(Directory dir, int termIndexInterval, String name, MergePolicy.OneMerge merge, CodecProvider codecs) { directory = dir; + this.codecs = codecs; segment = name; - checkAbort = new CheckAbort(null, null) { - @Override - public void work(double units) throws MergeAbortedException { - // do nothing - } - }; - } - - SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) { - directory = writer.getDirectory(); - segment = name; if (merge != null) { checkAbort = new CheckAbort(merge, directory); } else { @@ -96,7 +93,7 @@ } }; } - termIndexInterval = writer.getConfig().getTermIndexInterval(); + this.termIndexInterval = termIndexInterval; } boolean hasProx() { @@ -171,30 +168,27 @@ } } - final List createCompoundFile(String fileName) + final List createCompoundFile(String fileName, final SegmentInfo info) throws IOException { - CompoundFileWriter cfsWriter = - new CompoundFileWriter(directory, fileName, checkAbort); + CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName, checkAbort); - List files = - new ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.length + 1); - - // Basic files - for (String ext : IndexFileNames.COMPOUND_EXTENSIONS) { - if (ext.equals(IndexFileNames.PROX_EXTENSION) && !hasProx()) - continue; + Set fileSet = new HashSet(); + // Basic files + for (String ext : IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC) { if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) && - !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) - files.add(IndexFileNames.segmentFileName(segment, ext)); + !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) + fileSet.add(IndexFileNames.segmentFileName(segment, ext)); } + codec.files(directory, info, fileSet); + // Fieldable norm files int numFIs = fieldInfos.size(); for (int i = 0; i < numFIs; i++) { FieldInfo fi = fieldInfos.fieldInfo(i); if (fi.isIndexed && !fi.omitNorms) { - files.add(IndexFileNames.segmentFileName(segment, IndexFileNames.NORMS_EXTENSION)); + fileSet.add(IndexFileNames.segmentFileName(segment, IndexFileNames.NORMS_EXTENSION)); break; } } @@ -202,19 +196,19 @@ // Vector files if (fieldInfos.hasVectors() && mergeDocStores) { for (String ext : IndexFileNames.VECTOR_EXTENSIONS) { - files.add(IndexFileNames.segmentFileName(segment, ext)); + fileSet.add(IndexFileNames.segmentFileName(segment, ext)); } } // Now merge all added files - for (String file : files) { + for (String file : fileSet) { cfsWriter.addFile(file); } // Perform the merge cfsWriter.close(); - return files; + return new ArrayList(fileSet); } private void addIndexed(IndexReader reader, FieldInfos fInfos, @@ -351,14 +345,17 @@ // details. 
throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption"); - } else + } else { // If we are skipping the doc stores, that means there // are no deletions in any of these segments, so we // just sum numDocs() of each segment to get total docCount for (final IndexReader reader : readers) { docCount += reader.numDocs(); } + } + segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, null, docCount, 0, termIndexInterval, codecs); + return docCount; } @@ -552,156 +549,116 @@ } } - private SegmentMergeQueue queue = null; + Codec getCodec() { + return codec; + } private final void mergeTerms() throws CorruptIndexException, IOException { - SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval); + // Let CodecProvider decide which codec will be used to write + // the new segment: + codec = codecs.getWriter(segmentWriteState); + + int docBase = 0; - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); + final List fields = new ArrayList(); + final List subReaders = new ArrayList(); + final List slices = new ArrayList(); + final List bits = new ArrayList(); + final List bitsStarts = new ArrayList(); - try { - queue = new SegmentMergeQueue(readers.size()); - - mergeTermInfos(consumer); - - } finally { - consumer.finish(); - if (queue != null) queue.close(); + final int numReaders = readers.size(); + for(int i=0;i 0) { - int matchSize = 0; // pop matching terms - match[matchSize++] = queue.pop(); - Term term = match[0].term; - SegmentMergeInfo top = queue.top(); + starts[i] = inputDocBase; - while (top != null && term.compareTo(top.term) == 0) { - match[matchSize++] = queue.pop(); - top = queue.top(); + mergeState.delCounts[i] = reader.numDeletedDocs(); + mergeState.docBase[i] = docBase; + docBase += reader.numDocs(); + inputDocBase += reader.maxDoc(); + if (mergeState.delCounts[i] != 0) { + int delCount = 0; + Bits deletedDocs = reader.getDeletedDocs(); + final int maxDoc = reader.maxDoc(); + final int[] docMap = mergeState.docMaps[i] = new int[maxDoc]; + int newDocID = 0; + for(int j=0;j files() throws IOException { return new ArrayList(si.files()); } - + @Override - public TermEnum terms() { + public TermEnum terms() throws IOException { ensureOpen(); - return core.getTermsReader().terms(); + if (core.isPreFlex) { + // For old API on an old segment, instead of + // converting old API -> new API -> old API, just give + // direct access to old: + return ((PreFlexFields) core.fields).tis.terms(); + } else { + // Emulate pre-flex API on top of flex index + return new LegacyTermEnum(null); + } } + /** @deprecated Please switch to the flex API ({@link + * #fields}) instead. 
*/ + @Deprecated @Override public TermEnum terms(Term t) throws IOException { ensureOpen(); - return core.getTermsReader().terms(t); + if (core.isPreFlex) { + // For old API on an old segment, instead of + // converting old API -> new API -> old API, just give + // direct access to old: + return ((PreFlexFields) core.fields).tis.terms(t); + } else { + // Emulate pre-flex API on top of flex index + return new LegacyTermEnum(t); + } } FieldInfos fieldInfos() { @@ -887,6 +870,9 @@ return (deletedDocs != null && deletedDocs.get(n)); } + /** @deprecated Switch to the flex API ({@link + * IndexReader#termDocsEnum}) instead. */ + @Deprecated @Override public TermDocs termDocs(Term term) throws IOException { if (term == null) { @@ -895,30 +881,76 @@ return super.termDocs(term); } } + + @Override + public Fields fields() throws IOException { + return core.fields; + } + /** @deprecated Switch to the flex API {@link + * IndexReader#termDocsEnum} instead. */ + @Deprecated @Override public TermDocs termDocs() throws IOException { ensureOpen(); - return new SegmentTermDocs(this); + if (core.isPreFlex) { + // For old API on an old segment, instead of + // converting old API -> new API -> old API, just give + // direct access to old: + final PreFlexFields pre = (PreFlexFields) core.fields; + SegmentTermDocs std = new SegmentTermDocs(pre.freqStream, pre.tis, core.fieldInfos); + std.setSkipDocs(deletedDocs); + return std; + } else { + // Emulate old API + return new LegacyTermDocs(); + } } + /** @deprecated Switch to the flex API {@link + * IndexReader#termDocsEnum} instead */ + @Deprecated @Override public TermPositions termPositions() throws IOException { ensureOpen(); - return new SegmentTermPositions(this); + if (core.isPreFlex) { + // For old API on an old segment, instead of + // converting old API -> new API -> old API, just give + // direct access to old: + final PreFlexFields pre = (PreFlexFields) core.fields; + SegmentTermPositions stp = new SegmentTermPositions(pre.freqStream, pre.proxStream, pre.tis, core.fieldInfos); + stp.setSkipDocs(deletedDocs); + return stp; + } else { + // Emulate old API + return new LegacyTermPositions(); + } } @Override public int docFreq(Term t) throws IOException { ensureOpen(); - TermInfo ti = core.getTermsReader().get(t); - if (ti != null) - return ti.docFreq; - else + Terms terms = core.fields.terms(t.field); + if (terms != null) { + return terms.docFreq(new BytesRef(t.text)); + } else { return 0; + } } @Override + public int docFreq(String field, BytesRef term) throws IOException { + ensureOpen(); + + Terms terms = core.fields.terms(field); + if (terms != null) { + return terms.docFreq(term); + } else { + return 0; + } + } + + @Override public int numDocs() { // Don't call ensureOpen() here (it could affect performance) int n = maxDoc(); @@ -1078,17 +1110,13 @@ } } - boolean termsIndexLoaded() { - return core.termsIndexIsLoaded(); - } - // NOTE: only called from IndexWriter when a near // real-time reader is opened, or applyDeletes is run, // sharing a segment that's still being merged. This // method is not thread safe, and relies on the // synchronization in IndexWriter - void loadTermsIndex(int termsIndexDivisor) throws IOException { - core.loadTermsIndex(si, termsIndexDivisor); + void loadTermsIndex(int indexDivisor) throws IOException { + core.fields.loadTermsIndex(indexDivisor); } // for testing only @@ -1266,14 +1294,9 @@ // same entry in the FieldCache. See LUCENE-1579. 
@Override public final Object getFieldCacheKey() { - return core.freqStream; + return core; } - - @Override - public long getUniqueTermCount() { - return core.getTermsReader().size(); - } - + /** * Lotsa tests did hacks like:
    * SegmentReader reader = (SegmentReader) IndexReader.open(dir);
    @@ -1283,7 +1306,7 @@ */ @Deprecated static SegmentReader getOnlySegmentReader(Directory dir) throws IOException { - return getOnlySegmentReader(IndexReader.open(dir,false)); + return getOnlySegmentReader(IndexReader.open(dir, false)); } static SegmentReader getOnlySegmentReader(IndexReader reader) { @@ -1305,4 +1328,372 @@ public int getTermInfosIndexDivisor() { return core.termsIndexDivisor; } + + // Back compat: pre-flex TermEnum API over flex API + @Deprecated + final private class LegacyTermEnum extends TermEnum { + FieldsEnum fields; + TermsEnum terms; + boolean done; + String currentField; + BytesRef currentTerm; + + public LegacyTermEnum(Term t) throws IOException { + fields = core.fields.iterator(); + currentField = fields.next(); + if (currentField == null) { + // no fields + done = true; + } else if (t != null) { + // Pre-seek to this term + + while(currentField.compareTo(t.field) < 0) { + currentField = fields.next(); + if (currentField == null) { + // Hit end of fields + done = true; + break; + } + } + + if (!done) { + // We found some field -- get its terms: + terms = fields.terms(); + + if (currentField == t.field) { + // We found exactly the requested field; now + // seek the term text: + String text = t.text(); + + // this is only for backwards compatibility. + // previously you could supply a term with unpaired surrogates, + // and it would return the next Term. + // if someone does this, tack on the lowest possible trail surrogate. + // this emulates the old behavior, and forms "valid UTF-8" unicode. + BytesRef tr = new BytesRef(UnicodeUtil.nextValidUTF16String(text)); + TermsEnum.SeekStatus status = terms.seek(tr); + + if (status == TermsEnum.SeekStatus.END) { + // Rollover to the next field + terms = null; + next(); + } else if (status == TermsEnum.SeekStatus.FOUND) { + // Found exactly the term + currentTerm = tr; + } else { + // Found another term, in this same field + currentTerm = terms.term(); + } + } else { + // We didn't find exact field (we found the + // following field); advance to first term in + // this field + next(); + } + } + } else { + terms = fields.terms(); + } + } + + @Override + public boolean next() throws IOException { + + if (done) { + return false; + } + + while(true) { + if (terms == null) { + // Advance to the next field + currentField = fields.next(); + if (currentField == null) { + done = true; + return false; + } + terms = fields.terms(); + } + currentTerm = terms.next(); + if (currentTerm != null) { + // This field still has terms + return true; + } else { + // Done producing terms from this field; advance + // to next field + terms = null; + } + } + } + + @Override + public Term term() { + if (!done && terms != null && currentTerm != null) { + return new Term(currentField, currentTerm.utf8ToString()); + } + return null; + } + + @Override + public int docFreq() { + return terms == null ? 
0 : terms.docFreq(); + } + + @Override + public void close() {} + } + + // Back compat: emulates legacy TermDocs API on top of + // flex API + private class LegacyTermDocs implements TermDocs { + + String currentField; + final Fields fields; + TermsEnum terms; + DocsEnum docsEnum; + boolean any; + + LegacyTermDocs() throws IOException { + fields = core.fields; + } + + public void close() {} + + public void seek(TermEnum termEnum) throws IOException { + seek(termEnum.term()); + } + + public boolean skipTo(int target) throws IOException { + if (!any) { + return false; + } else { + return docsEnum.advance(target) != docsEnum.NO_MORE_DOCS; + } + } + + public void seek(Term term) throws IOException { + + any = false; + + if (terms != null && !term.field.equals(currentField)) { + // new field + terms = null; + } + + if (terms == null) { + currentField = term.field; + Terms terms1 = fields.terms(currentField); + if (terms1 == null) { + // no such field + return; + } else { + terms = terms1.iterator(); + } + } + + if (terms.seek(new BytesRef(term.text)) == TermsEnum.SeekStatus.FOUND) { + // Term exists + any = true; + pendingBulkResult = null; + docsEnum = terms.docs(deletedDocs, docsEnum); + } + } + + public int doc() { + if (!any) { + return 0; + } else { + return docsEnum.docID(); + } + } + + private DocsEnum.BulkReadResult pendingBulkResult; + private int bulkCount; + private int pendingBulk; + + public int read(int[] docs, int[] freqs) throws IOException { + if (any && pendingBulkResult == null) { + pendingBulkResult = docsEnum.getBulkResult(); + } + if (!any) { + return 0; + } else if (pendingBulk > 0) { + final int left = bulkCount - pendingBulk; + if (docs.length >= left) { + // read all pending + System.arraycopy(pendingBulkResult.docs.ints, pendingBulk, docs, 0, left); + System.arraycopy(pendingBulkResult.freqs.ints, pendingBulk, freqs, 0, left); + pendingBulk = 0; + return left; + } else { + // read only part of pending + System.arraycopy(pendingBulkResult.docs.ints, pendingBulk, docs, 0, docs.length); + System.arraycopy(pendingBulkResult.freqs.ints, pendingBulk, freqs, 0, docs.length); + pendingBulk += docs.length; + return docs.length; + } + } else { + // nothing pending + bulkCount = docsEnum.read(); + if (docs.length >= bulkCount) { + System.arraycopy(pendingBulkResult.docs.ints, 0, docs, 0, bulkCount); + System.arraycopy(pendingBulkResult.freqs.ints, 0, freqs, 0, bulkCount); + return bulkCount; + } else { + System.arraycopy(pendingBulkResult.docs.ints, 0, docs, 0, docs.length); + System.arraycopy(pendingBulkResult.freqs.ints, 0, freqs, 0, docs.length); + pendingBulk = docs.length; + return docs.length; + } + } + } + + public int freq() { + if (!any) { + return 0; + } else { + return docsEnum.freq(); + } + } + + public boolean next() throws IOException { + if (!any) { + return false; + } else { + return docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS; + } + } + } + + // Back compat: implements legacy TermPositions API on top + // of flex API + final private class LegacyTermPositions implements TermPositions { + + String currentField; + final Fields fields; + TermsEnum terms; + DocsAndPositionsEnum postingsEnum; + DocsEnum docsEnum; + boolean any; + + LegacyTermPositions() throws IOException { + fields = core.fields; + } + + public void close() {} + + public void seek(TermEnum termEnum) throws IOException { + seek(termEnum.term()); + } + + public boolean skipTo(int target) throws IOException { + if (!any) { + return false; + } else { + return docsEnum.advance(target) != 
docsEnum.NO_MORE_DOCS; + } + } + + public void seek(Term term) throws IOException { + + any = false; + + if (terms != null && !term.field.equals(currentField)) { + // new field + terms = null; + } + + if (terms == null) { + currentField = term.field; + Terms terms1 = fields.terms(currentField); + if (terms1 == null) { + // no such field + return; + } else { + terms = terms1.iterator(); + } + } + + if (terms.seek(new BytesRef(term.text)) == TermsEnum.SeekStatus.FOUND) { + // Term exists + any = true; + postingsEnum = terms.docsAndPositions(deletedDocs, postingsEnum); + if (postingsEnum == null) { + docsEnum = terms.docs(deletedDocs, postingsEnum); + } else { + docsEnum = postingsEnum; + } + } + } + + public int doc() { + if (!any) { + return 0; + } else { + return docsEnum.docID(); + } + } + + public int freq() { + if (!any) { + return 0; + } else { + return docsEnum.freq(); + } + } + + public boolean next() throws IOException { + if (!any) { + return false; + } else { + return docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS; + } + } + + public int read(int[] docs, int[] freqs) throws IOException { + throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead."); + } + + public int nextPosition() throws IOException { + if (!any || postingsEnum == null) { + return 0; + } else { + return postingsEnum.nextPosition(); + } + } + + public int getPayloadLength() { + if (!any || postingsEnum == null) { + return 0; + } else { + return postingsEnum.getPayloadLength(); + } + } + + public byte[] getPayload(byte[] bytes, int offset) throws IOException { + if (!any || postingsEnum == null) { + return null; + } + final BytesRef payload = postingsEnum.getPayload(); + // old API would always used passed in bytes if it + // "fits", else allocate new: + if (bytes != null && payload.length <= bytes.length - offset) { + System.arraycopy(payload.bytes, payload.offset, bytes, offset, payload.length); + return bytes; + } else if (payload.offset == 0 && payload.length == payload.bytes.length) { + return payload.bytes; + } else { + final byte[] retBytes = new byte[payload.length]; + System.arraycopy(payload.bytes, payload.offset, retBytes, 0, payload.length); + return retBytes; + } + } + + public boolean isPayloadAvailable() { + if (!any || postingsEnum == null) { + return false; + } else { + return postingsEnum.hasPayload(); + } + } + } } Index: src/java/org/apache/lucene/index/SegmentTermDocs.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermDocs.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentTermDocs.java (working copy) @@ -1,212 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import org.apache.lucene.util.BitVector; -import org.apache.lucene.store.IndexInput; - -class SegmentTermDocs implements TermDocs { - protected SegmentReader parent; - protected IndexInput freqStream; - protected int count; - protected int df; - protected BitVector deletedDocs; - int doc = 0; - int freq; - - private int skipInterval; - private int maxSkipLevels; - private DefaultSkipListReader skipListReader; - - private long freqBasePointer; - private long proxBasePointer; - - private long skipPointer; - private boolean haveSkipped; - - protected boolean currentFieldStoresPayloads; - protected boolean currentFieldOmitTermFreqAndPositions; - - protected SegmentTermDocs(SegmentReader parent) { - this.parent = parent; - this.freqStream = (IndexInput) parent.core.freqStream.clone(); - synchronized (parent) { - this.deletedDocs = parent.deletedDocs; - } - this.skipInterval = parent.core.getTermsReader().getSkipInterval(); - this.maxSkipLevels = parent.core.getTermsReader().getMaxSkipLevels(); - } - - public void seek(Term term) throws IOException { - TermInfo ti = parent.core.getTermsReader().get(term); - seek(ti, term); - } - - public void seek(TermEnum termEnum) throws IOException { - TermInfo ti; - Term term; - - // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs - if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.core.fieldInfos) { // optimized case - SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum); - term = segmentTermEnum.term(); - ti = segmentTermEnum.termInfo(); - } else { // punt case - term = termEnum.term(); - ti = parent.core.getTermsReader().get(term); - } - - seek(ti, term); - } - - void seek(TermInfo ti, Term term) throws IOException { - count = 0; - FieldInfo fi = parent.core.fieldInfos.fieldInfo(term.field); - currentFieldOmitTermFreqAndPositions = (fi != null) ? fi.omitTermFreqAndPositions : false; - currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false; - if (ti == null) { - df = 0; - } else { - df = ti.docFreq; - doc = 0; - freqBasePointer = ti.freqPointer; - proxBasePointer = ti.proxPointer; - skipPointer = freqBasePointer + ti.skipOffset; - freqStream.seek(freqBasePointer); - haveSkipped = false; - } - } - - public void close() throws IOException { - freqStream.close(); - if (skipListReader != null) - skipListReader.close(); - } - - public final int doc() { return doc; } - public final int freq() { return freq; } - - protected void skippingDoc() throws IOException { - } - - public boolean next() throws IOException { - while (true) { - if (count == df) - return false; - final int docCode = freqStream.readVInt(); - - if (currentFieldOmitTermFreqAndPositions) { - doc += docCode; - freq = 1; - } else { - doc += docCode >>> 1; // shift off low bit - if ((docCode & 1) != 0) // if low bit is set - freq = 1; // freq is one - else - freq = freqStream.readVInt(); // else read freq - } - - count++; - - if (deletedDocs == null || !deletedDocs.get(doc)) - break; - skippingDoc(); - } - return true; - } - - /** Optimized implementation. 
*/ - public int read(final int[] docs, final int[] freqs) - throws IOException { - final int length = docs.length; - if (currentFieldOmitTermFreqAndPositions) { - return readNoTf(docs, freqs, length); - } else { - int i = 0; - while (i < length && count < df) { - // manually inlined call to next() for speed - final int docCode = freqStream.readVInt(); - doc += docCode >>> 1; // shift off low bit - if ((docCode & 1) != 0) // if low bit is set - freq = 1; // freq is one - else - freq = freqStream.readVInt(); // else read freq - count++; - - if (deletedDocs == null || !deletedDocs.get(doc)) { - docs[i] = doc; - freqs[i] = freq; - ++i; - } - } - return i; - } - } - - private final int readNoTf(final int[] docs, final int[] freqs, final int length) throws IOException { - int i = 0; - while (i < length && count < df) { - // manually inlined call to next() for speed - doc += freqStream.readVInt(); - count++; - - if (deletedDocs == null || !deletedDocs.get(doc)) { - docs[i] = doc; - // Hardware freq to 1 when term freqs were not - // stored in the index - freqs[i] = 1; - ++i; - } - } - return i; - } - - - /** Overridden by SegmentTermPositions to skip in prox stream. */ - protected void skipProx(long proxPointer, int payloadLength) throws IOException {} - - /** Optimized implementation. */ - public boolean skipTo(int target) throws IOException { - if (df >= skipInterval) { // optimized case - if (skipListReader == null) - skipListReader = new DefaultSkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone - - if (!haveSkipped) { // lazily initialize skip stream - skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads); - haveSkipped = true; - } - - int newCount = skipListReader.skipTo(target); - if (newCount > count) { - freqStream.seek(skipListReader.getFreqPointer()); - skipProx(skipListReader.getProxPointer(), skipListReader.getPayloadLength()); - - doc = skipListReader.getDoc(); - count = newCount; - } - } - - // done skipping, now just scan - do { - if (!next()) - return false; - } while (target > doc); - return true; - } -} Index: src/java/org/apache/lucene/index/SegmentTermEnum.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermEnum.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentTermEnum.java (working copy) @@ -1,216 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import org.apache.lucene.store.IndexInput; - -final class SegmentTermEnum extends TermEnum implements Cloneable { - private IndexInput input; - FieldInfos fieldInfos; - long size; - long position = -1; - - private TermBuffer termBuffer = new TermBuffer(); - private TermBuffer prevBuffer = new TermBuffer(); - private TermBuffer scanBuffer = new TermBuffer(); // used for scanning - - private TermInfo termInfo = new TermInfo(); - - private int format; - private boolean isIndex = false; - long indexPointer = 0; - int indexInterval; - int skipInterval; - int maxSkipLevels; - private int formatM1SkipInterval; - - SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi) - throws CorruptIndexException, IOException { - input = i; - fieldInfos = fis; - isIndex = isi; - maxSkipLevels = 1; // use single-level skip lists for formats > -3 - - int firstInt = input.readInt(); - if (firstInt >= 0) { - // original-format file, without explicit format version number - format = 0; - size = firstInt; - - // back-compatible settings - indexInterval = 128; - skipInterval = Integer.MAX_VALUE; // switch off skipTo optimization - } else { - // we have a format version number - format = firstInt; - - // check that it is a format we can understand - if (format < TermInfosWriter.FORMAT_CURRENT) - throw new CorruptIndexException("Unknown format version:" + format + " expected " + TermInfosWriter.FORMAT_CURRENT + " or higher"); - - size = input.readLong(); // read the size - - if(format == -1){ - if (!isIndex) { - indexInterval = input.readInt(); - formatM1SkipInterval = input.readInt(); - } - // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in - // skipTo implementation of these versions - skipInterval = Integer.MAX_VALUE; - } else { - indexInterval = input.readInt(); - skipInterval = input.readInt(); - if (format <= TermInfosWriter.FORMAT) { - // this new format introduces multi-level skipping - maxSkipLevels = input.readInt(); - } - } - assert indexInterval > 0: "indexInterval=" + indexInterval + " is negative; must be > 0"; - assert skipInterval > 0: "skipInterval=" + skipInterval + " is negative; must be > 0"; - } - if (format > TermInfosWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { - termBuffer.setPreUTF8Strings(); - scanBuffer.setPreUTF8Strings(); - prevBuffer.setPreUTF8Strings(); - } - } - - @Override - protected Object clone() { - SegmentTermEnum clone = null; - try { - clone = (SegmentTermEnum) super.clone(); - } catch (CloneNotSupportedException e) {} - - clone.input = (IndexInput) input.clone(); - clone.termInfo = new TermInfo(termInfo); - - clone.termBuffer = (TermBuffer)termBuffer.clone(); - clone.prevBuffer = (TermBuffer)prevBuffer.clone(); - clone.scanBuffer = new TermBuffer(); - - return clone; - } - - final void seek(long pointer, long p, Term t, TermInfo ti) - throws IOException { - input.seek(pointer); - position = p; - termBuffer.set(t); - prevBuffer.reset(); - termInfo.set(ti); - } - - /** Increments the enumeration to the next element. 
True if one exists.*/ - @Override - public final boolean next() throws IOException { - if (position++ >= size - 1) { - prevBuffer.set(termBuffer); - termBuffer.reset(); - return false; - } - - prevBuffer.set(termBuffer); - termBuffer.read(input, fieldInfos); - - termInfo.docFreq = input.readVInt(); // read doc freq - termInfo.freqPointer += input.readVLong(); // read freq pointer - termInfo.proxPointer += input.readVLong(); // read prox pointer - - if(format == -1){ - // just read skipOffset in order to increment file pointer; - // value is never used since skipTo is switched off - if (!isIndex) { - if (termInfo.docFreq > formatM1SkipInterval) { - termInfo.skipOffset = input.readVInt(); - } - } - } - else{ - if (termInfo.docFreq >= skipInterval) - termInfo.skipOffset = input.readVInt(); - } - - if (isIndex) - indexPointer += input.readVLong(); // read index pointer - - return true; - } - - /** Optimized scan, without allocating new terms. - * Return number of invocations to next(). */ - final int scanTo(Term term) throws IOException { - scanBuffer.set(term); - int count = 0; - while (scanBuffer.compareTo(termBuffer) > 0 && next()) { - count++; - } - return count; - } - - /** Returns the current Term in the enumeration. - Initially invalid, valid after next() called for the first time.*/ - @Override - public final Term term() { - return termBuffer.toTerm(); - } - - /** Returns the previous Term enumerated. Initially null.*/ - final Term prev() { - return prevBuffer.toTerm(); - } - - /** Returns the current TermInfo in the enumeration. - Initially invalid, valid after next() called for the first time.*/ - final TermInfo termInfo() { - return new TermInfo(termInfo); - } - - /** Sets the argument to the current TermInfo in the enumeration. - Initially invalid, valid after next() called for the first time.*/ - final void termInfo(TermInfo ti) { - ti.set(termInfo); - } - - /** Returns the docFreq from the current TermInfo in the enumeration. - Initially invalid, valid after next() called for the first time.*/ - @Override - public final int docFreq() { - return termInfo.docFreq; - } - - /* Returns the freqPointer from the current TermInfo in the enumeration. - Initially invalid, valid after next() called for the first time.*/ - final long freqPointer() { - return termInfo.freqPointer; - } - - /* Returns the proxPointer from the current TermInfo in the enumeration. - Initially invalid, valid after next() called for the first time.*/ - final long proxPointer() { - return termInfo.proxPointer; - } - - /** Closes the enumeration to further activity, freeing resources. */ - @Override - public final void close() throws IOException { - input.close(); - } -} Index: src/java/org/apache/lucene/index/SegmentTermPositions.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermPositions.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentTermPositions.java (working copy) @@ -1,203 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.store.IndexInput; - -import java.io.IOException; - -final class SegmentTermPositions -extends SegmentTermDocs implements TermPositions { - private IndexInput proxStream; - private int proxCount; - private int position; - - // the current payload length - private int payloadLength; - // indicates whether the payload of the current position has - // been read from the proxStream yet - private boolean needToLoadPayload; - - // these variables are being used to remember information - // for a lazy skip - private long lazySkipPointer = -1; - private int lazySkipProxCount = 0; - - SegmentTermPositions(SegmentReader p) { - super(p); - this.proxStream = null; // the proxStream will be cloned lazily when nextPosition() is called for the first time - } - - @Override - final void seek(TermInfo ti, Term term) throws IOException { - super.seek(ti, term); - if (ti != null) - lazySkipPointer = ti.proxPointer; - - lazySkipProxCount = 0; - proxCount = 0; - payloadLength = 0; - needToLoadPayload = false; - } - - @Override - public final void close() throws IOException { - super.close(); - if (proxStream != null) proxStream.close(); - } - - public final int nextPosition() throws IOException { - if (currentFieldOmitTermFreqAndPositions) - // This field does not store term freq, positions, payloads - return 0; - // perform lazy skips if necessary - lazySkip(); - proxCount--; - return position += readDeltaPosition(); - } - - private final int readDeltaPosition() throws IOException { - int delta = proxStream.readVInt(); - if (currentFieldStoresPayloads) { - // if the current field stores payloads then - // the position delta is shifted one bit to the left. - // if the LSB is set, then we have to read the current - // payload length - if ((delta & 1) != 0) { - payloadLength = proxStream.readVInt(); - } - delta >>>= 1; - needToLoadPayload = true; - } - return delta; - } - - @Override - protected final void skippingDoc() throws IOException { - // we remember to skip a document lazily - lazySkipProxCount += freq; - } - - @Override - public final boolean next() throws IOException { - // we remember to skip the remaining positions of the current - // document lazily - lazySkipProxCount += proxCount; - - if (super.next()) { // run super - proxCount = freq; // note frequency - position = 0; // reset position - return true; - } - return false; - } - - @Override - public final int read(final int[] docs, final int[] freqs) { - throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead."); - } - - - /** Called by super.skipTo(). 
*/ - @Override - protected void skipProx(long proxPointer, int payloadLength) throws IOException { - // we save the pointer, we might have to skip there lazily - lazySkipPointer = proxPointer; - lazySkipProxCount = 0; - proxCount = 0; - this.payloadLength = payloadLength; - needToLoadPayload = false; - } - - private void skipPositions(int n) throws IOException { - assert !currentFieldOmitTermFreqAndPositions; - for (int f = n; f > 0; f--) { // skip unread positions - readDeltaPosition(); - skipPayload(); - } - } - - private void skipPayload() throws IOException { - if (needToLoadPayload && payloadLength > 0) { - proxStream.seek(proxStream.getFilePointer() + payloadLength); - } - needToLoadPayload = false; - } - - // It is not always necessary to move the prox pointer - // to a new document after the freq pointer has been moved. - // Consider for example a phrase query with two terms: - // the freq pointer for term 1 has to move to document x - // to answer the question if the term occurs in that document. But - // only if term 2 also matches document x, the positions have to be - // read to figure out if term 1 and term 2 appear next - // to each other in document x and thus satisfy the query. - // So we move the prox pointer lazily to the document - // as soon as positions are requested. - private void lazySkip() throws IOException { - if (proxStream == null) { - // clone lazily - proxStream = (IndexInput) parent.core.proxStream.clone(); - } - - // we might have to skip the current payload - // if it was not read yet - skipPayload(); - - if (lazySkipPointer != -1) { - proxStream.seek(lazySkipPointer); - lazySkipPointer = -1; - } - - if (lazySkipProxCount != 0) { - skipPositions(lazySkipProxCount); - lazySkipProxCount = 0; - } - } - - public int getPayloadLength() { - return payloadLength; - } - - public byte[] getPayload(byte[] data, int offset) throws IOException { - if (!needToLoadPayload) { - throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); - } - - // read payloads lazily - byte[] retArray; - int retOffset; - if (data == null || data.length - offset < payloadLength) { - // the array is too small to store the payload data, - // so we allocate a new one - retArray = new byte[payloadLength]; - retOffset = 0; - } else { - retArray = data; - retOffset = offset; - } - proxStream.readBytes(retArray, retOffset, payloadLength); - needToLoadPayload = false; - return retArray; - } - - public boolean isPayloadAvailable() { - return needToLoadPayload && payloadLength > 0; - } - -} Index: src/java/org/apache/lucene/index/SegmentWriteState.java =================================================================== --- src/java/org/apache/lucene/index/SegmentWriteState.java (revision 931099) +++ src/java/org/apache/lucene/index/SegmentWriteState.java (working copy) @@ -19,32 +19,63 @@ import java.util.HashSet; import java.util.Collection; +import java.io.PrintStream; import org.apache.lucene.store.Directory; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; -class SegmentWriteState { - DocumentsWriter docWriter; - Directory directory; - String segmentName; - String docStoreSegmentName; - int numDocs; - int termIndexInterval; - int numDocsInStore; - Collection flushedFiles; +/** + * This class is not meant for public usage; it's only + * public in order to expose access across packages. It's + * used internally when updating the index. 
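A minimal sketch (not part of this patch) of how the CodecProvider plumbing added here might be exercised, assuming only the signatures visible in these hunks: SegmentInfos.read(Directory, CodecProvider), the new public SegmentWriteState constructor, and CodecProvider.getWriter(SegmentWriteState). The class name CodecPlumbingSketch, the segment name "_1", and the doc-count and term-index-interval values are invented placeholders.

    // Sketch only: not from the patch; names and numeric values are placeholders.
    package org.apache.lucene.index;   // placed here so package-private FieldInfos is visible

    import java.io.IOException;
    import org.apache.lucene.index.codecs.Codec;
    import org.apache.lucene.index.codecs.CodecProvider;
    import org.apache.lucene.store.Directory;

    class CodecPlumbingSketch {

      // Reading: the segments file is now decoded against an explicit CodecProvider
      // (the old read(Directory) simply forwards CodecProvider.getDefault()).
      SegmentInfos readInfos(Directory dir, CodecProvider codecs) throws IOException {
        SegmentInfos infos = new SegmentInfos();
        infos.read(dir, codecs);
        return infos;
      }

      // Writing: a SegmentWriteState describes the segment about to be written,
      // and the provider decides which Codec produces its postings.
      Codec pickWriter(Directory dir, FieldInfos fieldInfos, CodecProvider codecs) {
        SegmentWriteState state = new SegmentWriteState(null, dir, "_1", fieldInfos,
            null, 10, 0, 128, codecs);   // placeholder doc counts and interval
        return codecs.getWriter(state);
      }
    }
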
+ * @lucene.experimental + */ +public class SegmentWriteState { + public final PrintStream infoStream; + public final Directory directory; + public final String segmentName; + public final FieldInfos fieldInfos; + public final String docStoreSegmentName; + public final int numDocs; + public int numDocsInStore; + public final Collection flushedFiles; - public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs, - int numDocsInStore, int termIndexInterval) { - this.docWriter = docWriter; + // Actual codec used + final Codec codec; + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + public final int termIndexInterval; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + public final int skipInterval = 16; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. + */ + public final int maxSkipLevels = 10; + + public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName, FieldInfos fieldInfos, + String docStoreSegmentName, int numDocs, + int numDocsInStore, int termIndexInterval, + CodecProvider codecs) { + this.infoStream = infoStream; this.directory = directory; this.segmentName = segmentName; + this.fieldInfos = fieldInfos; this.docStoreSegmentName = docStoreSegmentName; this.numDocs = numDocs; this.numDocsInStore = numDocsInStore; this.termIndexInterval = termIndexInterval; + this.codec = codecs.getWriter(this); flushedFiles = new HashSet(); } - - public String segmentFileName(String ext) { - return segmentName + "." 
+ ext; - } } Index: src/java/org/apache/lucene/index/StoredFieldsWriter.java =================================================================== --- src/java/org/apache/lucene/index/StoredFieldsWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/StoredFieldsWriter.java (working copy) @@ -90,8 +90,8 @@ state.flushedFiles.add(fieldsName); state.flushedFiles.add(fieldsIdxName); - state.docWriter.removeOpenFile(fieldsName); - state.docWriter.removeOpenFile(fieldsIdxName); + docWriter.removeOpenFile(fieldsName); + docWriter.removeOpenFile(fieldsIdxName); if (4+((long) state.numDocsInStore)*8 != state.directory.fileLength(fieldsIdxName)) throw new RuntimeException("after flush: fdx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(fieldsIdxName) + " length in bytes of " + fieldsIdxName + " file exists?=" + state.directory.fileExists(fieldsIdxName)); Index: src/java/org/apache/lucene/index/Term.java =================================================================== --- src/java/org/apache/lucene/index/Term.java (revision 931099) +++ src/java/org/apache/lucene/index/Term.java (working copy) @@ -1,7 +1,5 @@ package org.apache.lucene.index; -import org.apache.lucene.util.StringHelper; - /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -19,6 +17,8 @@ * limitations under the License. */ +import org.apache.lucene.util.StringHelper; + /** A Term represents a word from text. This is the unit of search. It is composed of two elements, the text of the word, as a string, and the name of @@ -35,7 +35,7 @@ *

    Note that a null field or null text value results in undefined * behavior for most Lucene APIs that accept a Term parameter. */ public Term(String fld, String txt) { - field = StringHelper.intern(fld); + field = fld == null ? null : StringHelper.intern(fld); text = txt; } @@ -49,7 +49,8 @@ this(fld, "", true); } - Term(String fld, String txt, boolean intern) { + /** @lucene.experimental */ + public Term(String fld, String txt, boolean intern) { field = intern ? StringHelper.intern(fld) : fld; // field names are interned text = txt; // unless already known to be } Index: src/java/org/apache/lucene/index/TermBuffer.java =================================================================== --- src/java/org/apache/lucene/index/TermBuffer.java (revision 931099) +++ src/java/org/apache/lucene/index/TermBuffer.java (working copy) @@ -1,140 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.UnicodeUtil; - -final class TermBuffer implements Cloneable { - - private String field; - private Term term; // cached - private boolean preUTF8Strings; // true if strings are stored in modified UTF8 encoding (LUCENE-510) - private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes) - - private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); - private UnicodeUtil.UTF8Result bytes = new UnicodeUtil.UTF8Result(); - - public final int compareTo(TermBuffer other) { - if (field == other.field) // fields are interned - return compareChars(text.result, text.length, other.text.result, other.text.length); - else - return field.compareTo(other.field); - } - - private static final int compareChars(char[] chars1, int len1, - char[] chars2, int len2) { - final int end = len1 < len2 ? len1:len2; - for (int k = 0; k < end; k++) { - char c1 = chars1[k]; - char c2 = chars2[k]; - if (c1 != c2) { - return c1 - c2; - } - } - return len1 - len2; - } - - /** Call this if the IndexInput passed to {@link #read} - * stores terms in the "modified UTF8" (pre LUCENE-510) - * format. 
*/ - void setPreUTF8Strings() { - preUTF8Strings = true; - } - - public final void read(IndexInput input, FieldInfos fieldInfos) - throws IOException { - this.term = null; // invalidate cache - int start = input.readVInt(); - int length = input.readVInt(); - int totalLength = start + length; - if (preUTF8Strings) { - text.setLength(totalLength); - input.readChars(text.result, start, length); - } else { - - if (dirty) { - // Fully convert all bytes since bytes is dirty - UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); - bytes.setLength(totalLength); - input.readBytes(bytes.result, start, length); - UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text); - dirty = false; - } else { - // Incrementally convert only the UTF8 bytes that are new: - bytes.setLength(totalLength); - input.readBytes(bytes.result, start, length); - UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text); - } - } - this.field = fieldInfos.fieldName(input.readVInt()); - } - - public final void set(Term term) { - if (term == null) { - reset(); - return; - } - final String termText = term.text(); - final int termLen = termText.length(); - text.setLength(termLen); - termText.getChars(0, termLen, text.result, 0); - dirty = true; - field = term.field(); - this.term = term; - } - - public final void set(TermBuffer other) { - text.copyText(other.text); - dirty = true; - field = other.field; - term = other.term; - } - - public void reset() { - field = null; - text.setLength(0); - term = null; - dirty = true; - } - - public Term toTerm() { - if (field == null) // unset - return null; - - if (term == null) - term = new Term(field, new String(text.result, 0, text.length), false); - - return term; - } - - @Override - protected Object clone() { - TermBuffer clone = null; - try { - clone = (TermBuffer)super.clone(); - } catch (CloneNotSupportedException e) {} - - clone.dirty = true; - clone.bytes = new UnicodeUtil.UTF8Result(); - clone.text = new UnicodeUtil.UTF16Result(); - clone.text.copyText(text); - return clone; - } -} Index: src/java/org/apache/lucene/index/TermDocs.java =================================================================== --- src/java/org/apache/lucene/index/TermDocs.java (revision 931099) +++ src/java/org/apache/lucene/index/TermDocs.java (working copy) @@ -27,8 +27,10 @@ ordered by document number. @see IndexReader#termDocs() - */ + @deprecated Use {@link DocsEnum} instead +*/ +@Deprecated public interface TermDocs extends Closeable { /** Sets this to the data for a term. * The enumeration is reset to the start of the data for this term. Index: src/java/org/apache/lucene/index/TermEnum.java =================================================================== --- src/java/org/apache/lucene/index/TermEnum.java (revision 931099) +++ src/java/org/apache/lucene/index/TermEnum.java (working copy) @@ -23,8 +23,10 @@ /** Abstract class for enumerating terms.

    Term enumerations are always ordered by Term.compareTo(). Each term in - the enumeration is greater than all that precede it. */ + the enumeration is greater than all that precede it. +* @deprecated Use TermsEnum instead */ +@Deprecated public abstract class TermEnum implements Closeable { /** Increments the enumeration to the next element. True if one exists.*/ public abstract boolean next() throws IOException; Index: src/java/org/apache/lucene/index/TermInfo.java =================================================================== --- src/java/org/apache/lucene/index/TermInfo.java (revision 931099) +++ src/java/org/apache/lucene/index/TermInfo.java (working copy) @@ -1,59 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** A TermInfo is the record of information stored for a term.*/ - -class TermInfo { - /** The number of documents which contain the term. */ - int docFreq = 0; - - long freqPointer = 0; - long proxPointer = 0; - int skipOffset; - - TermInfo() {} - - TermInfo(int df, long fp, long pp) { - docFreq = df; - freqPointer = fp; - proxPointer = pp; - } - - TermInfo(TermInfo ti) { - docFreq = ti.docFreq; - freqPointer = ti.freqPointer; - proxPointer = ti.proxPointer; - skipOffset = ti.skipOffset; - } - - final void set(int docFreq, - long freqPointer, long proxPointer, int skipOffset) { - this.docFreq = docFreq; - this.freqPointer = freqPointer; - this.proxPointer = proxPointer; - this.skipOffset = skipOffset; - } - - final void set(TermInfo ti) { - docFreq = ti.docFreq; - freqPointer = ti.freqPointer; - proxPointer = ti.proxPointer; - skipOffset = ti.skipOffset; - } -} Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 931099) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -1,317 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.cache.Cache; -import org.apache.lucene.util.cache.DoubleBarrelLRUCache; -import org.apache.lucene.util.CloseableThreadLocal; - -/** This stores a monotonically increasing set of pairs in a - * Directory. Pairs are accessed either by Term or by ordinal position the - * set. */ - -final class TermInfosReader { - private final Directory directory; - private final String segment; - private final FieldInfos fieldInfos; - - private final CloseableThreadLocal threadResources = new CloseableThreadLocal(); - private final SegmentTermEnum origEnum; - private final long size; - - private final Term[] indexTerms; - private final TermInfo[] indexInfos; - private final long[] indexPointers; - - private final int totalIndexInterval; - - private final static int DEFAULT_CACHE_SIZE = 1024; - - // Just adds term's ord to TermInfo - private final static class TermInfoAndOrd extends TermInfo { - final int termOrd; - public TermInfoAndOrd(TermInfo ti, int termOrd) { - super(ti); - this.termOrd = termOrd; - } - } - - private final Cache termsCache = new DoubleBarrelLRUCache(DEFAULT_CACHE_SIZE); - - /** - * Per-thread resources managed by ThreadLocal - */ - private static final class ThreadResources { - SegmentTermEnum termEnum; - } - - TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize, int indexDivisor) - throws CorruptIndexException, IOException { - boolean success = false; - - if (indexDivisor < 1 && indexDivisor != -1) { - throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor); - } - - try { - directory = dir; - segment = seg; - fieldInfos = fis; - - origEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION), - readBufferSize), fieldInfos, false); - size = origEnum.size; - - - if (indexDivisor != -1) { - // Load terms index - totalIndexInterval = origEnum.indexInterval * indexDivisor; - final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION), - readBufferSize), fieldInfos, true); - - try { - int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index - - indexTerms = new Term[indexSize]; - indexInfos = new TermInfo[indexSize]; - indexPointers = new long[indexSize]; - - for (int i = 0; indexEnum.next(); i++) { - indexTerms[i] = indexEnum.term(); - indexInfos[i] = indexEnum.termInfo(); - indexPointers[i] = indexEnum.indexPointer; - - for (int j = 1; j < indexDivisor; j++) - if (!indexEnum.next()) - break; - } - } finally { - indexEnum.close(); - } - } else { - // Do not load terms index: - totalIndexInterval = -1; - indexTerms = null; - indexInfos = null; - indexPointers = null; - } - success = true; - } finally { - // With lock-less commits, it's entirely possible (and - // fine) to hit a FileNotFound exception above. In - // this case, we want to explicitly close any subset - // of things that were opened so that we don't have to - // wait for a GC to do so. 
- if (!success) { - close(); - } - } - } - - public int getSkipInterval() { - return origEnum.skipInterval; - } - - public int getMaxSkipLevels() { - return origEnum.maxSkipLevels; - } - - final void close() throws IOException { - if (origEnum != null) - origEnum.close(); - threadResources.close(); - termsCache.close(); - } - - /** Returns the number of term/value pairs in the set. */ - final long size() { - return size; - } - - private ThreadResources getThreadResources() { - ThreadResources resources = threadResources.get(); - if (resources == null) { - resources = new ThreadResources(); - resources.termEnum = terms(); - threadResources.set(resources); - } - return resources; - } - - - /** Returns the offset of the greatest index entry which is less than or equal to term.*/ - private final int getIndexOffset(Term term) { - int lo = 0; // binary search indexTerms[] - int hi = indexTerms.length - 1; - - while (hi >= lo) { - int mid = (lo + hi) >>> 1; - int delta = term.compareTo(indexTerms[mid]); - if (delta < 0) - hi = mid - 1; - else if (delta > 0) - lo = mid + 1; - else - return mid; - } - return hi; - } - - private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { - enumerator.seek(indexPointers[indexOffset], - ((long) indexOffset * totalIndexInterval) - 1, - indexTerms[indexOffset], indexInfos[indexOffset]); - } - - /** Returns the TermInfo for a Term in the set, or null. */ - TermInfo get(Term term) throws IOException { - return get(term, false); - } - - /** Returns the TermInfo for a Term in the set, or null. */ - private TermInfo get(Term term, boolean mustSeekEnum) throws IOException { - if (size == 0) return null; - - ensureIndexIsRead(); - - TermInfoAndOrd tiOrd = termsCache.get(term); - ThreadResources resources = getThreadResources(); - - if (!mustSeekEnum && tiOrd != null) { - return tiOrd; - } - - // optimize sequential access: first try scanning cached enum w/o seeking - SegmentTermEnum enumerator = resources.termEnum; - if (enumerator.term() != null // term is at or past current - && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) - || term.compareTo(enumerator.term()) >= 0)) { - int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; - if (indexTerms.length == enumOffset // but before end of block - || term.compareTo(indexTerms[enumOffset]) < 0) { - // no need to seek - - final TermInfo ti; - - int numScans = enumerator.scanTo(term); - if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { - ti = enumerator.termInfo(); - if (numScans > 1) { - // we only want to put this TermInfo into the cache if - // scanEnum skipped more than one dictionary entry. 
- // This prevents RangeQueries or WildcardQueries to - // wipe out the cache when they iterate over a large numbers - // of terms in order - if (tiOrd == null) { - termsCache.put(term, new TermInfoAndOrd(ti, (int) enumerator.position)); - } else { - assert sameTermInfo(ti, tiOrd, enumerator); - assert (int) enumerator.position == tiOrd.termOrd; - } - } - } else { - ti = null; - } - - return ti; - } - } - - // random-access: must seek - final int indexPos; - if (tiOrd != null) { - indexPos = tiOrd.termOrd / totalIndexInterval; - } else { - // Must do binary search: - indexPos = getIndexOffset(term); - } - - seekEnum(enumerator, indexPos); - enumerator.scanTo(term); - final TermInfo ti; - if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { - ti = enumerator.termInfo(); - if (tiOrd == null) { - termsCache.put(term, new TermInfoAndOrd(ti, (int) enumerator.position)); - } else { - assert sameTermInfo(ti, tiOrd, enumerator); - assert (int) enumerator.position == tiOrd.termOrd; - } - } else { - ti = null; - } - return ti; - } - - // called only from asserts - private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) { - if (ti1.docFreq != ti2.docFreq) { - return false; - } - if (ti1.freqPointer != ti2.freqPointer) { - return false; - } - if (ti1.proxPointer != ti2.proxPointer) { - return false; - } - // skipOffset is only valid when docFreq >= skipInterval: - if (ti1.docFreq >= enumerator.skipInterval && - ti1.skipOffset != ti2.skipOffset) { - return false; - } - return true; - } - - private void ensureIndexIsRead() { - if (indexTerms == null) { - throw new IllegalStateException("terms index was not loaded when this reader was created"); - } - } - - /** Returns the position of a Term in the set or -1. */ - final long getPosition(Term term) throws IOException { - if (size == 0) return -1; - - ensureIndexIsRead(); - int indexOffset = getIndexOffset(term); - - SegmentTermEnum enumerator = getThreadResources().termEnum; - seekEnum(enumerator, indexOffset); - - while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} - - if (term.compareTo(enumerator.term()) == 0) - return enumerator.position; - else - return -1; - } - - /** Returns an enumeration of all the Terms and TermInfos in the set. */ - public SegmentTermEnum terms() { - return (SegmentTermEnum)origEnum.clone(); - } - - /** Returns an enumeration of terms starting at or after the named term. */ - public SegmentTermEnum terms(Term term) throws IOException { - get(term, true); - return (SegmentTermEnum)getThreadResources().termEnum.clone(); - } -} Index: src/java/org/apache/lucene/index/TermInfosWriter.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/TermInfosWriter.java (working copy) @@ -1,228 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import java.io.IOException; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.ArrayUtil; - - -/** This stores a monotonically increasing set of pairs in a - Directory. A TermInfos can be written once, in order. */ - -final class TermInfosWriter { - /** The file format version, a negative number. */ - public static final int FORMAT = -3; - - // Changed strings to true utf8 with length-in-bytes not - // length-in-chars - public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; - - // NOTE: always change this if you switch to a new format! - public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; - - private FieldInfos fieldInfos; - private IndexOutput output; - private TermInfo lastTi = new TermInfo(); - private long size; - - // TODO: the default values for these two parameters should be settable from - // IndexWriter. However, once that's done, folks will start setting them to - // ridiculous values and complaining that things don't work well, as with - // mergeFactor. So, let's wait until a number of folks find that alternate - // values work better. Note that both of these values are stored in the - // segment, so that it's safe to change these w/o rebuilding all indexes. - - /** Expert: The fraction of terms in the "dictionary" which should be stored - * in RAM. Smaller values use more memory, but make searching slightly - * faster, while larger values use less memory and make searching slightly - * slower. Searching is typically not dominated by dictionary lookup, so - * tweaking this is rarely useful.*/ - int indexInterval = 128; - - /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, - * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in - * smaller indexes, greater acceleration, but fewer accelerable cases, while - * smaller values result in bigger indexes, less acceleration and more - * accelerable cases. More detailed experiments would be useful here. */ - int skipInterval = 16; - - /** Expert: The maximum number of skip levels. Smaller values result in - * slightly smaller indexes, but slower skipping in big posting lists. 
- */ - int maxSkipLevels = 10; - - private long lastIndexPointer; - private boolean isIndex; - private byte[] lastTermBytes = new byte[10]; - private int lastTermBytesLength = 0; - private int lastFieldNumber = -1; - - private TermInfosWriter other; - private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); - - TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval) - throws IOException { - initialize(directory, segment, fis, interval, false); - other = new TermInfosWriter(directory, segment, fis, interval, true); - other.other = this; - } - - private TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval, boolean isIndex) throws IOException { - initialize(directory, segment, fis, interval, isIndex); - } - - private void initialize(Directory directory, String segment, FieldInfos fis, - int interval, boolean isi) throws IOException { - indexInterval = interval; - fieldInfos = fis; - isIndex = isi; - output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); - output.writeInt(FORMAT_CURRENT); // write format - output.writeLong(0); // leave space for size - output.writeInt(indexInterval); // write indexInterval - output.writeInt(skipInterval); // write skipInterval - output.writeInt(maxSkipLevels); // write maxSkipLevels - assert initUTF16Results(); - } - - void add(Term term, TermInfo ti) throws IOException { - UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result); - add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti); - } - - // Currently used only by assert statements - UnicodeUtil.UTF16Result utf16Result1; - UnicodeUtil.UTF16Result utf16Result2; - - // Currently used only by assert statements - private boolean initUTF16Results() { - utf16Result1 = new UnicodeUtil.UTF16Result(); - utf16Result2 = new UnicodeUtil.UTF16Result(); - return true; - } - - // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { - - if (lastFieldNumber != fieldNumber) { - final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); - // If there is a field named "" (empty string) then we - // will get 0 on this comparison, yet, it's "OK". But - // it's not OK if two different field numbers map to - // the same name. - if (cmp != 0 || lastFieldNumber != -1) - return cmp; - } - - UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); - UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); - final int len; - if (utf16Result1.length < utf16Result2.length) - len = utf16Result1.length; - else - len = utf16Result2.length; - - for(int i=0;i, TermInfo> pair to the set. - Term must be lexicographically greater than all previous Terms added. 
- TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) - throws IOException { - - assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || - (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : - "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + - " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); - - assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; - assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; - - if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term - - writeTerm(fieldNumber, termBytes, termBytesLength); // write term - - output.writeVInt(ti.docFreq); // write doc freq - output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers - output.writeVLong(ti.proxPointer - lastTi.proxPointer); - - if (ti.docFreq >= skipInterval) { - output.writeVInt(ti.skipOffset); - } - - if (isIndex) { - output.writeVLong(other.output.getFilePointer() - lastIndexPointer); - lastIndexPointer = other.output.getFilePointer(); // write pointer - } - - lastFieldNumber = fieldNumber; - lastTi.set(ti); - size++; - } - - private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) - throws IOException { - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute prefix in common with last term: - int start = 0; - final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; - while(start < limit) { - if (termBytes[start] != lastTermBytes[start]) - break; - start++; - } - - final int length = termBytesLength - start; - output.writeVInt(start); // write shared prefix length - output.writeVInt(length); // write delta length - output.writeBytes(termBytes, start, length); // write delta bytes - output.writeVInt(fieldNumber); // write field num - if (lastTermBytes.length < termBytesLength) { - lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); - } - System.arraycopy(termBytes, start, lastTermBytes, start, length); - lastTermBytesLength = termBytesLength; - } - - /** Called to complete TermInfos creation. */ - void close() throws IOException { - output.seek(4); // write size after format - output.writeLong(size); - output.close(); - - if (!isIndex) - other.close(); - } - -} Index: src/java/org/apache/lucene/index/TermPositions.java =================================================================== --- src/java/org/apache/lucene/index/TermPositions.java (revision 931099) +++ src/java/org/apache/lucene/index/TermPositions.java (working copy) @@ -26,8 +26,9 @@ * positions of each occurrence of a term in a document. 
* * @see IndexReader#termPositions() + * @deprecated Use {@link DocsAndPositionsEnum} instead */ - +@Deprecated public interface TermPositions extends TermDocs { Index: src/java/org/apache/lucene/index/TermsHashConsumerPerField.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (revision 931099) +++ src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (working copy) @@ -34,8 +34,6 @@ abstract void newTerm(int termID) throws IOException; abstract void addTerm(int termID) throws IOException; abstract int getStreamCount(); - - abstract ParallelPostingsArray createPostingsArray(int size); - abstract int bytesPerPosting(); + abstract ParallelPostingsArray createPostingsArray(int size); } Index: src/java/org/apache/lucene/index/TermsHashPerField.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashPerField.java (revision 931099) +++ src/java/org/apache/lucene/index/TermsHashPerField.java (working copy) @@ -19,10 +19,13 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; final class TermsHashPerField extends InvertedDocConsumerPerField { @@ -32,12 +35,12 @@ final TermsHashPerThread perThread; final DocumentsWriter.DocState docState; final FieldInvertState fieldState; - TermAttribute termAtt; - + TermToBytesRefAttribute termAtt; + // Copied from our perThread - final CharBlockPool charPool; final IntBlockPool intPool; final ByteBlockPool bytePool; + final ByteBlockPool termBytePool; final int streamCount; final int numPostingInt; @@ -52,43 +55,42 @@ private int[] postingsHash; ParallelPostingsArray postingsArray; - - private final int bytesPerPosting; - + private final BytesRef utf8; + private Comparator termComp; + public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) { this.perThread = perThread; intPool = perThread.intPool; - charPool = perThread.charPool; bytePool = perThread.bytePool; + termBytePool = perThread.termBytePool; docState = perThread.docState; + postingsHash = new int[postingsHashSize]; Arrays.fill(postingsHash, -1); + bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT); + fieldState = docInverterPerField.fieldState; this.consumer = perThread.consumer.addField(this, fieldInfo); + postingsArray = consumer.createPostingsArray(postingsHashSize/2); + bytesUsed(postingsArray.size * postingsArray.bytesPerPosting()); + streamCount = consumer.getStreamCount(); numPostingInt = 2*streamCount; + utf8 = perThread.utf8; this.fieldInfo = fieldInfo; if (nextPerThread != null) nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo); else nextPerField = null; - - // +3: Posting is referenced by hash, which - // targets 25-50% fill factor; approximate this - // as 3X # pointers - bytesPerPosting = consumer.bytesPerPosting() + 3*DocumentsWriter.INT_NUM_BYTE; } - - void initPostingsArray() { - assert postingsArray == null; - postingsArray = consumer.createPostingsArray(postingsHashSize); - + // sugar: just forwards to DW + private 
void bytesUsed(long size) { if (perThread.termsHash.trackAllocations) { - perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * postingsHashSize); + perThread.termsHash.docWriter.bytesUsed(size); } } - + void shrinkHash(int targetSize) { assert postingsCompacted || numPostings == 0; @@ -100,13 +102,20 @@ } if (newSize != postingsHash.length) { + final long previousSize = postingsHash.length; postingsHash = new int[newSize]; + bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT); Arrays.fill(postingsHash, -1); - postingsArray = null; postingsHashSize = newSize; postingsHashHalfSize = newSize/2; postingsHashMask = newSize-1; } + + if (postingsArray != null) { + final int startSize = postingsArray.size; + postingsArray = postingsArray.shrink(targetSize, false); + bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - startSize)); + } } public void reset() { @@ -129,14 +138,10 @@ nextPerField.abort(); } - private void growParallelPostingsArray() { - int oldSize = postingsArray.byteStarts.length; - int newSize = (int) (oldSize * 1.5); - this.postingsArray = this.postingsArray.resize(newSize); - - if (perThread.termsHash.trackAllocations) { - perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * (newSize - oldSize)); - } + private final void growParallelPostingsArray() { + int oldSize = postingsArray.size; + this.postingsArray = this.postingsArray.grow(); + bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)); } public void initReader(ByteSliceReader reader, int termID, int stream) { @@ -166,7 +171,8 @@ } /** Collapse the hash table & sort in-place. */ - public int[] sortPostings() { + public int[] sortPostings(Comparator termComp) { + this.termComp = termComp; compactPostings(); quickSort(postingsHash, 0, numPostings-1); return postingsHash; @@ -237,50 +243,48 @@ * returns -1 if p1 < p2; 1 if p1 > p2; else 0. */ int comparePostings(int term1, int term2) { - if (term1 == term2) + if (term1 == term2) { + // Our quicksort does this, eg during partition return 0; + } - final int textStart1 = postingsArray.textStarts[term1]; - final int textStart2 = postingsArray.textStarts[term2]; - - final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK; - final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK; + termBytePool.setBytesRef(perThread.tr1, postingsArray.textStarts[term1]); + termBytePool.setBytesRef(perThread.tr2, postingsArray.textStarts[term2]); - assert text1 != text2 || pos1 != pos2; - - while(true) { - final char c1 = text1[pos1++]; - final char c2 = text2[pos2++]; - if (c1 != c2) { - if (0xffff == c2) - return 1; - else if (0xffff == c1) - return -1; - else - return c1-c2; - } else - // This method should never compare equal postings - // unless p1==p2 - assert c1 != 0xffff; - } + return termComp.compare(perThread.tr1, perThread.tr2); } /** Test whether the text for current RawPostingList p equals - * current tokenText. */ - private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) { + * current tokenText in utf8. 
*/ + private boolean postingEquals(final int termID) { final int textStart = postingsArray.textStarts[termID]; - - final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; + final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT]; assert text != null; - int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK; - int tokenPos = 0; - for(;tokenPos= postingsArray.textStarts.length) { + if (termID >= postingsArray.size) { growParallelPostingsArray(); } - if (perThread.termsHash.trackAllocations) { - perThread.termsHash.docWriter.bytesUsed(bytesPerPosting); - } assert termID >= 0; @@ -392,48 +397,15 @@ // We are first in the chain so we must "intern" the // term text into textStart address - // Get the text of this term. - final char[] tokenText = termAtt.termBuffer(); - final int tokenTextLen = termAtt.termLength(); + // Get the text & hash of this term. + int code = termAtt.toBytesRef(utf8); - // Compute hashcode & replace any invalid UTF16 sequences - int downto = tokenTextLen; - int code = 0; - while (downto > 0) { - char ch = tokenText[--downto]; - - if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) { - if (0 == downto) { - // Unpaired - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - } else { - final char ch2 = tokenText[downto-1]; - if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) { - // OK: high followed by low. This is a valid - // surrogate pair. - code = ((code*31) + ch)*31+ch2; - downto--; - continue; - } else { - // Unpaired - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - } - } - } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || - ch == 0xffff)) { - // Unpaired or 0xffff - ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; - } - - code = (code*31) + ch; - } - int hashPos = code & postingsHashMask; // Locate RawPostingList in hash int termID = postingsHash[hashPos]; - if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) { + if (termID != -1 && !postingEquals(termID)) { // Conflict: keep searching different locations in // the hash table. final int inc = ((code>>8)+code)|1; @@ -441,61 +413,86 @@ code += inc; hashPos = code & postingsHashMask; termID = postingsHash[hashPos]; - } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)); + } while (termID != -1 && !postingEquals(termID)); } if (termID == -1) { // First time we are seeing this token since we last // flushed the hash. - final int textLen1 = 1+tokenTextLen; - if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) { - if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) { + final int textLen2 = 2+utf8.length; + if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) { + // Not enough room in current block + + if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) { // Just skip this term, to remain as robust as // possible during indexing. A TokenFilter // can be inserted into the analyzer chain if // other behavior is wanted (pruning the term // to a prefix, throwing an exception, etc). 
+ if (docState.maxTermPrefix == null) { + final int saved = utf8.length; + try { + utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8); + docState.maxTermPrefix = utf8.toString(); + } finally { + utf8.length = saved; + } + } - if (docState.maxTermPrefix == null) - docState.maxTermPrefix = new String(tokenText, 0, 30); - consumer.skippingLongTerm(); return; } - charPool.nextBuffer(); + bytePool.nextBuffer(); } // New posting termID = numPostings++; - if (termID >= postingsArray.textStarts.length) { + if (termID >= postingsArray.size) { growParallelPostingsArray(); } - if (perThread.termsHash.trackAllocations) { - perThread.termsHash.docWriter.bytesUsed(bytesPerPosting); - } assert termID != -1; + assert postingsHash[hashPos] == -1; - final char[] text = charPool.buffer; - final int textUpto = charPool.charUpto; - postingsArray.textStarts[termID] = textUpto + charPool.charOffset; - charPool.charUpto += textLen1; - System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen); - text[textUpto+tokenTextLen] = 0xffff; - - assert postingsHash[hashPos] == -1; postingsHash[hashPos] = termID; - if (numPostings == postingsHashHalfSize) + final byte[] text = bytePool.buffer; + final int textUpto = bytePool.byteUpto; + postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset; + + // We first encode the length, followed by the UTF8 + // bytes. Length is encoded as vInt, but will consume + // 1 or 2 bytes at most (we reject too-long terms, + // above). + + // encode length @ start of bytes + if (utf8.length < 128) { + // 1 byte to store length + text[textUpto] = (byte) utf8.length; + bytePool.byteUpto += utf8.length + 1; + System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length); + } else { + // 2 byte to store length + text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f)); + text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff); + bytePool.byteUpto += utf8.length + 2; + System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length); + } + + if (numPostings == postingsHashHalfSize) { rehashPostings(2*postingsHashSize); + bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT); + } // Init stream slices - if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) + if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) { intPool.nextBuffer(); + } - if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) + if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) { bytePool.nextBuffer(); + } intUptos = intPool.buffer; intUptoStart = intPool.intUpto; @@ -577,16 +574,28 @@ int code; if (perThread.primary) { final int textStart = postingsArray.textStarts[termID]; - final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK; - final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos = start; - while(text[pos] != 0xffff) - pos++; + final int start = textStart & DocumentsWriter.BYTE_BLOCK_MASK; + final byte[] text = bytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT]; code = 0; - while (pos > start) - code = (code*31) + text[--pos]; - } else + + final int len; + int pos; + if ((text[start] & 0x80) == 0) { + // length is 1 byte + len = text[start]; + pos = start+1; + } else { + len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7); + pos = start+2; + } + + final int endPos = pos+len; + while(pos < endPos) { + code = (code*31) + text[pos++]; + } + } else { code = postingsArray.textStarts[termID]; + } int hashPos = 
code & newMask; assert hashPos >= 0; @@ -603,6 +612,7 @@ postingsHashMask = newMask; postingsHash = newHash; + postingsHashSize = newSize; postingsHashHalfSize = newSize >> 1; } Index: src/java/org/apache/lucene/index/TermsHashPerThread.java =================================================================== --- src/java/org/apache/lucene/index/TermsHashPerThread.java (revision 931099) +++ src/java/org/apache/lucene/index/TermsHashPerThread.java (working copy) @@ -17,6 +17,11 @@ * limitations under the License. */ +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; + import java.io.IOException; final class TermsHashPerThread extends InvertedDocConsumerPerThread { @@ -25,30 +30,54 @@ final TermsHashConsumerPerThread consumer; final TermsHashPerThread nextPerThread; - final CharBlockPool charPool; final IntBlockPool intPool; final ByteBlockPool bytePool; + final ByteBlockPool termBytePool; final boolean primary; final DocumentsWriter.DocState docState; + // Used when comparing postings via termRefComp, in TermsHashPerField + final BytesRef tr1 = new BytesRef(); + final BytesRef tr2 = new BytesRef(); + + // Used by perField: + final BytesRef utf8 = new BytesRef(10); + + final LegacyTermAttributeWrapper legacyTermAttributeWrapper = new LegacyTermAttributeWrapper(); + + /** This class is used to wrap a legacy TermAttribute without support for {@link TermToBytesRefAttribute}. */ + @Deprecated + static class LegacyTermAttributeWrapper implements TermToBytesRefAttribute { + private TermAttribute termAtt = null; + + void setTermAttribute(TermAttribute termAtt) { + this.termAtt = termAtt; + } + + public int toBytesRef(BytesRef target) { + assert target.bytes != null : "target byteref must be != null, because utf8 is used here"; + return UnicodeUtil.UTF16toUTF8WithHash(termAtt.termBuffer(), 0, termAtt.termLength(), target); + } + } + public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) { docState = docInverterPerThread.docState; this.termsHash = termsHash; this.consumer = termsHash.consumer.addThread(this); + intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations); + bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations); + if (nextTermsHash != null) { // We are primary - charPool = new CharBlockPool(termsHash.docWriter); primary = true; + termBytePool = bytePool; } else { - charPool = primaryPerThread.charPool; primary = false; + termBytePool = primaryPerThread.bytePool; } - intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations); - bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations); - if (nextTermsHash != null) nextPerThread = nextTermsHash.addThread(docInverterPerThread, this); else @@ -97,7 +126,8 @@ intPool.reset(); bytePool.reset(); - if (primary) - charPool.reset(); + if (primary) { + bytePool.reset(); + } } } Index: src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java =================================================================== --- src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 931099) +++ src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy) @@ -22,7 +22,7 @@ import 
org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Fieldable; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.BytesRef; final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { @@ -106,6 +106,8 @@ final int numPostings = termsHashPerField.numPostings; + final BytesRef flushTerm = perThread.flushTerm; + assert numPostings >= 0; if (!doVectors || numPostings == 0) @@ -126,7 +128,9 @@ perThread.doc.addField(termsHashPerField.fieldInfo.number); TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray; - final int[] termIDs = termsHashPerField.sortPostings(); + // TODO: we may want to make this sort in same order + // as Codec's terms dict? + final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator()); tvf.writeVInt(numPostings); byte bits = 0x0; @@ -136,46 +140,40 @@ bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; tvf.writeByte(bits); - int encoderUpto = 0; - int lastTermBytesCount = 0; - + int lastLen = 0; + byte[] lastBytes = null; + int lastStart = 0; + final ByteSliceReader reader = perThread.vectorSliceReader; - final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers; + final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool; + for(int j=0;j> DocumentsWriter.CHAR_BLOCK_SHIFT]; - final int start2 = postings.textStarts[termID] & DocumentsWriter.CHAR_BLOCK_MASK; + // Get BytesRef + termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]); - // We swap between two encoders to save copying - // last Term's byte array - final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto]; - - // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); - final int termBytesCount = utf8Result.length; - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute common prefix between last term and + // Compute common byte prefix between last term and // this term int prefix = 0; if (j > 0) { - final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result; - final byte[] termBytes = perThread.utf8Results[encoderUpto].result; - while(prefix < lastTermBytesCount && prefix < termBytesCount) { - if (lastTermBytes[prefix] != termBytes[prefix]) + while(prefix < lastLen && prefix < flushTerm.length) { + if (lastBytes[lastStart+prefix] != flushTerm.bytes[flushTerm.offset+prefix]) { break; + } prefix++; } } - encoderUpto = 1-encoderUpto; - lastTermBytesCount = termBytesCount; - final int suffix = termBytesCount - prefix; + lastLen = flushTerm.length; + lastBytes = flushTerm.bytes; + lastStart = flushTerm.offset; + + final int suffix = flushTerm.length - prefix; tvf.writeVInt(prefix); tvf.writeVInt(suffix); - tvf.writeBytes(utf8Result.result, prefix, suffix); + tvf.writeBytes(flushTerm.bytes, lastStart+prefix, suffix); tvf.writeVInt(freq); if (doVectorPositions) { @@ -209,9 +207,7 @@ @Override void newTerm(final int termID) { - assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start"); - TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray; postings.freqs[termID] = 1; @@ -275,23 +271,25 @@ int[] lastOffsets; // Last offset we saw int[] lastPositions; // Last position where this term occurred + ParallelPostingsArray newInstance(int size) { + return new TermVectorsPostingsArray(size); + } + @Override - ParallelPostingsArray resize(int 
newSize) { - TermVectorsPostingsArray newArray = new TermVectorsPostingsArray(newSize); - copy(this, newArray); - return newArray; + void copyTo(ParallelPostingsArray toArray, int numToCopy) { + assert toArray instanceof TermVectorsPostingsArray; + TermVectorsPostingsArray to = (TermVectorsPostingsArray) toArray; + + super.copyTo(toArray, numToCopy); + + System.arraycopy(freqs, 0, to.freqs, 0, size); + System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, size); + System.arraycopy(lastPositions, 0, to.lastPositions, 0, size); } - - void copy(TermVectorsPostingsArray fromArray, TermVectorsPostingsArray toArray) { - super.copy(fromArray, toArray); - System.arraycopy(fromArray.freqs, 0, toArray.freqs, 0, fromArray.freqs.length); - System.arraycopy(fromArray.lastOffsets, 0, toArray.lastOffsets, 0, fromArray.lastOffsets.length); - System.arraycopy(fromArray.lastPositions, 0, toArray.lastPositions, 0, fromArray.lastPositions.length); + + @Override + int bytesPerPosting() { + return super.bytesPerPosting() + 3 * DocumentsWriter.INT_NUM_BYTE; } } - - @Override - int bytesPerPosting() { - return ParallelPostingsArray.BYTES_PER_POSTING + 3 * DocumentsWriter.INT_NUM_BYTE; - } } Index: src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java =================================================================== --- src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java (revision 931099) +++ src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java (working copy) @@ -17,13 +17,14 @@ * limitations under the License. */ -import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.BytesRef; final class TermVectorsTermsWriterPerThread extends TermsHashConsumerPerThread { final TermVectorsTermsWriter termsWriter; final TermsHashPerThread termsHashPerThread; final DocumentsWriter.DocState docState; + final BytesRef flushTerm = new BytesRef(); TermVectorsTermsWriter.PerDoc doc; @@ -36,9 +37,6 @@ // Used by perField when serializing the term vectors final ByteSliceReader vectorSliceReader = new ByteSliceReader(); - final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(), - new UnicodeUtil.UTF8Result()}; - @Override public void startDocument() { assert clearLastVectorFieldName(); Index: src/java/org/apache/lucene/index/TermVectorsWriter.java =================================================================== --- src/java/org/apache/lucene/index/TermVectorsWriter.java (revision 931099) +++ src/java/org/apache/lucene/index/TermVectorsWriter.java (working copy) @@ -19,6 +19,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; @@ -28,8 +29,7 @@ private IndexOutput tvx = null, tvd = null, tvf = null; private FieldInfos fieldInfos; - final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(), - new UnicodeUtil.UTF8Result()}; + final BytesRef[] utf8Results = new BytesRef[] {new BytesRef(10), new BytesRef(10)}; public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos) @@ -107,14 +107,14 @@ UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]); - int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result, + int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].bytes, utf8Results[1-utf8Upto].length, - utf8Results[utf8Upto].result, + utf8Results[utf8Upto].bytes, 
utf8Results[utf8Upto].length); int length = utf8Results[utf8Upto].length - start; tvf.writeVInt(start); // write shared prefix length tvf.writeVInt(length); // write delta length - tvf.writeBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes + tvf.writeBytes(utf8Results[utf8Upto].bytes, start, length); // write delta bytes utf8Upto = 1-utf8Upto; final int termFreq = freqs[j]; Index: src/java/org/apache/lucene/search/ConstantScoreQuery.java =================================================================== --- src/java/org/apache/lucene/search/ConstantScoreQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/ConstantScoreQuery.java (working copy) @@ -161,8 +161,8 @@ /** Prints a user-readable version of this query. */ @Override public String toString(String field) { - return "ConstantScore(" + filter.toString() - + (getBoost()==1.0 ? ")" : "^" + getBoost()); + return "ConstantScore(" + filter.toString() + ")" + + (getBoost()==1.0 ? "" : "^" + getBoost()); } /** Returns true if o is equal to this. */ Index: src/java/org/apache/lucene/search/ExactPhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 931099) +++ src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy) @@ -22,9 +22,9 @@ final class ExactPhraseScorer extends PhraseScorer { - ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, + ExactPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets, Similarity similarity, byte[] norms) { - super(weight, tps, offsets, similarity, norms); + super(weight, postings, offsets, similarity, norms); } @Override @@ -42,11 +42,11 @@ int freq = 0; do { // find position w/ all terms while (first.position < last.position) { // scan forward in first - do { - if (!first.nextPosition()) - return freq; - } while (first.position < last.position); - firstToLast(); + do { + if (!first.nextPosition()) + return freq; + } while (first.position < last.position); + firstToLast(); } freq++; // all equal: a match } while (last.nextPosition()); Index: src/java/org/apache/lucene/search/FieldCache.java =================================================================== --- src/java/org/apache/lucene/search/FieldCache.java (revision 931099) +++ src/java/org/apache/lucene/search/FieldCache.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.analysis.NumericTokenStream; // for javadocs @@ -100,7 +101,7 @@ */ public interface ByteParser extends Parser { /** Return a single Byte representation of this field's value. */ - public byte parseByte(String string); + public byte parseByte(BytesRef term); } /** Interface to parse shorts from document fields. @@ -108,7 +109,7 @@ */ public interface ShortParser extends Parser { /** Return a short representation of this field's value. */ - public short parseShort(String string); + public short parseShort(BytesRef term); } /** Interface to parse ints from document fields. @@ -116,7 +117,7 @@ */ public interface IntParser extends Parser { /** Return an integer representation of this field's value. */ - public int parseInt(String string); + public int parseInt(BytesRef term); } /** Interface to parse floats from document fields. 
@@ -124,7 +125,7 @@ */ public interface FloatParser extends Parser { /** Return an float representation of this field's value. */ - public float parseFloat(String string); + public float parseFloat(BytesRef term); } /** Interface to parse long from document fields. @@ -132,7 +133,7 @@ */ public interface LongParser extends Parser { /** Return an long representation of this field's value. */ - public long parseLong(String string); + public long parseLong(BytesRef term); } /** Interface to parse doubles from document fields. @@ -140,16 +141,20 @@ */ public interface DoubleParser extends Parser { /** Return an long representation of this field's value. */ - public double parseDouble(String string); + public double parseDouble(BytesRef term); } /** Expert: The cache used internally by sorting and range query classes. */ public static FieldCache DEFAULT = new FieldCacheImpl(); - + /** The default parser for byte values, which are encoded by {@link Byte#toString(byte)} */ public static final ByteParser DEFAULT_BYTE_PARSER = new ByteParser() { - public byte parseByte(String value) { - return Byte.parseByte(value); + public byte parseByte(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Byte.parseByte(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_BYTE_PARSER; @@ -162,8 +167,12 @@ /** The default parser for short values, which are encoded by {@link Short#toString(short)} */ public static final ShortParser DEFAULT_SHORT_PARSER = new ShortParser() { - public short parseShort(String value) { - return Short.parseShort(value); + public short parseShort(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Short.parseShort(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_SHORT_PARSER; @@ -176,8 +185,12 @@ /** The default parser for int values, which are encoded by {@link Integer#toString(int)} */ public static final IntParser DEFAULT_INT_PARSER = new IntParser() { - public int parseInt(String value) { - return Integer.parseInt(value); + public int parseInt(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Integer.parseInt(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_INT_PARSER; @@ -190,8 +203,12 @@ /** The default parser for float values, which are encoded by {@link Float#toString(float)} */ public static final FloatParser DEFAULT_FLOAT_PARSER = new FloatParser() { - public float parseFloat(String value) { - return Float.parseFloat(value); + public float parseFloat(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... 
but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Float.parseFloat(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_FLOAT_PARSER; @@ -204,8 +221,12 @@ /** The default parser for long values, which are encoded by {@link Long#toString(long)} */ public static final LongParser DEFAULT_LONG_PARSER = new LongParser() { - public long parseLong(String value) { - return Long.parseLong(value); + public long parseLong(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Long.parseLong(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_LONG_PARSER; @@ -218,8 +239,12 @@ /** The default parser for double values, which are encoded by {@link Double#toString(double)} */ public static final DoubleParser DEFAULT_DOUBLE_PARSER = new DoubleParser() { - public double parseDouble(String value) { - return Double.parseDouble(value); + public double parseDouble(BytesRef term) { + // TODO: would be far better to directly parse from + // UTF8 bytes... but really users should use + // NumericField, instead, which already decodes + // directly from byte[] + return Double.parseDouble(term.utf8ToString()); } protected Object readResolve() { return DEFAULT_DOUBLE_PARSER; @@ -231,15 +256,14 @@ }; /** - * A parser instance for int values encoded by {@link NumericUtils#intToPrefixCoded(int)}, e.g. when indexed + * A parser instance for int values encoded by {@link NumericUtils}, e.g. when indexed * via {@link NumericField}/{@link NumericTokenStream}. */ public static final IntParser NUMERIC_UTILS_INT_PARSER=new IntParser(){ - public int parseInt(String val) { - final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT; - if (shift>0 && shift<=31) + public int parseInt(BytesRef term) { + if (NumericUtils.getPrefixCodedIntShift(term) > 0) throw new FieldCacheImpl.StopFillCacheException(); - return NumericUtils.prefixCodedToInt(val); + return NumericUtils.prefixCodedToInt(term); } protected Object readResolve() { return NUMERIC_UTILS_INT_PARSER; @@ -255,11 +279,10 @@ * via {@link NumericField}/{@link NumericTokenStream}. */ public static final FloatParser NUMERIC_UTILS_FLOAT_PARSER=new FloatParser(){ - public float parseFloat(String val) { - final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT; - if (shift>0 && shift<=31) + public float parseFloat(BytesRef term) { + if (NumericUtils.getPrefixCodedIntShift(term) > 0) throw new FieldCacheImpl.StopFillCacheException(); - return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(val)); + return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(term)); } protected Object readResolve() { return NUMERIC_UTILS_FLOAT_PARSER; @@ -271,15 +294,14 @@ }; /** - * A parser instance for long values encoded by {@link NumericUtils#longToPrefixCoded(long)}, e.g. when indexed + * A parser instance for long values encoded by {@link NumericUtils}, e.g. when indexed * via {@link NumericField}/{@link NumericTokenStream}. 
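
Callers of FieldCache itself are unchanged by the parser-signature switch; only the parsers decode the BytesRef. A hedged usage sketch, assuming a reader and a "price" field indexed with NumericField (both placeholders, not from the patch):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.FieldCache;

// Load per-document int values for a trie-encoded field; the parser stops
// the fill by throwing StopFillCacheException once it reaches the
// lower-precision terms (non-zero prefix-coded shift), as shown above.
public static int[] loadPrices(IndexReader reader) throws IOException {
  return FieldCache.DEFAULT.getInts(reader, "price",
                                    FieldCache.NUMERIC_UTILS_INT_PARSER);
}
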
*/ public static final LongParser NUMERIC_UTILS_LONG_PARSER = new LongParser(){ - public long parseLong(String val) { - final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG; - if (shift>0 && shift<=63) + public long parseLong(BytesRef term) { + if (NumericUtils.getPrefixCodedLongShift(term) > 0) throw new FieldCacheImpl.StopFillCacheException(); - return NumericUtils.prefixCodedToLong(val); + return NumericUtils.prefixCodedToLong(term); } protected Object readResolve() { return NUMERIC_UTILS_LONG_PARSER; @@ -295,11 +317,10 @@ * via {@link NumericField}/{@link NumericTokenStream}. */ public static final DoubleParser NUMERIC_UTILS_DOUBLE_PARSER = new DoubleParser(){ - public double parseDouble(String val) { - final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG; - if (shift>0 && shift<=63) + public double parseDouble(BytesRef term) { + if (NumericUtils.getPrefixCodedLongShift(term) > 0) throw new FieldCacheImpl.StopFillCacheException(); - return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(val)); + return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(term)); } protected Object readResolve() { return NUMERIC_UTILS_DOUBLE_PARSER; Index: src/java/org/apache/lucene/search/FieldCacheImpl.java =================================================================== --- src/java/org/apache/lucene/search/FieldCacheImpl.java (revision 931099) +++ src/java/org/apache/lucene/search/FieldCacheImpl.java (working copy) @@ -26,9 +26,12 @@ import java.util.WeakHashMap; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.FieldCacheSanityChecker; @@ -277,22 +280,29 @@ return wrapper.getBytes(reader, field, FieldCache.DEFAULT_BYTE_PARSER); } final byte[] retArray = new byte[reader.maxDoc()]; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - byte termval = parser.parseByte(term.text()); - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final byte termval = parser.parseByte(term); + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } return retArray; } @@ -324,22 +334,29 @@ return wrapper.getShorts(reader, field, FieldCache.DEFAULT_SHORT_PARSER); } final short[] retArray = new short[reader.maxDoc()]; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if 
(term==null || term.field() != field) break; - short termval = parser.parseShort(term.text()); - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final short termval = parser.parseShort(term); + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } return retArray; } @@ -375,27 +392,41 @@ } } int[] retArray = null; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - int termval = parser.parseInt(term.text()); - if (retArray == null) // late init - retArray = new int[reader.maxDoc()]; - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final int termval = parser.parseInt(term); + if (retArray == null) { + // late init so numeric fields don't double allocate + retArray = new int[reader.maxDoc()]; + } + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } - if (retArray == null) // no values + + if (retArray == null) { + // no values retArray = new int[reader.maxDoc()]; + } return retArray; } } @@ -431,29 +462,43 @@ } catch (NumberFormatException ne) { return wrapper.getFloats(reader, field, NUMERIC_UTILS_FLOAT_PARSER); } - } + } float[] retArray = null; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - float termval = parser.parseFloat(term.text()); - if (retArray == null) // late init - retArray = new float[reader.maxDoc()]; - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final float termval = parser.parseFloat(term); + if (retArray == null) { + // late init so numeric fields don't double allocate + retArray = new float[reader.maxDoc()]; + } + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + 
break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } - if (retArray == null) // no values + + if (retArray == null) { + // no values retArray = new float[reader.maxDoc()]; + } return retArray; } } @@ -487,27 +532,41 @@ } } long[] retArray = null; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term(field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - long termval = parser.parseLong(term.text()); - if (retArray == null) // late init - retArray = new long[reader.maxDoc()]; - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final long termval = parser.parseLong(term); + if (retArray == null) { + // late init so numeric fields don't double allocate + retArray = new long[reader.maxDoc()]; + } + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } - if (retArray == null) // no values + + if (retArray == null) { + // no values retArray = new long[reader.maxDoc()]; + } return retArray; } } @@ -543,24 +602,35 @@ } } double[] retArray = null; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - double termval = parser.parseDouble(term.text()); - if (retArray == null) // late init - retArray = new double[reader.maxDoc()]; - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + try { + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + final double termval = parser.parseDouble(term); + if (retArray == null) { + // late init so numeric fields don't double allocate + retArray = new double[reader.maxDoc()]; + } + + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } } - } while (termEnum.next()); - } catch (StopFillCacheException stop) { - } finally { - termDocs.close(); - termEnum.close(); + } catch (StopFillCacheException stop) { + } } if (retArray == null) // no values retArray = new double[reader.maxDoc()]; @@ -584,21 +654,27 @@ throws IOException { String field = StringHelper.intern(entryKey.field); final String[] retArray = new String[reader.maxDoc()]; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; - String termval = 
term.text(); - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = termval; + + Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; } - } while (termEnum.next()); - } finally { - termDocs.close(); - termEnum.close(); + docs = termsEnum.docs(delDocs, docs); + final String termval = term.utf8ToString(); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + retArray[docID] = termval; + } + } } return retArray; } @@ -621,8 +697,10 @@ String field = StringHelper.intern(entryKey.field); final int[] retArray = new int[reader.maxDoc()]; String[] mterms = new String[reader.maxDoc()+1]; - TermDocs termDocs = reader.termDocs(); - TermEnum termEnum = reader.terms (new Term (field)); + + //System.out.println("FC: getStringIndex field=" + field); + Terms terms = MultiFields.getTerms(reader, field); + int t = 0; // current term number // an entry for documents that have no terms in this field @@ -631,24 +709,31 @@ // needs to change as well. mterms[t++] = null; - try { - do { - Term term = termEnum.term(); - if (term==null || term.field() != field) break; + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = null; + while(true) { + final BytesRef term = termsEnum.next(); + if (term == null) { + break; + } // store term text - mterms[t] = term.text(); + mterms[t] = term.utf8ToString(); + //System.out.println("FC: ord=" + t + " term=" + term.toBytesString()); - termDocs.seek (termEnum); - while (termDocs.next()) { - retArray[termDocs.doc()] = t; + docs = termsEnum.docs(delDocs, docs); + while (true) { + final int docID = docs.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + //System.out.println("FC: docID=" + docID); + retArray[docID] = t; } - t++; - } while (termEnum.next()); - } finally { - termDocs.close(); - termEnum.close(); + } } if (t == 0) { @@ -658,16 +743,17 @@ } else if (t < mterms.length) { // if there are less terms than documents, // trim off the dead array space - String[] terms = new String[t]; - System.arraycopy (mterms, 0, terms, 0, t); - mterms = terms; + String[] newTerms = new String[t]; + System.arraycopy (mterms, 0, newTerms, 0, t); + mterms = newTerms; } StringIndex value = new StringIndex (retArray, mterms); + //System.out.println("FC: done\n"); return value; } } - + private volatile PrintStream infoStream; public void setInfoStream(PrintStream stream) { Index: src/java/org/apache/lucene/search/FieldCacheRangeFilter.java =================================================================== --- src/java/org/apache/lucene/search/FieldCacheRangeFilter.java (revision 931099) +++ src/java/org/apache/lucene/search/FieldCacheRangeFilter.java (working copy) @@ -19,8 +19,9 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.Bits; import org.apache.lucene.document.NumericField; // for javadocs /** @@ -119,9 +120,9 @@ assert inclusiveLowerPoint > 0 && inclusiveUpperPoint > 0; - // for this DocIdSet, we never need to use TermDocs, + // for this DocIdSet, we can ignore deleted docs // 
because deleted docs have an order of 0 (null entry in StringIndex) - return new FieldCacheDocIdSet(reader, false) { + return new FieldCacheDocIdSet(reader, true) { @Override final boolean matchDoc(int doc) { return fcsi.order[doc] >= inclusiveLowerPoint && fcsi.order[doc] <= inclusiveUpperPoint; @@ -171,8 +172,8 @@ return DocIdSet.EMPTY_DOCIDSET; final byte[] values = FieldCache.DEFAULT.getBytes(reader, field, (FieldCache.ByteParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { + // we only respect deleted docs if the range contains 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -222,8 +223,8 @@ return DocIdSet.EMPTY_DOCIDSET; final short[] values = FieldCache.DEFAULT.getShorts(reader, field, (FieldCache.ShortParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -273,8 +274,8 @@ return DocIdSet.EMPTY_DOCIDSET; final int[] values = FieldCache.DEFAULT.getInts(reader, field, (FieldCache.IntParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -324,8 +325,8 @@ return DocIdSet.EMPTY_DOCIDSET; final long[] values = FieldCache.DEFAULT.getLongs(reader, field, (FieldCache.LongParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0L && inclusiveUpperPoint >= 0L)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0L && inclusiveUpperPoint >= 0L)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -379,8 +380,8 @@ return DocIdSet.EMPTY_DOCIDSET; final float[] values = FieldCache.DEFAULT.getFloats(reader, field, (FieldCache.FloatParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0.0f && inclusiveUpperPoint >= 0.0f)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0.0f && inclusiveUpperPoint >= 0.0f)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -434,8 +435,8 @@ return DocIdSet.EMPTY_DOCIDSET; final double[] values = FieldCache.DEFAULT.getDoubles(reader, field, (FieldCache.DoubleParser) parser); - // we only request the usage of termDocs, if the range contains 0 - return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0.0 && 
inclusiveUpperPoint >= 0.0)) { + // ignore deleted docs if range doesn't contain 0 + return new FieldCacheDocIdSet(reader, !(inclusiveLowerPoint <= 0.0 && inclusiveUpperPoint >= 0.0)) { @Override boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; @@ -503,99 +504,81 @@ static abstract class FieldCacheDocIdSet extends DocIdSet { private final IndexReader reader; - private boolean mayUseTermDocs; - - FieldCacheDocIdSet(IndexReader reader, boolean mayUseTermDocs) { + private boolean canIgnoreDeletedDocs; + + FieldCacheDocIdSet(IndexReader reader, boolean canIgnoreDeletedDocs) { this.reader = reader; - this.mayUseTermDocs = mayUseTermDocs; + this.canIgnoreDeletedDocs = canIgnoreDeletedDocs; } - - /** this method checks, if a doc is a hit, should throw AIOBE, when position invalid */ + + /** + * this method checks, if a doc is a hit, should throw AIOBE, when position + * invalid + */ abstract boolean matchDoc(int doc) throws ArrayIndexOutOfBoundsException; - - /** this DocIdSet is cacheable, if it works solely with FieldCache and no TermDocs */ + + /** + * this DocIdSet is cacheable, if it can ignore deletions + */ @Override public boolean isCacheable() { - return !(mayUseTermDocs && reader.hasDeletions()); + return canIgnoreDeletedDocs || !reader.hasDeletions(); } @Override public DocIdSetIterator iterator() throws IOException { // Synchronization needed because deleted docs BitVector // can change after call to hasDeletions until TermDocs creation. - // We only use an iterator with termDocs, when this was requested (e.g. range contains 0) + // We only use an iterator with termDocs, when this was requested (e.g. + // range contains 0) // and the index has deletions - final TermDocs termDocs; - synchronized(reader) { - termDocs = isCacheable() ? 
null : reader.termDocs(null); + + final Bits skipDocs; + synchronized (reader) { + if (isCacheable()) { + skipDocs = null; + } else { + skipDocs = MultiFields.getDeletedDocs(reader); + } } - if (termDocs != null) { - // a DocIdSetIterator using TermDocs to iterate valid docIds - return new DocIdSetIterator() { - private int doc = -1; - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { + final int maxDoc = reader.maxDoc(); + + // a DocIdSetIterator generating docIds by + // incrementing a variable & checking skipDocs - + return new DocIdSetIterator() { + private int doc = -1; + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() { + try { do { - if (!termDocs.next()) - return doc = NO_MORE_DOCS; - } while (!matchDoc(doc = termDocs.doc())); + doc++; + } while ((skipDocs != null && doc < maxDoc && skipDocs.get(doc)) + || !matchDoc(doc)); return doc; + } catch (ArrayIndexOutOfBoundsException e) { + return doc = NO_MORE_DOCS; } - - @Override - public int advance(int target) throws IOException { - if (!termDocs.skipTo(target)) - return doc = NO_MORE_DOCS; - while (!matchDoc(doc = termDocs.doc())) { - if (!termDocs.next()) - return doc = NO_MORE_DOCS; + } + + @Override + public int advance(int target) { + try { + doc = target; + while (!matchDoc(doc)) { + doc++; } return doc; + } catch (ArrayIndexOutOfBoundsException e) { + return doc = NO_MORE_DOCS; } - }; - } else { - // a DocIdSetIterator generating docIds by incrementing a variable - - // this one can be used if there are no deletions are on the index - return new DocIdSetIterator() { - private int doc = -1; - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() { - try { - do { - doc++; - } while (!matchDoc(doc)); - return doc; - } catch (ArrayIndexOutOfBoundsException e) { - return doc = NO_MORE_DOCS; - } - } - - @Override - public int advance(int target) { - try { - doc = target; - while (!matchDoc(doc)) { - doc++; - } - return doc; - } catch (ArrayIndexOutOfBoundsException e) { - return doc = NO_MORE_DOCS; - } - } - }; - } + + } + }; } } Index: src/java/org/apache/lucene/search/FilteredTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/FilteredTermEnum.java (revision 931099) +++ src/java/org/apache/lucene/search/FilteredTermEnum.java (working copy) @@ -24,7 +24,11 @@ /** Abstract class for enumerating a subset of all terms.
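
The FieldCacheImpl and FieldCacheRangeFilter hunks above all follow the same flex-API pattern: fetch the field's Terms via MultiFields, walk a TermsEnum, and for each term pull a DocsEnum that already honors deleted docs. A condensed sketch of that fill loop, with the reader, field, parser and target array supplied by the caller and the StopFillCacheException handling omitted:

import java.io.IOException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

// Hypothetical helper mirroring the getBytes/getShorts/getInts fill pattern.
public static void fillInts(IndexReader reader, String field, int[] values,
                            FieldCache.IntParser parser) throws IOException {
  Terms terms = MultiFields.getTerms(reader, field);
  if (terms == null) {
    return;                                  // field does not exist
  }
  TermsEnum termsEnum = terms.iterator();
  Bits delDocs = MultiFields.getDeletedDocs(reader);
  DocsEnum docs = null;                      // reused across terms
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    int termval = parser.parseInt(term);
    docs = termsEnum.docs(delDocs, docs);    // postings that skip deleted docs
    int docID;
    while ((docID = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
      values[docID] = termval;
    }
  }
}
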

    Term enumerations are always ordered by Term.compareTo(). Each term in - the enumeration is greater than all that precede it. */ + the enumeration is greater than all that precede it. + + @deprecated Switch to {@link FilteredTermsEnum} instead. +*/ +@Deprecated public abstract class FilteredTermEnum extends TermEnum { /** the current term */ protected Term currentTerm = null; @@ -37,7 +41,14 @@ /** Equality compare on the term */ protected abstract boolean termCompare(Term term); - /** Equality measure on the term */ + /** Equality measure on the term, it is in reality a boost + * factor and used like so in {@link MultiTermQuery}, + * so the name is wrong. + * @deprecated Use {@link MultiTermQuery.BoostAttribute} + * together with {@link FilteredTermsEnum}. For example + * see {@link FuzzyTermsEnum} + */ + @Deprecated public abstract float difference(); /** Indicates the end of the enumeration has been reached */ Index: src/java/org/apache/lucene/search/function/ValueSourceQuery.java =================================================================== --- src/java/org/apache/lucene/search/function/ValueSourceQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/function/ValueSourceQuery.java (working copy) @@ -18,10 +18,11 @@ */ import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.*; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.search.*; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.Bits; import java.io.IOException; import java.util.Set; @@ -56,7 +57,7 @@ return this; } - /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */ + /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(Set) */ @Override public void extractTerms(Set terms) { // no terms involved here @@ -127,7 +128,8 @@ private class ValueSourceScorer extends Scorer { private final float qWeight; private final DocValues vals; - private final TermDocs termDocs; + private final Bits delDocs; + private final int maxDoc; private int doc = -1; // constructor @@ -136,28 +138,37 @@ qWeight = w.getValue(); // this is when/where the values are first created. vals = valSrc.getValues(reader); - termDocs = reader.termDocs(null); + delDocs = MultiFields.getDeletedDocs(reader); + maxDoc = reader.maxDoc(); } @Override public int nextDoc() throws IOException { - return doc = termDocs.next() ? termDocs.doc() : NO_MORE_DOCS; + doc++; + while (delDocs != null && doc < maxDoc && delDocs.get(doc)) { + doc++; + } + if (doc == maxDoc) { + doc = NO_MORE_DOCS; + } + return doc; } - + @Override public int docID() { return doc; } - + @Override public int advance(int target) throws IOException { - return doc = termDocs.skipTo(target) ? 
termDocs.doc() : NO_MORE_DOCS; + doc = target - 1; + return nextDoc(); } /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */ @Override public float score() throws IOException { - return qWeight * vals.floatVal(termDocs.doc()); + return qWeight * vals.floatVal(doc); } } Index: src/java/org/apache/lucene/search/FuzzyQuery.java =================================================================== --- src/java/org/apache/lucene/search/FuzzyQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/FuzzyQuery.java (working copy) @@ -19,6 +19,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; @@ -68,6 +69,7 @@ */ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength, int maxExpansions) { + super(term.field()); this.term = term; if (minimumSimilarity >= 1.0f) @@ -127,7 +129,7 @@ return prefixLength; } - @Override + @Override @Deprecated protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { if (!termLongEnough) { // can only match if it's exact return new SingleTermEnum(reader, term); @@ -135,6 +137,14 @@ return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength); } + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + if (!termLongEnough) { // can only match if it's exact + return new SingleTermsEnum(reader, term); + } + return new FuzzyTermsEnum(reader, getTerm(), minimumSimilarity, prefixLength); + } + /** * Returns the pattern term. */ Index: src/java/org/apache/lucene/search/FuzzyTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/FuzzyTermEnum.java (revision 931099) +++ src/java/org/apache/lucene/search/FuzzyTermEnum.java (working copy) @@ -27,7 +27,10 @@ * *

    Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. + * + * @deprecated Please use {@link FuzzyTermsEnum} instead. */ +@Deprecated public final class FuzzyTermEnum extends FilteredTermEnum { /* Allows us save time required to create a new array @@ -136,7 +139,8 @@ return false; } - /** {@inheritDoc} */ + /** @deprecated Use {@link MultiTermQuery.BoostAttribute} together with {@link FuzzyTermsEnum} */ + @Deprecated @Override public final float difference() { return (similarity - minimumSimilarity) * scale_factor; Index: src/java/org/apache/lucene/search/MatchAllDocsQuery.java =================================================================== --- src/java/org/apache/lucene/search/MatchAllDocsQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/MatchAllDocsQuery.java (working copy) @@ -19,8 +19,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.Bits; import java.util.Set; import java.io.IOException; @@ -45,16 +46,18 @@ } private class MatchAllScorer extends Scorer { - final TermDocs termDocs; final float score; final byte[] norms; private int doc = -1; + private final int maxDoc; + private final Bits delDocs; MatchAllScorer(IndexReader reader, Similarity similarity, Weight w, byte[] norms) throws IOException { super(similarity); - this.termDocs = reader.termDocs(null); + delDocs = MultiFields.getDeletedDocs(reader); score = w.getValue(); + maxDoc = reader.maxDoc(); this.norms = norms; } @@ -65,7 +68,14 @@ @Override public int nextDoc() throws IOException { - return doc = termDocs.next() ? termDocs.doc() : NO_MORE_DOCS; + doc++; + while(delDocs != null && doc < maxDoc && delDocs.get(doc)) { + doc++; + } + if (doc == maxDoc) { + doc = NO_MORE_DOCS; + } + return doc; } @Override @@ -75,7 +85,8 @@ @Override public int advance(int target) throws IOException { - return doc = termDocs.skipTo(target) ? 
termDocs.doc() : NO_MORE_DOCS; + doc = target-1; + return nextDoc(); } } Index: src/java/org/apache/lucene/search/MultiPhraseQuery.java =================================================================== --- src/java/org/apache/lucene/search/MultiPhraseQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/MultiPhraseQuery.java (working copy) @@ -21,10 +21,14 @@ import java.util.*; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultipleTermPositions; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.Bits; /** * MultiPhraseQuery is a generalized version of PhraseQuery, with an added @@ -167,27 +171,31 @@ if (termArrays.size() == 0) // optimize zero-term case return null; - TermPositions[] tps = new TermPositions[termArrays.size()]; - for (int i=0; i 1) - p = new MultipleTermPositions(reader, terms); - else - p = reader.termPositions(terms[0]); + final DocsAndPositionsEnum postingsEnum; + if (terms.length > 1) { + postingsEnum = new UnionDocsAndPositionsEnum(reader, terms); + } else { + postingsEnum = reader.termPositionsEnum(MultiFields.getDeletedDocs(reader), + terms[0].field(), + new BytesRef(terms[0].text())); + } - if (p == null) + if (postingsEnum == null) { return null; + } - tps[i] = p; + postings[i] = postingsEnum; } if (slop == 0) - return new ExactPhraseScorer(this, tps, getPositions(), similarity, + return new ExactPhraseScorer(this, postings, getPositions(), similarity, reader.norms(field)); else - return new SloppyPhraseScorer(this, tps, getPositions(), similarity, + return new SloppyPhraseScorer(this, postings, getPositions(), similarity, slop, reader.norms(field)); } @@ -370,3 +378,169 @@ return true; } } + +/** + * Takes the logical union of multiple DocsEnum iterators. 
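
MultiPhraseQuery above now asks the reader for one DocsAndPositionsEnum per phrase position instead of TermPositions. A hedged sketch of walking a single term's postings through that API; the "body"/"apache" term is a placeholder, not taken from the patch:

import java.io.IOException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.BytesRef;

// Hypothetical dump of every (doc, position) pair for the term body:apache.
public static void dumpPositions(IndexReader reader) throws IOException {
  DocsAndPositionsEnum postings = reader.termPositionsEnum(
      MultiFields.getDeletedDocs(reader), "body", new BytesRef("apache"));
  if (postings == null) {
    return;                                  // term does not occur
  }
  int doc;
  while ((doc = postings.nextDoc()) != DocsAndPositionsEnum.NO_MORE_DOCS) {
    final int freq = postings.freq();
    for (int i = 0; i < freq; i++) {
      System.out.println(doc + ":" + postings.nextPosition());
    }
  }
}
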
+ */ + +// TODO: if ever we allow subclassing of the *PhraseScorer +class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { + + private static final class DocsQueue extends PriorityQueue { + DocsQueue(List docsEnums) throws IOException { + initialize(docsEnums.size()); + + Iterator i = docsEnums.iterator(); + while (i.hasNext()) { + DocsAndPositionsEnum postings = (DocsAndPositionsEnum) i.next(); + if (postings.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) { + add(postings); + } + } + } + + final public DocsEnum peek() { + return top(); + } + + @Override + public final boolean lessThan(DocsAndPositionsEnum a, DocsAndPositionsEnum b) { + return a.docID() < b.docID(); + } + } + + private static final class IntQueue { + private int _arraySize = 16; + private int _index = 0; + private int _lastIndex = 0; + private int[] _array = new int[_arraySize]; + + final void add(int i) { + if (_lastIndex == _arraySize) + growArray(); + + _array[_lastIndex++] = i; + } + + final int next() { + return _array[_index++]; + } + + final void sort() { + Arrays.sort(_array, _index, _lastIndex); + } + + final void clear() { + _index = 0; + _lastIndex = 0; + } + + final int size() { + return (_lastIndex - _index); + } + + private void growArray() { + int[] newArray = new int[_arraySize * 2]; + System.arraycopy(_array, 0, newArray, 0, _arraySize); + _array = newArray; + _arraySize *= 2; + } + } + + private int _doc; + private int _freq; + private DocsQueue _queue; + private IntQueue _posList; + + public UnionDocsAndPositionsEnum(IndexReader indexReader, Term[] terms) throws IOException { + List docsEnums = new LinkedList(); + final Bits delDocs = MultiFields.getDeletedDocs(indexReader); + for (int i = 0; i < terms.length; i++) { + DocsAndPositionsEnum postings = indexReader.termPositionsEnum(delDocs, + terms[i].field(), + new BytesRef(terms[i].text())); + if (postings != null) { + docsEnums.add(postings); + } + } + + _queue = new DocsQueue(docsEnums); + _posList = new IntQueue(); + } + + @Override + public final int nextDoc() throws IOException { + if (_queue.size() == 0) { + return NO_MORE_DOCS; + } + + // TODO: move this init into positions(): if the search + // doesn't need the positions for this doc then don't + // waste CPU merging them: + _posList.clear(); + _doc = _queue.top().docID(); + + // merge sort all positions together + DocsAndPositionsEnum postings; + do { + postings = _queue.top(); + + final int freq = postings.freq(); + for (int i = 0; i < freq; i++) { + _posList.add(postings.nextPosition()); + } + + if (postings.nextDoc() != NO_MORE_DOCS) { + _queue.updateTop(); + } else { + _queue.pop(); + } + } while (_queue.size() > 0 && _queue.top().docID() == _doc); + + _posList.sort(); + _freq = _posList.size(); + + return _doc; + } + + @Override + public int nextPosition() { + return _posList.next(); + } + + @Override + public int getPayloadLength() { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef getPayload() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasPayload() { + throw new UnsupportedOperationException(); + } + + @Override + public final int advance(int target) throws IOException { + while (_queue.top() != null && target > _queue.top().docID()) { + DocsAndPositionsEnum postings = _queue.pop(); + if (postings.advance(target) != NO_MORE_DOCS) { + _queue.add(postings); + } + } + return nextDoc(); + } + + @Override + public final int freq() { + return _freq; + } + + @Override + public final int docID() { + return _doc; + } 
+} Index: src/java/org/apache/lucene/search/MultiTermQuery.java =================================================================== --- src/java/org/apache/lucene/search/MultiTermQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/MultiTermQuery.java (working copy) @@ -24,17 +24,24 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; - +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.Terms; import org.apache.lucene.queryParser.QueryParser; // for javadoc +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.VirtualMethod; /** * An abstract {@link Query} that matches documents * containing a subset of terms provided by a {@link - * FilteredTermEnum} enumeration. + * FilteredTermsEnum} enumeration. * *

    This query cannot be used directly; you must subclass - * it and define {@link #getEnum} to provide a {@link - * FilteredTermEnum} that iterates through the terms to be + * it and define {@link #getTermsEnum} to provide a {@link + * FilteredTermsEnum} that iterates through the terms to be * matched. * *

    NOTE: if {@link #setRewriteMethod} is either @@ -61,9 +68,91 @@ * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} by default. */ public abstract class MultiTermQuery extends Query { + protected final String field; protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; transient int numberOfTerms = 0; + + /** @deprecated remove when getEnum is removed */ + private static final VirtualMethod getEnumMethod = + new VirtualMethod(MultiTermQuery.class, "getEnum", IndexReader.class); + /** @deprecated remove when getEnum is removed */ + private static final VirtualMethod getTermsEnumMethod = + new VirtualMethod(MultiTermQuery.class, "getTermsEnum", IndexReader.class); + /** @deprecated remove when getEnum is removed */ + final boolean hasNewAPI = + VirtualMethod.compareImplementationDistance(getClass(), + getTermsEnumMethod, getEnumMethod) >= 0; // it's ok for both to be overridden + /** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum} + * and update the boost on each returned term. This makes it possible to control the boost factor + * for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or + * {@link TopTermsBooleanQueryRewrite} mode. + * {@link FuzzyQuery} uses this to take the edit distance into account. + */ + public static interface BoostAttribute extends Attribute { + /** Sets the boost in this attribute */ + public void setBoost(float boost); + /** Retrieves the boost, default is {@code 1.0f}. */ + public float getBoost(); + /** Sets the maximum boost for terms that would never get + * into the priority queue of {@link MultiTermQuery.TopTermsBooleanQueryRewrite}. + * This value is not changed by {@link AttributeImpl#clear} + * and not used in {@code equals()} and {@code hashCode()}. + * Do not change the value in the {@link TermsEnum}! + */ + public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost); + /** Retrieves the maximum boost that is not competitive, + * default is negative infinity. You can use this boost value + * as a hint when writing the {@link TermsEnum}. + */ + public float getMaxNonCompetitiveBoost(); + } + + /** Implementation class for {@link BoostAttribute}. */ + public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute { + private float boost = 1.0f, maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; + + public void setBoost(float boost) { + this.boost = boost; + } + + public float getBoost() { + return boost; + } + + public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { + this.maxNonCompetitiveBoost = maxNonCompetitiveBoost; + } + + public float getMaxNonCompetitiveBoost() { + return maxNonCompetitiveBoost; + } + + @Override + public void clear() { + boost = 1.0f; + } + + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other instanceof BoostAttributeImpl) + return ((BoostAttributeImpl) other).boost == boost; + return false; + } + + @Override + public int hashCode() { + return Float.floatToIntBits(boost); + } + + @Override + public void copyTo(AttributeImpl target) { + ((BoostAttribute) target).setBoost(boost); + } + } + /** Abstract class that defines how the query is rewritten.
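
The BoostAttribute just added is how a term producer and the rewrite methods below exchange per-term boosts without widening any method signatures. A toy illustration of that contract, using a standalone AttributeSource in place of the TermsEnum's own attributes(); none of these local names come from the patch:

import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.AttributeSource;

// The producer (e.g. FuzzyTermsEnum) sets a boost per term; the consumer
// (collectTerms) reads it back and can publish the lowest still-competitive
// boost as a hint via setMaxNonCompetitiveBoost.
public static float boostAttributeDemo() {
  AttributeSource attrs = new AttributeSource();
  MultiTermQuery.BoostAttribute boostAtt =
      attrs.addAttribute(MultiTermQuery.BoostAttribute.class);
  boostAtt.setBoost(0.75f);                  // producer side
  boostAtt.setMaxNonCompetitiveBoost(0.5f);  // consumer-side hint
  return boostAtt.getBoost();                // consumer side, per collected term
}
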
*/ public static abstract class RewriteMethod implements Serializable { public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException; @@ -100,30 +189,79 @@ private abstract static class BooleanQueryRewrite extends RewriteMethod { protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { - final FilteredTermEnum enumerator = query.getEnum(reader); - int count = 0; - try { - do { - Term t = enumerator.term(); - if (t != null) { - if (collector.collect(t, enumerator.difference())) { - count++; - } else { - break; + + if (query.hasNewAPI) { + + if (query.field == null) { + throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); + } + + final Fields fields = MultiFields.getFields(reader); + if (fields == null) { + // reader has no fields + return 0; + } + + final Terms terms = fields.terms(query.field); + if (terms == null) { + // field does not exist + return 0; + } + + final TermsEnum termsEnum = query.getTermsEnum(reader); + assert termsEnum != null; + + if (termsEnum == TermsEnum.EMPTY) + return 0; + final BoostAttribute boostAtt = + termsEnum.attributes().addAttribute(BoostAttribute.class); + collector.boostAtt = boostAtt; + int count = 0; + BytesRef term; + final Term placeholderTerm = new Term(query.field); + while ((term = termsEnum.next()) != null) { + if (collector.collect(placeholderTerm.createTerm(term.utf8ToString()), boostAtt.getBoost())) { + count++; + } else { + break; + } + } + collector.boostAtt = null; + return count; + } else { + // deprecated case + final FilteredTermEnum enumerator = query.getEnum(reader); + int count = 0; + try { + do { + Term t = enumerator.term(); + if (t != null) { + if (collector.collect(t, enumerator.difference())) { + count++; + } else { + break; + } } - } - } while (enumerator.next()); - } finally { - enumerator.close(); + } while (enumerator.next()); + } finally { + enumerator.close(); + } + return count; } - return count; } - protected interface TermCollector { + protected static abstract class TermCollector { + /** this field is only set if a boostAttribute is used (e.g. {@link FuzzyTermsEnum}) */ + private BoostAttribute boostAtt = null; + /** return false to stop collecting */ - boolean collect(Term t, float boost) throws IOException; + public abstract boolean collect(Term t, float boost) throws IOException; + + /** set the minimum boost as a hint for the term producer */ + protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { + if (boostAtt != null) boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); + } } - } private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite { @@ -207,6 +345,7 @@ stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm(); + setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); return true; } @@ -338,6 +477,7 @@ public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { Query result = super.rewrite(reader, query); assert result instanceof BooleanQuery; + // TODO: if empty boolean query return NullQuery? 
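
Taken together with the new MultiTermQuery(String) constructor and the getTermsEnum() hook defined below, the collectTerms() changes above mean a subclass only has to hand back a positioned TermsEnum. A hedged sketch of such a subclass; it leans on SingleTermsEnum, the helper FuzzyQuery falls back to in its hunk above, and the class itself is illustrative only:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.SingleTermsEnum;

// Toy query matching exactly one term, written against the flex API:
// pass the field to super() and override getTermsEnum(), not getEnum().
public class SingleTermMultiQuery extends MultiTermQuery {
  private final Term term;

  public SingleTermMultiQuery(Term term) {
    super(term.field());                     // field is required for the flex path
    this.term = term;
  }

  @Override
  protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
    return new SingleTermsEnum(reader, term);
  }

  @Override
  public String toString(String field) {
    return term.toString();
  }
}
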
if (!((BooleanQuery) result).clauses().isEmpty()) { // strip the scores off result = new ConstantScoreQuery(new QueryWrapperFilter(result)); @@ -448,7 +588,7 @@ } } - private static final class CutOffTermCollector implements TermCollector { + private static final class CutOffTermCollector extends TermCollector { CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) { this.reader = reader; this.docCountCutoff = docCountCutoff; @@ -465,6 +605,7 @@ // should not be costly, because 1) the // query/filter will load the TermInfo when it // runs, and 2) the terms dict has a cache: + // @deprecated: in 4.0 use BytesRef for collectTerms() docVisitCount += reader.docFreq(t); return true; } @@ -538,13 +679,45 @@ * Constructs a query matching terms that cannot be represented with a single * Term. */ + public MultiTermQuery(final String field) { + this.field = field; + } + + /** + * Constructs a query matching terms that cannot be represented with a single + * Term. + * @deprecated Use {@link #MultiTermQuery(String)}, as the flex branch can + * only work on one field per terms enum. If you override + * {@link #getTermsEnum(IndexReader)}, you cannot use this ctor. + */ + @Deprecated public MultiTermQuery() { + this(null); } - /** Construct the enumeration to be used, expanding the pattern term. */ - protected abstract FilteredTermEnum getEnum(IndexReader reader) - throws IOException; + /** Returns the field name for this query */ + public final String getField() { return field; } + /** Construct the enumeration to be used, expanding the + * pattern term. + * @deprecated Please override {@link #getTermsEnum} instead */ + @Deprecated + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + throw new UnsupportedOperationException(); + } + + /** Construct the enumeration to be used, expanding the + * pattern term. This method should only be called if + * the field exists (ie, implementations can assume the + * field does exist). This method should not return null + * (should instead return {@link TermsEnum#EMPTY} if no + * terms match). The TermsEnum must already be + * positioned to the first matching term. */ + // TODO 4.0: make this method abstract + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + throw new UnsupportedOperationException(); + } + /** * Expert: Return the number of unique terms visited during execution of the query. * If there are many of them, you may consider using another query type @@ -602,8 +775,8 @@ final int prime = 31; int result = 1; result = prime * result + Float.floatToIntBits(getBoost()); - result = prime * result; - result += rewriteMethod.hashCode(); + result = prime * result + rewriteMethod.hashCode(); + if (field != null) result = prime * result + field.hashCode(); return result; } @@ -621,7 +794,7 @@ if (!rewriteMethod.equals(other.rewriteMethod)) { return false; } - return true; + return (other.field == null ? 
field == null : other.field.equals(field)); } } Index: src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java =================================================================== --- src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (revision 931099) +++ src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (working copy) @@ -21,9 +21,15 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.Bits; /** * A wrapper for {@link MultiTermQuery}, that exposes its @@ -70,6 +76,9 @@ public final int hashCode() { return query.hashCode(); } + + /** Returns the field name for this query */ + public final String getField() { return query.getField(); } /** * Expert: Return the number of unique terms visited during execution of the filter. @@ -95,49 +104,101 @@ } /** - * Returns a DocIdSet with documents that should be - * permitted in search results. + * Returns a DocIdSet with documents that should be permitted in search + * results. */ @Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException { - final TermEnum enumerator = query.getEnum(reader); - try { - // if current term in enum is null, the enum is empty -> shortcut - if (enumerator.term() == null) + if (query.hasNewAPI) { + if (query.field == null) { + throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); + } + + final Fields fields = MultiFields.getFields(reader); + if (fields == null) { + // reader has no fields return DocIdSet.EMPTY_DOCIDSET; - // else fill into a OpenBitSet - final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc()); - final int[] docs = new int[32]; - final int[] freqs = new int[32]; - TermDocs termDocs = reader.termDocs(); - try { + } + + final Terms terms = fields.terms(query.field); + if (terms == null) { + // field does not exist + return DocIdSet.EMPTY_DOCIDSET; + } + + final TermsEnum termsEnum = query.getTermsEnum(reader); + assert termsEnum != null; + if (termsEnum.next() != null) { + // fill into a OpenBitSet + final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc()); int termCount = 0; + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docsEnum = null; do { - Term term = enumerator.term(); - if (term == null) - break; termCount++; - termDocs.seek(term); + // System.out.println(" iter termCount=" + termCount + " term=" + + // enumerator.term().toBytesString()); + docsEnum = termsEnum.docs(delDocs, docsEnum); + final DocsEnum.BulkReadResult result = docsEnum.getBulkResult(); while (true) { - final int count = termDocs.read(docs, freqs); + final int count = docsEnum.read(); if (count != 0) { - for(int i=0;i=1"); - this.field = StringHelper.intern(field); this.precisionStep = precisionStep; this.valSize = valSize; this.min = min; @@ -299,15 +299,15 @@ ) { return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 32, min, max, minInclusive, maxInclusive); } - - @Override - protected FilteredTermEnum getEnum(final IndexReader reader) throws IOException { - return new NumericRangeTermEnum(reader); + + @Override @SuppressWarnings("unchecked") + protected 
TermsEnum getTermsEnum(final IndexReader reader) throws IOException { + // very strange: java.lang.Number itsself is not Comparable, but all subclasses used here are + return (min != null && max != null && ((Comparable) min).compareTo(max) > 0) ? + TermsEnum.EMPTY : + new NumericRangeTermsEnum(reader); } - /** Returns the field name for this query */ - public String getField() { return field; } - /** Returns true if the lower endpoint is inclusive */ public boolean includesMin() { return minInclusive; } @@ -323,7 +323,7 @@ @Override public String toString(final String field) { final StringBuilder sb = new StringBuilder(); - if (!this.field.equals(field)) sb.append(this.field).append(':'); + if (!getField().equals(field)) sb.append(getField()).append(':'); return sb.append(minInclusive ? '[' : '{') .append((min == null) ? "*" : min.toString()) .append(" TO ") @@ -341,7 +341,6 @@ if (o instanceof NumericRangeQuery) { final NumericRangeQuery q=(NumericRangeQuery)o; return ( - field==q.field && (q.min == null ? min == null : q.min.equals(min)) && (q.max == null ? max == null : q.max.equals(max)) && minInclusive == q.minInclusive && @@ -355,29 +354,22 @@ @Override public final int hashCode() { int hash = super.hashCode(); - hash += field.hashCode()^0x4565fd66 + precisionStep^0x64365465; + hash += precisionStep^0x64365465; if (min != null) hash += min.hashCode()^0x14fa55fb; if (max != null) hash += max.hashCode()^0x733fa5fe; return hash + (Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+ (Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe); } - - // field must be interned after reading from stream - private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException { - in.defaultReadObject(); - field = StringHelper.intern(field); - } // members (package private, to be also fast accessible by NumericRangeTermEnum) - String field; final int precisionStep, valSize; final T min, max; final boolean minInclusive,maxInclusive; /** - * Subclass of FilteredTermEnum for enumerating all terms that match the - * sub-ranges for trie range queries. + * Subclass of FilteredTermsEnum for enumerating all terms that match the + * sub-ranges for trie range queries, using flex API. *
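
From the caller's side nothing changes for numeric ranges: the newIntRange/newLongRange factories still build the query, and the enum defined below does the term walking. A hedged usage sketch, assuming an IndexSearcher and a "price" NumericField (both placeholders):

import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

// Match documents whose "price" value lies in [10, 100]; at search time the
// query enumerates its precomputed sub-range bounds through the
// NumericRangeTermsEnum defined below.
public static TopDocs findInPriceRange(IndexSearcher searcher) throws IOException {
  Query q = NumericRangeQuery.newIntRange("price", 10, 100, true, true);
  return searcher.search(q, 10);
}
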

    * WARNING: This term enumeration is not guaranteed to be always ordered by * {@link Term#compareTo}. @@ -385,16 +377,15 @@ * {@link NumericUtils#splitIntRange} generates the sub-ranges. For * {@link MultiTermQuery} ordering is not relevant. */ - private final class NumericRangeTermEnum extends FilteredTermEnum { + private final class NumericRangeTermsEnum extends FilteredTermsEnum { - private final IndexReader reader; - private final LinkedList rangeBounds = new LinkedList(); - private final Term termTemplate = new Term(field); - private String currentUpperBound = null; + private BytesRef currentLowerBound, currentUpperBound; - NumericRangeTermEnum(final IndexReader reader) throws IOException { - this.reader = reader; - + private final LinkedList rangeBounds = new LinkedList(); + private final Comparator termComp; + + NumericRangeTermsEnum(final IndexReader reader) throws IOException { + super(reader, getField()); switch (valSize) { case 64: { // lower @@ -423,7 +414,7 @@ NumericUtils.splitLongRange(new NumericUtils.LongRangeBuilder() { @Override - public final void addRange(String minPrefixCoded, String maxPrefixCoded) { + public final void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { rangeBounds.add(minPrefixCoded); rangeBounds.add(maxPrefixCoded); } @@ -458,7 +449,7 @@ NumericUtils.splitIntRange(new NumericUtils.IntRangeBuilder() { @Override - public final void addRange(String minPrefixCoded, String maxPrefixCoded) { + public final void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { rangeBounds.add(minPrefixCoded); rangeBounds.add(maxPrefixCoded); } @@ -470,85 +461,32 @@ // should never happen throw new IllegalArgumentException("valSize must be 32 or 64"); } - - // seek to first term - next(); - } - @Override - public float difference() { - return 1.0f; + termComp = getComparator(); } - /** this is a dummy, it is not used by this class. */ @Override - protected boolean endEnum() { - throw new UnsupportedOperationException("not implemented"); - } - - /** this is a dummy, it is not used by this class. */ - @Override - protected void setEnum(TermEnum tenum) { - throw new UnsupportedOperationException("not implemented"); - } - - /** - * Compares if current upper bound is reached. - * In contrast to {@link FilteredTermEnum}, a return value - * of false ends iterating the current enum - * and forwards to the next sub-range. - */ - @Override - protected boolean termCompare(Term term) { - return (term.field() == field && term.text().compareTo(currentUpperBound) <= 0); - } - - /** Increments the enumeration to the next element. True if one exists. 
*/ - @Override - public boolean next() throws IOException { - // if a current term exists, the actual enum is initialized: - // try change to next term, if no such term exists, fall-through - if (currentTerm != null) { - assert actualEnum != null; - if (actualEnum.next()) { - currentTerm = actualEnum.term(); - if (termCompare(currentTerm)) - return true; - } - } - - // if all above fails, we go forward to the next enum, - // if one is available - currentTerm = null; - while (rangeBounds.size() >= 2) { + protected final BytesRef nextSeekTerm(BytesRef term) throws IOException { + if (rangeBounds.size() >= 2) { assert rangeBounds.size() % 2 == 0; - // close the current enum and read next bounds - if (actualEnum != null) { - actualEnum.close(); - actualEnum = null; - } - final String lowerBound = rangeBounds.removeFirst(); + + this.currentLowerBound = rangeBounds.removeFirst(); + assert currentUpperBound == null || termComp.compare(currentUpperBound, currentLowerBound) <= 0 : + "The current upper bound must be <= the new lower bound"; + this.currentUpperBound = rangeBounds.removeFirst(); - // create a new enum - actualEnum = reader.terms(termTemplate.createTerm(lowerBound)); - currentTerm = actualEnum.term(); - if (currentTerm != null && termCompare(currentTerm)) - return true; - // clear the current term for next iteration - currentTerm = null; + return currentLowerBound; } // no more sub-range enums available - assert rangeBounds.size() == 0 && currentTerm == null; - return false; + assert rangeBounds.size() == 0; + return null; } - - /** Closes the enumeration to further activity, freeing resources. */ + @Override - public void close() throws IOException { - rangeBounds.clear(); - currentUpperBound = null; - super.close(); + protected AcceptStatus accept(BytesRef term) { + return (currentUpperBound != null && termComp.compare(term, currentUpperBound) <= 0) ? + AcceptStatus.YES : AcceptStatus.NO_AND_SEEK; } } Index: src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java =================================================================== --- src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java (working copy) @@ -18,8 +18,8 @@ */ import org.apache.lucene.index.Term; +import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermPositions; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Weight; @@ -30,6 +30,7 @@ import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.search.spans.SpanScorer; +import org.apache.lucene.util.BytesRef; import java.io.IOException; @@ -80,16 +81,15 @@ } protected class PayloadTermSpanScorer extends SpanScorer { - // TODO: is this the best way to allocate this? 
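The NumericRangeTermsEnum hunk above replaces the hand-rolled next()/termCompare() loop with the flex FilteredTermsEnum protocol: nextSeekTerm() pops the next sub-range and returns its lower bound as the seek target, while accept() answers NO_AND_SEEK once a term passes the current upper bound, which moves the enum on to the following sub-range. A minimal, self-contained sketch of that driver loop, using plain sorted strings instead of an index and a real TermsEnum (all names below are illustrative and not part of the patch):

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.NavigableSet;
import java.util.TreeSet;

/** Toy model of the seek/accept loop that FilteredTermsEnum drives for NumericRangeTermsEnum. */
public class SeekAcceptSketch {
  public static void main(String[] args) {
    // Sorted terms of a pretend field; real code walks the index's TermsEnum instead.
    NavigableSet<String> terms =
        new TreeSet<String>(Arrays.asList("0a", "0b", "0c", "1a", "1b", "2a"));

    // Pre-split sub-ranges queued as lower,upper pairs, like rangeBounds in the hunk.
    Deque<String> rangeBounds =
        new ArrayDeque<String>(Arrays.asList("0a", "0b", "1b", "2a"));

    while (rangeBounds.size() >= 2) {
      // nextSeekTerm(): take the bounds of the next sub-range, seek to its lower end.
      String lower = rangeBounds.removeFirst();
      String upper = rangeBounds.removeFirst();

      for (String term : terms.tailSet(lower, true)) {  // "seek", then scan forward
        if (term.compareTo(upper) <= 0) {                // accept(): YES
          System.out.println("match: " + term);
        } else {                                         // accept(): NO_AND_SEEK
          break;                                         // continue with the next sub-range
        }
      }
    }
  }
}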
- protected byte[] payload = new byte[256]; - protected TermPositions positions; + protected BytesRef payload; protected float payloadScore; protected int payloadsSeen; + private final TermSpans termSpans; public PayloadTermSpanScorer(TermSpans spans, Weight weight, Similarity similarity, byte[] norms) throws IOException { super(spans, weight, similarity, norms); - positions = spans.getPositions(); + termSpans = spans; } @Override @@ -115,12 +115,24 @@ } protected void processPayload(Similarity similarity) throws IOException { - if (positions.isPayloadAvailable()) { - payload = positions.getPayload(payload, 0); - payloadScore = function.currentScore(doc, term.field(), - spans.start(), spans.end(), payloadsSeen, payloadScore, - similarity.scorePayload(doc, term.field(), spans.start(), spans - .end(), payload, 0, positions.getPayloadLength())); + final DocsAndPositionsEnum postings = termSpans.getPostings(); + if (postings.hasPayload()) { + payload = postings.getPayload(); + if (payload != null) { + payloadScore = function.currentScore(doc, term.field(), + spans.start(), spans.end(), payloadsSeen, payloadScore, + similarity.scorePayload(doc, term.field(), spans.start(), + spans.end(), payload.bytes, + payload.offset, + payload.length)); + } else { + payloadScore = function.currentScore(doc, term.field(), + spans.start(), spans.end(), payloadsSeen, payloadScore, + similarity.scorePayload(doc, term.field(), spans.start(), + spans.end(), null, + 0, + 0)); + } payloadsSeen++; } else { Index: src/java/org/apache/lucene/search/PhrasePositions.java =================================================================== --- src/java/org/apache/lucene/search/PhrasePositions.java (revision 931099) +++ src/java/org/apache/lucene/search/PhrasePositions.java (working copy) @@ -28,40 +28,33 @@ int position; // position in doc int count; // remaining pos in this doc int offset; // position in phrase - TermPositions tp; // stream of positions - PhrasePositions next; // used to make lists + final DocsAndPositionsEnum postings; // stream of docs & positions + PhrasePositions next; // used to make lists boolean repeats; // there's other pp for same term (e.g. 
query="1st word 2nd word"~1) - PhrasePositions(TermPositions t, int o) { - tp = t; + PhrasePositions(DocsAndPositionsEnum postings, int o) { + this.postings = postings; offset = o; } final boolean next() throws IOException { // increments to next doc - if (!tp.next()) { - tp.close(); // close stream - doc = Integer.MAX_VALUE; // sentinel value + doc = postings.nextDoc(); + if (doc == postings.NO_MORE_DOCS) { return false; } - doc = tp.doc(); - position = 0; return true; } final boolean skipTo(int target) throws IOException { - if (!tp.skipTo(target)) { - tp.close(); // close stream - doc = Integer.MAX_VALUE; // sentinel value + doc = postings.advance(target); + if (doc == postings.NO_MORE_DOCS) { return false; } - doc = tp.doc(); - position = 0; return true; } - final void firstPosition() throws IOException { - count = tp.freq(); // read first pos + count = postings.freq(); // read first pos nextPosition(); } @@ -73,7 +66,7 @@ */ final boolean nextPosition() throws IOException { if (count-- > 0) { // read subsequent pos's - position = tp.nextPosition() - offset; + position = postings.nextPosition() - offset; return true; } else return false; Index: src/java/org/apache/lucene/search/PhraseQuery.java =================================================================== --- src/java/org/apache/lucene/search/PhraseQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/PhraseQuery.java (working copy) @@ -22,10 +22,13 @@ import java.util.ArrayList; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.Bits; /** A Query that matches documents containing a particular sequence of terms. * A PhraseQuery is built by QueryParser for input like "new york". @@ -150,20 +153,35 @@ if (terms.size() == 0) // optimize zero-term case return null; - TermPositions[] tps = new TermPositions[terms.size()]; + DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[terms.size()]; + final Bits delDocs = MultiFields.getDeletedDocs(reader); for (int i = 0; i < terms.size(); i++) { - TermPositions p = reader.termPositions(terms.get(i)); - if (p == null) - return null; - tps[i] = p; + final Term t = terms.get(i); + final BytesRef text = new BytesRef(t.text()); + DocsAndPositionsEnum postingsEnum = MultiFields.getTermPositionsEnum(reader, + delDocs, + t.field(), + text); + // PhraseQuery on a field that did not index + // positions. 
+ if (postingsEnum == null) { + if (MultiFields.getTermDocsEnum(reader, delDocs, t.field(), text) != null) { + // term does exist, but has no positions + throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); + } else { + // term does not exist + return null; + } + } + postings[i] = postingsEnum; } if (slop == 0) // optimize exact case - return new ExactPhraseScorer(this, tps, getPositions(), similarity, + return new ExactPhraseScorer(this, postings, getPositions(), similarity, reader.norms(field)); else return - new SloppyPhraseScorer(this, tps, getPositions(), similarity, slop, + new SloppyPhraseScorer(this, postings, getPositions(), similarity, slop, reader.norms(field)); } Index: src/java/org/apache/lucene/search/PhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/PhraseScorer.java (revision 931099) +++ src/java/org/apache/lucene/search/PhraseScorer.java (working copy) @@ -19,7 +19,7 @@ import java.io.IOException; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsAndPositionsEnum; /** Expert: Scoring functionality for phrase queries. *
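For context on the slop == 0 branch above (ExactPhraseScorer versus SloppyPhraseScorer): the choice is driven entirely by PhraseQuery.setSlop(). A tiny usage sketch with placeholder field and terms:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;

public class PhraseSlopSketch {
  static Query exactPhrase() {
    PhraseQuery q = new PhraseQuery();  // slop defaults to 0, so the exact scorer is used
    q.add(new Term("body", "new"));
    q.add(new Term("body", "york"));
    return q;
  }

  static Query sloppyPhrase() {
    PhraseQuery q = new PhraseQuery();
    q.add(new Term("body", "new"));
    q.add(new Term("body", "york"));
    q.setSlop(1);                       // any slop > 0 selects the sloppy scorer
    return q;
  }
}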
    A document is considered matching if it contains the phrase-query terms @@ -43,7 +43,7 @@ private float freq; //phrase frequency in current doc as computed by phraseFreq(). - PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, + PhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets, Similarity similarity, byte[] norms) { super(similarity); this.norms = norms; @@ -55,8 +55,8 @@ // reflects the phrase offset: pp.pos = tp.pos - offset. // this allows to easily identify a matching (exact) phrase // when all PhrasePositions have exactly the same position. - for (int i = 0; i < tps.length; i++) { - PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]); + for (int i = 0; i < postings.length; i++) { + PhrasePositions pp = new PhrasePositions(postings[i], offsets[i]); if (last != null) { // add next to end of list last.next = pp; } else { @@ -65,7 +65,7 @@ last = pp; } - pq = new PhraseQueue(tps.length); // construct empty pq + pq = new PhraseQueue(postings.length); // construct empty pq first.doc = -1; } Index: src/java/org/apache/lucene/search/PrefixQuery.java =================================================================== --- src/java/org/apache/lucene/search/PrefixQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/PrefixQuery.java (working copy) @@ -20,7 +20,10 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing terms with a specified prefix. A PrefixQuery @@ -34,23 +37,34 @@ /** Constructs a query for terms starting with prefix. */ public PrefixQuery(Term prefix) { + super(prefix.field()); this.prefix = prefix; } /** Returns the prefix of this query. */ public Term getPrefix() { return prefix; } - @Override + @Override @Deprecated protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new PrefixTermEnum(reader, prefix); } + + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + if (prefix.text().length() == 0) { + // no prefix -- match all terms for this field: + final Terms terms = MultiFields.getTerms(reader, getField()); + return (terms != null) ? terms.iterator() : TermsEnum.EMPTY; + } + return new PrefixTermsEnum(reader, prefix); + } /** Prints a user-readable version of this query. */ @Override public String toString(String field) { StringBuilder buffer = new StringBuilder(); - if (!prefix.field().equals(field)) { - buffer.append(prefix.field()); + if (!getField().equals(field)) { + buffer.append(getField()); buffer.append(":"); } buffer.append(prefix.text()); Index: src/java/org/apache/lucene/search/PrefixTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/PrefixTermEnum.java (revision 931099) +++ src/java/org/apache/lucene/search/PrefixTermEnum.java (working copy) @@ -29,7 +29,9 @@ * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. * + * @deprecated Use {@link PrefixTermsEnum} instead. 
*/ +@Deprecated public class PrefixTermEnum extends FilteredTermEnum { private final Term prefix; Index: src/java/org/apache/lucene/search/Similarity.java =================================================================== --- src/java/org/apache/lucene/search/Similarity.java (revision 931099) +++ src/java/org/apache/lucene/search/Similarity.java (working copy) @@ -857,6 +857,7 @@ * @return An implementation dependent float to be used as a scoring factor * */ + // TODO: maybe switch this API to BytesRef? public float scorePayload(int docId, String fieldName, int start, int end, byte [] payload, int offset, int length) { return 1; Index: src/java/org/apache/lucene/search/SingleTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/SingleTermEnum.java (revision 931099) +++ src/java/org/apache/lucene/search/SingleTermEnum.java (working copy) @@ -29,6 +29,7 @@ * but want to preserve MultiTermQuery semantics such as * {@link MultiTermQuery#rewriteMethod}. */ +@Deprecated public class SingleTermEnum extends FilteredTermEnum { private Term singleTerm; private boolean endEnum = false; Index: src/java/org/apache/lucene/search/SloppyPhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/SloppyPhraseScorer.java (revision 931099) +++ src/java/org/apache/lucene/search/SloppyPhraseScorer.java (working copy) @@ -17,7 +17,7 @@ * limitations under the License. */ -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsAndPositionsEnum; import java.io.IOException; import java.util.HashMap; @@ -28,9 +28,9 @@ private PhrasePositions tmpPos[]; // for flipping repeating pps. private boolean checkedRepeats; - SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, + SloppyPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets, Similarity similarity, int slop, byte[] norms) { - super(weight, tps, offsets, similarity, norms); + super(weight, postings, offsets, similarity, norms); this.slop = slop; } Index: src/java/org/apache/lucene/search/spans/Spans.java =================================================================== --- src/java/org/apache/lucene/search/spans/Spans.java (revision 931099) +++ src/java/org/apache/lucene/search/spans/Spans.java (working copy) @@ -83,5 +83,4 @@ * @return true if there is a payload available at this position that can be loaded */ public abstract boolean isPayloadAvailable(); - } Index: src/java/org/apache/lucene/search/spans/SpanTermQuery.java =================================================================== --- src/java/org/apache/lucene/search/spans/SpanTermQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/spans/SpanTermQuery.java (working copy) @@ -19,6 +19,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; @@ -39,7 +42,7 @@ @Override public void extractTerms(Set terms) { - terms.add(term); + terms.add(term); } @Override @@ -80,7 +83,24 @@ @Override public Spans getSpans(final IndexReader reader) throws IOException { - return new TermSpans(reader.termPositions(term), term); + // NOTE: debateably, the caller should never pass in a + // multi reader... 
+ final BytesRef textBytes = new BytesRef(term.text()); + final DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(reader, + MultiFields.getDeletedDocs(reader), + term.field(), + textBytes); + + if (postings != null) { + return new TermSpans(postings, term); + } else { + if (MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), textBytes) != null) { + // term does exist, but has no positions + throw new IllegalStateException("field \"" + term.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run SpanTermQuery (term=" + term.text() + ")"); + } else { + // term does not exist + return TermSpans.EMPTY_TERM_SPANS; + } + } } - } Index: src/java/org/apache/lucene/search/spans/TermSpans.java =================================================================== --- src/java/org/apache/lucene/search/spans/TermSpans.java (revision 931099) +++ src/java/org/apache/lucene/search/spans/TermSpans.java (working copy) @@ -17,7 +17,9 @@ import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.BytesRef; import java.io.IOException; import java.util.Collections; @@ -28,49 +30,53 @@ * Public for extension only */ public class TermSpans extends Spans { - protected TermPositions positions; - protected Term term; + protected final DocsAndPositionsEnum postings; + protected final Term term; protected int doc; protected int freq; protected int count; protected int position; - - public TermSpans(TermPositions positions, Term term) throws IOException { - - this.positions = positions; + public TermSpans(DocsAndPositionsEnum postings, Term term) throws IOException { + this.postings = postings; this.term = term; doc = -1; } + // only for EmptyTermSpans (below) + TermSpans() { + term = null; + postings = null; + } + @Override public boolean next() throws IOException { if (count == freq) { - if (!positions.next()) { - doc = Integer.MAX_VALUE; + if (postings == null) { return false; } - doc = positions.doc(); - freq = positions.freq(); + doc = postings.nextDoc(); + if (doc == DocsAndPositionsEnum.NO_MORE_DOCS) { + return false; + } + freq = postings.freq(); count = 0; } - position = positions.nextPosition(); + position = postings.nextPosition(); count++; return true; } @Override public boolean skipTo(int target) throws IOException { - if (!positions.skipTo(target)) { - doc = Integer.MAX_VALUE; + doc = postings.advance(target); + if (doc == DocsAndPositionsEnum.NO_MORE_DOCS) { return false; } - doc = positions.doc(); - freq = positions.freq(); + freq = postings.freq(); count = 0; - - position = positions.nextPosition(); + position = postings.nextPosition(); count++; return true; @@ -94,15 +100,21 @@ // TODO: Remove warning after API has been finalized @Override public Collection getPayload() throws IOException { - byte [] bytes = new byte[positions.getPayloadLength()]; - bytes = positions.getPayload(bytes, 0); + final BytesRef payload = postings.getPayload(); + final byte[] bytes; + if (payload != null) { + bytes = new byte[payload.length]; + System.arraycopy(payload.bytes, payload.offset, bytes, 0, payload.length); + } else { + bytes = null; + } return Collections.singletonList(bytes); } // TODO: Remove warning after API has been finalized @Override public boolean isPayloadAvailable() { - return positions.isPayloadAvailable(); + return postings.hasPayload(); } @Override @@ -111,8 +123,47 @@ (doc 
== -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position); } + public DocsAndPositionsEnum getPostings() { + return postings; + } - public TermPositions getPositions() { - return positions; + private static final class EmptyTermSpans extends TermSpans { + + @Override + public boolean next() { + return false; + } + + @Override + public boolean skipTo(int target) { + return false; + } + + @Override + public int doc() { + return DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public int start() { + return -1; + } + + @Override + public int end() { + return -1; + } + + @Override + public Collection getPayload() { + return null; + } + + @Override + public boolean isPayloadAvailable() { + return false; + } } + + public static final TermSpans EMPTY_TERM_SPANS = new EmptyTermSpans(); } Index: src/java/org/apache/lucene/search/TermQuery.java =================================================================== --- src/java/org/apache/lucene/search/TermQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/TermQuery.java (working copy) @@ -20,8 +20,10 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.ToStringUtils; @@ -71,12 +73,14 @@ @Override public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { - TermDocs termDocs = reader.termDocs(term); - - if (termDocs == null) + // NOTE: debateably, the caller should never pass in a + // multi reader... 
+ DocsEnum docs = MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), new BytesRef(term.text())); + if (docs == null) { return null; + } - return new TermScorer(this, termDocs, similarity, reader.norms(term.field())); + return new TermScorer(this, docs, similarity, reader.norms(term.field())); } @Override @@ -114,15 +118,12 @@ Explanation tfExplanation = new Explanation(); int tf = 0; - TermDocs termDocs = reader.termDocs(term); - if (termDocs != null) { - try { - if (termDocs.skipTo(doc) && termDocs.doc() == doc) { - tf = termDocs.freq(); + DocsEnum docs = reader.termDocsEnum(MultiFields.getDeletedDocs(reader), term.field(), new BytesRef(term.text())); + if (docs != null) { + int newDoc = docs.advance(doc); + if (newDoc == doc) { + tf = docs.freq(); } - } finally { - termDocs.close(); - } tfExplanation.setValue(similarity.tf(tf)); tfExplanation.setDescription("tf(termFreq("+term+")="+tf+")"); } else { Index: src/java/org/apache/lucene/search/TermRangeFilter.java =================================================================== --- src/java/org/apache/lucene/search/TermRangeFilter.java (revision 931099) +++ src/java/org/apache/lucene/search/TermRangeFilter.java (working copy) @@ -87,9 +87,6 @@ public static TermRangeFilter More(String fieldName, String lowerTerm) { return new TermRangeFilter(fieldName, lowerTerm, null, true, false); } - - /** Returns the field name for this filter */ - public String getField() { return query.getField(); } /** Returns the lower value of this range filter */ public String getLowerTerm() { return query.getLowerTerm(); } Index: src/java/org/apache/lucene/search/TermRangeQuery.java =================================================================== --- src/java/org/apache/lucene/search/TermRangeQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/TermRangeQuery.java (working copy) @@ -21,6 +21,9 @@ import java.text.Collator; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; /** @@ -41,7 +44,6 @@ private String lowerTerm; private String upperTerm; private Collator collator; - private String field; private boolean includeLower; private boolean includeUpper; @@ -104,7 +106,7 @@ */ public TermRangeQuery(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper, Collator collator) { - this.field = field; + super(field); this.lowerTerm = lowerTerm; this.upperTerm = upperTerm; this.includeLower = includeLower; @@ -112,9 +114,6 @@ this.collator = collator; } - /** Returns the field name for this query */ - public String getField() { return field; } - /** Returns the lower value of this range query */ public String getLowerTerm() { return lowerTerm; } @@ -130,12 +129,33 @@ /** Returns the collator used to determine range inclusion, if any. 
*/ public Collator getCollator() { return collator; } - @Override + @Override @Deprecated protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new TermRangeTermEnum(reader, field, lowerTerm, upperTerm, includeLower, includeUpper, collator); } + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + if (collator == null && lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) { + return TermsEnum.EMPTY; + } + if ((lowerTerm == null || (collator == null && includeLower && "".equals(lowerTerm))) && upperTerm == null) { + // NOTE: debateably, the caller should never pass in a + // multi reader... + final Terms terms = MultiFields.getTerms(reader, field); + return (terms != null) ? terms.iterator() : null; + } + return new TermRangeTermsEnum(reader, field, + lowerTerm, upperTerm, includeLower, includeUpper, collator); + } + + /** @deprecated */ + @Deprecated + public String field() { + return getField(); + } + /** Prints a user-readable version of this query. */ @Override public String toString(String field) { @@ -158,7 +178,6 @@ final int prime = 31; int result = super.hashCode(); result = prime * result + ((collator == null) ? 0 : collator.hashCode()); - result = prime * result + ((field == null) ? 0 : field.hashCode()); result = prime * result + (includeLower ? 1231 : 1237); result = prime * result + (includeUpper ? 1231 : 1237); result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.hashCode()); @@ -180,11 +199,6 @@ return false; } else if (!collator.equals(other.collator)) return false; - if (field == null) { - if (other.field != null) - return false; - } else if (!field.equals(other.field)) - return false; if (includeLower != other.includeLower) return false; if (includeUpper != other.includeUpper) Index: src/java/org/apache/lucene/search/TermRangeTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/TermRangeTermEnum.java (revision 931099) +++ src/java/org/apache/lucene/search/TermRangeTermEnum.java (working copy) @@ -31,7 +31,9 @@ * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. * @since 2.9 + * @deprecated Please switch to {@link TermRangeTermsEnum} */ +@Deprecated public class TermRangeTermEnum extends FilteredTermEnum { private Collator collator = null; Index: src/java/org/apache/lucene/search/TermScorer.java =================================================================== --- src/java/org/apache/lucene/search/TermScorer.java (revision 931099) +++ src/java/org/apache/lucene/search/TermScorer.java (working copy) @@ -19,25 +19,26 @@ import java.io.IOException; -import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.DocsEnum; /** Expert: A Scorer for documents matching a Term. */ final class TermScorer extends Scorer { - private Weight weight; - private TermDocs termDocs; + private DocsEnum docsEnum; private byte[] norms; private float weightValue; private int doc = -1; + private int freq; - private final int[] docs = new int[32]; // buffered doc numbers - private final int[] freqs = new int[32]; // buffered term freqs private int pointer; private int pointerMax; private static final int SCORE_CACHE_SIZE = 32; private float[] scoreCache = new float[SCORE_CACHE_SIZE]; + private int[] docs; + private int[] freqs; + private final DocsEnum.BulkReadResult bulkResult; /** * Construct a TermScorer. 
@@ -52,13 +53,14 @@ * @param norms * The field norms of the document fields for the Term. */ - TermScorer(Weight weight, TermDocs td, Similarity similarity, byte[] norms) { + TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { super(similarity); this.weight = weight; - this.termDocs = td; + this.docsEnum = td; this.norms = norms; this.weightValue = weight.getValue(); + bulkResult = td.getBulkResult(); for (int i = 0; i < SCORE_CACHE_SIZE; i++) scoreCache[i] = getSimilarity().tf(i) * weightValue; @@ -69,62 +71,69 @@ score(c, Integer.MAX_VALUE, nextDoc()); } + private final void refillBuffer() throws IOException { + pointerMax = docsEnum.read(); // refill + docs = bulkResult.docs.ints; + freqs = bulkResult.freqs.ints; + } + // firstDocID is ignored since nextDoc() sets 'doc' @Override protected boolean score(Collector c, int end, int firstDocID) throws IOException { c.setScorer(this); while (doc < end) { // for docs in window c.collect(doc); // collect score - if (++pointer >= pointerMax) { - pointerMax = termDocs.read(docs, freqs); // refill buffers + refillBuffer(); if (pointerMax != 0) { pointer = 0; } else { - termDocs.close(); // close stream - doc = Integer.MAX_VALUE; // set to sentinel value + doc = NO_MORE_DOCS; // set to sentinel value return false; } } doc = docs[pointer]; + freq = freqs[pointer]; } return true; } @Override - public int docID() { return doc; } + public int docID() { + return doc; + } /** * Advances to the next document matching the query.
    * The iterator over the matching documents is buffered using * {@link TermDocs#read(int[],int[])}. * - * @return the document matching the query or -1 if there are no more documents. + * @return the document matching the query or NO_MORE_DOCS if there are no more documents. */ @Override public int nextDoc() throws IOException { pointer++; if (pointer >= pointerMax) { - pointerMax = termDocs.read(docs, freqs); // refill buffer + refillBuffer(); if (pointerMax != 0) { pointer = 0; } else { - termDocs.close(); // close stream return doc = NO_MORE_DOCS; } } doc = docs[pointer]; + freq = freqs[pointer]; + assert doc != NO_MORE_DOCS; return doc; } @Override public float score() { - assert doc != -1; - int f = freqs[pointer]; + assert doc != NO_MORE_DOCS; float raw = // compute tf(f)*weight - f < SCORE_CACHE_SIZE // check cache - ? scoreCache[f] // cache hit - : getSimilarity().tf(f)*weightValue; // cache miss + freq < SCORE_CACHE_SIZE // check cache + ? scoreCache[freq] // cache hit + : getSimilarity().tf(freq)*weightValue; // cache miss return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[doc]); // normalize for field } @@ -132,34 +141,34 @@ /** * Advances to the first match beyond the current whose document number is * greater than or equal to a given target.
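The TermScorer hunk above moves from TermDocs.read(int[],int[]) to DocsEnum's bulk API: getBulkResult() exposes reusable doc/freq buffers and read() refills them, returning how many entries are valid (0 when the enum is exhausted). A rough sketch of that refill loop, using only the calls visible in the hunk; the method name and totals are placeholders:

import java.io.IOException;

import org.apache.lucene.index.DocsEnum;

public class BulkReadSketch {
  /** Sums term frequency over all matching documents via the bulk buffers. */
  static long totalFreq(DocsEnum docsEnum) throws IOException {
    DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
    long total = 0;
    int count;
    while ((count = docsEnum.read()) != 0) {  // refill; 0 means no more documents
      int[] freqs = bulk.freqs.ints;          // re-read the array, buffers may be regrown
      for (int i = 0; i < count; i++) {
        total += freqs[i];
      }
    }
    return total;
  }
}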
    - * The implementation uses {@link TermDocs#skipTo(int)}. + * The implementation uses {@link DocsEnum#advance(int)}. * * @param target * The target document number. - * @return the matching document or -1 if none exist. + * @return the matching document or NO_MORE_DOCS if none exist. */ @Override public int advance(int target) throws IOException { // first scan in cache for (pointer++; pointer < pointerMax; pointer++) { if (docs[pointer] >= target) { + freq = freqs[pointer]; return doc = docs[pointer]; } } - // not found in cache, seek underlying stream - boolean result = termDocs.skipTo(target); - if (result) { - pointerMax = 1; - pointer = 0; - docs[pointer] = doc = termDocs.doc(); - freqs[pointer] = termDocs.freq(); + // not found in readahead cache, seek underlying stream + int newDoc = docsEnum.advance(target); + //System.out.println("ts.advance docsEnum=" + docsEnum); + if (newDoc != DocsEnum.NO_MORE_DOCS) { + doc = newDoc; + freq = docsEnum.freq(); } else { doc = NO_MORE_DOCS; } return doc; } - + /** Returns a string representation of this TermScorer. */ @Override public String toString() { return "scorer(" + weight + ")"; } Index: src/java/org/apache/lucene/search/WildcardQuery.java =================================================================== --- src/java/org/apache/lucene/search/WildcardQuery.java (revision 931099) +++ src/java/org/apache/lucene/search/WildcardQuery.java (working copy) @@ -19,101 +19,95 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; /** Implements the wildcard search query. Supported wildcards are *, which * matches any character sequence (including the empty one), and ?, * which matches any single character. Note this query can be slow, as it * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries, - * a Wildcard term should not start with one of the wildcards * or - * ?. + * a Wildcard term should not start with the wildcard * * *
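The WildcardQuery hunk that follows drops WildcardTermEnum in favour of AutomatonQuery: toAutomaton() concatenates one small automaton per pattern character. A sketch of the same translation for one hard-coded pattern ("te?t*" is only an example), using just the automaton calls visible in the hunk:

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;

public class WildcardAutomatonSketch {
  /** Builds an automaton accepting the same strings as the wildcard pattern te?t*. */
  static Automaton tePattern() {
    List<Automaton> parts = new ArrayList<Automaton>();
    parts.add(BasicAutomata.makeChar('t'));
    parts.add(BasicAutomata.makeChar('e'));
    parts.add(BasicAutomata.makeAnyChar());    // '?' matches exactly one character
    parts.add(BasicAutomata.makeChar('t'));
    parts.add(BasicAutomata.makeAnyString());  // '*' matches any suffix, including empty
    return BasicOperations.concatenate(parts);
  }
}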
    This query uses the {@link * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * rewrite method. * - * @see WildcardTermEnum */ -public class WildcardQuery extends MultiTermQuery { - private boolean termContainsWildcard; - private boolean termIsPrefix; - protected Term term; - + * @see AutomatonQuery + */ +public class WildcardQuery extends AutomatonQuery { + /** String equality with support for wildcards */ + public static final char WILDCARD_STRING = '*'; + + /** Char equality with support for wildcards */ + public static final char WILDCARD_CHAR = '?'; + + /** + * Constructs a query for terms matching term. + */ public WildcardQuery(Term term) { - this.term = term; - String text = term.text(); - this.termContainsWildcard = (text.indexOf('*') != -1) - || (text.indexOf('?') != -1); - this.termIsPrefix = termContainsWildcard - && (text.indexOf('?') == -1) - && (text.indexOf('*') == text.length() - 1); + super(term, toAutomaton(term)); } - - @Override + + /** + * Convert Lucene wildcard syntax into an automaton. + */ + static Automaton toAutomaton(Term wildcardquery) { + List automata = new ArrayList(); + + String wildcardText = wildcardquery.text(); + + for (int i = 0; i < wildcardText.length(); i++) { + final char c = wildcardText.charAt(i); + switch(c) { + case WILDCARD_STRING: + automata.add(BasicAutomata.makeAnyString()); + break; + case WILDCARD_CHAR: + automata.add(BasicAutomata.makeAnyChar()); + break; + default: + automata.add(BasicAutomata.makeChar(c)); + } + } + + return BasicOperations.concatenate(automata); + } + + @Override @Deprecated protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { - if (termContainsWildcard) - return new WildcardTermEnum(reader, getTerm()); - else - return new SingleTermEnum(reader, getTerm()); + return new WildcardTermEnum(reader, term); } + // we override this method, else backwards layer in MTQ will prefer getEnum! + @Override + protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return super.getTermsEnum(reader); + } + /** * Returns the pattern term. */ public Term getTerm() { return term; } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - if (termIsPrefix) { - MultiTermQuery rewritten = new PrefixQuery(term.createTerm(term.text() - .substring(0, term.text().indexOf('*')))); - rewritten.setBoost(getBoost()); - rewritten.setRewriteMethod(getRewriteMethod()); - return rewritten; - } else { - return super.rewrite(reader); - } - } /** Prints a user-readable version of this query. */ @Override public String toString(String field) { StringBuilder buffer = new StringBuilder(); - if (!term.field().equals(field)) { - buffer.append(term.field()); + if (!getField().equals(field)) { + buffer.append(getField()); buffer.append(":"); } buffer.append(term.text()); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } - - @Override - public int hashCode() { - final int prime = 31; - int result = super.hashCode(); - result = prime * result + ((term == null) ? 
0 : term.hashCode()); - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (!super.equals(obj)) - return false; - if (getClass() != obj.getClass()) - return false; - WildcardQuery other = (WildcardQuery) obj; - if (term == null) { - if (other.term != null) - return false; - } else if (!term.equals(other.term)) - return false; - return true; - } - } Index: src/java/org/apache/lucene/search/WildcardTermEnum.java =================================================================== --- src/java/org/apache/lucene/search/WildcardTermEnum.java (revision 931099) +++ src/java/org/apache/lucene/search/WildcardTermEnum.java (working copy) @@ -28,7 +28,9 @@ *
    * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. + * @deprecated Please use {@link AutomatonTermsEnum} instead. */ +@Deprecated public class WildcardTermEnum extends FilteredTermEnum { final Term searchTerm; final String field; @@ -91,8 +93,8 @@ * String equality with support for wildcards ********************************************/ - public static final char WILDCARD_STRING = '*'; - public static final char WILDCARD_CHAR = '?'; + public static final char WILDCARD_STRING = WildcardQuery.WILDCARD_STRING; + public static final char WILDCARD_CHAR = WildcardQuery.WILDCARD_CHAR; /** * Determines if a word matches a wildcard pattern. Index: src/java/org/apache/lucene/store/Directory.java =================================================================== --- src/java/org/apache/lucene/store/Directory.java (revision 931099) +++ src/java/org/apache/lucene/store/Directory.java (working copy) @@ -19,15 +19,9 @@ import java.io.IOException; import java.io.Closeable; -import java.util.Collection; -import java.util.Collections; +import java.util.Collection; // for javadocs -import java.util.ArrayList; -import static java.util.Arrays.asList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import org.apache.lucene.index.IndexFileNameFilter; +import java.util.Arrays; import org.apache.lucene.util.IOUtils; /** A Directory is a flat list of files. Files may be written once, when they @@ -200,14 +194,7 @@ * @param to destination directory */ public final void copyTo(Directory to) throws IOException { - List filenames = new ArrayList(); - IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); - - for (String name : listAll()) - if (filter.accept(null, name)) - filenames.add(name); - - copyTo(to, filenames); + copyTo(to, Arrays.asList(listAll())); } /** Index: src/java/org/apache/lucene/store/IndexInput.java =================================================================== --- src/java/org/apache/lucene/store/IndexInput.java (revision 931099) +++ src/java/org/apache/lucene/store/IndexInput.java (working copy) @@ -17,187 +17,14 @@ * limitations under the License. */ -import java.io.IOException; import java.io.Closeable; -import java.util.Map; -import java.util.HashMap; +import java.io.IOException; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; - /** Abstract base class for input from a file in a {@link Directory}. A * random-access input stream. Used for all Lucene index input operations. * @see Directory */ -public abstract class IndexInput implements Cloneable,Closeable { - private byte[] bytes; // used by readString() - private char[] chars; // used by readModifiedUTF8String() - private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format - - /** Reads and returns a single byte. - * @see IndexOutput#writeByte(byte) - */ - public abstract byte readByte() throws IOException; - - /** Reads a specified number of bytes into an array at the specified offset. 
- * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @see IndexOutput#writeBytes(byte[],int) - */ - public abstract void readBytes(byte[] b, int offset, int len) - throws IOException; - - /** Reads a specified number of bytes into an array at the - * specified offset with control over whether the read - * should be buffered (callers who have their own buffer - * should pass in "false" for useBuffer). Currently only - * {@link BufferedIndexInput} respects this parameter. - * @param b the array to read bytes into - * @param offset the offset in the array to start storing bytes - * @param len the number of bytes to read - * @param useBuffer set to false if the caller will handle - * buffering. - * @see IndexOutput#writeBytes(byte[],int) - */ - public void readBytes(byte[] b, int offset, int len, boolean useBuffer) - throws IOException - { - // Default to ignoring useBuffer entirely - readBytes(b, offset, len); - } - - /** Reads four bytes and returns an int. - * @see IndexOutput#writeInt(int) - */ - public int readInt() throws IOException { - return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) - | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); - } - - /** Reads an int stored in variable-length format. Reads between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexOutput#writeVInt(int) - */ - public int readVInt() throws IOException { - byte b = readByte(); - int i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7F) << shift; - } - return i; - } - - /** Reads eight bytes and returns a long. - * @see IndexOutput#writeLong(long) - */ - public long readLong() throws IOException { - return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); - } - - /** Reads a long stored in variable-length format. Reads between one and - * nine bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. */ - public long readVLong() throws IOException { - byte b = readByte(); - long i = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = readByte(); - i |= (b & 0x7FL) << shift; - } - return i; - } - - /** Call this if readString should read characters stored - * in the old modified UTF8 format (length in java chars - * and java's modified UTF8 encoding). This is used for - * indices written pre-2.4 See LUCENE-510 for details. */ - public void setModifiedUTF8StringsMode() { - preUTF8Strings = true; - } - - /** Reads a string. - * @see IndexOutput#writeString(String) - */ - public String readString() throws IOException { - if (preUTF8Strings) - return readModifiedUTF8String(); - int length = readVInt(); - if (bytes == null || length > bytes.length) { - bytes = new byte[ArrayUtil.oversize(length, 1)]; - } - readBytes(bytes, 0, length); - return new String(bytes, 0, length, "UTF-8"); - } - - private String readModifiedUTF8String() throws IOException { - int length = readVInt(); - if (chars == null || length > chars.length) { - chars = new char[ArrayUtil.oversize(length, RamUsageEstimator.NUM_BYTES_CHAR)]; - } - readChars(chars, 0, length); - return new String(chars, 0, length); - } - - /** Reads Lucene's old "modified UTF-8" encoded - * characters into an array. 
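The readVInt()/readVLong() bodies removed above (that logic now lives in DataInput) implement Lucene's variable-length integer format: seven payload bits per byte, with the high bit marking a continuation byte. A self-contained sketch of the same encoding and decoding, independent of Lucene:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

public class VIntSketch {
  static void writeVInt(ByteArrayOutputStream out, int i) {
    while ((i & ~0x7F) != 0) {       // more than seven bits remain
      out.write((i & 0x7F) | 0x80);  // low seven bits, continuation flag set
      i >>>= 7;
    }
    out.write(i);                    // final byte, high bit clear
  }

  static int readVInt(ByteArrayInputStream in) {
    int b = in.read();
    int value = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = in.read();
      value |= (b & 0x7F) << shift;
    }
    return value;
  }

  public static void main(String[] args) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    writeVInt(out, 300);             // encodes as two bytes: 0xAC 0x02
    System.out.println(readVInt(new ByteArrayInputStream(out.toByteArray())));  // prints 300
  }
}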
- * @param buffer the array to read characters into - * @param start the offset in the array to start storing characters - * @param length the number of characters to read - * @see IndexOutput#writeChars(String,int,int) - * @deprecated -- please use readString or readBytes - * instead, and construct the string - * from those utf8 bytes - */ - @Deprecated - public void readChars(char[] buffer, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - byte b = readByte(); - if ((b & 0x80) == 0) - buffer[i] = (char)(b & 0x7F); - else if ((b & 0xE0) != 0xE0) { - buffer[i] = (char)(((b & 0x1F) << 6) - | (readByte() & 0x3F)); - } else { - buffer[i] = (char)(((b & 0x0F) << 12) - | ((readByte() & 0x3F) << 6) - | (readByte() & 0x3F)); - } - } - } - - /** - * Expert - * - * Similar to {@link #readChars(char[], int, int)} but does not do any conversion operations on the bytes it is reading in. It still - * has to invoke {@link #readByte()} just as {@link #readChars(char[], int, int)} does, but it does not need a buffer to store anything - * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine - * how many more bytes to read - * @param length The number of chars to read - * @deprecated this method operates on old "modified utf8" encoded - * strings - */ - @Deprecated - public void skipChars(int length) throws IOException{ - for (int i = 0; i < length; i++) { - byte b = readByte(); - if ((b & 0x80) == 0){ - //do nothing, we only need one byte - } else if ((b & 0xE0) != 0xE0) { - readByte();//read an additional byte - } else { - //read two additional bytes. - readByte(); - readByte(); - } - } - } - - +public abstract class IndexInput extends DataInput implements Cloneable,Closeable { /** Closes the stream to further operations. */ public abstract void close() throws IOException; @@ -214,38 +41,4 @@ /** The number of bytes in the file. */ public abstract long length(); - - /** Returns a clone of this stream. - * - *
    Clones of a stream access the same data, and are positioned at the same - * point as the stream they were cloned from. - * - *
    Expert: Subclasses must ensure that clones may be positioned at - * different points in the input from each other and from the stream they - * were cloned from. - */ - @Override - public Object clone() { - IndexInput clone = null; - try { - clone = (IndexInput)super.clone(); - } catch (CloneNotSupportedException e) {} - - clone.bytes = null; - clone.chars = null; - - return clone; - } - - public Map readStringStringMap() throws IOException { - final Map map = new HashMap(); - final int count = readInt(); - for(int i=0;i> 24)); - writeByte((byte)(i >> 16)); - writeByte((byte)(i >> 8)); - writeByte((byte) i); - } - - /** Writes an int in a variable-length format. Writes between one and - * five bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVInt() - */ - public void writeVInt(int i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a long as eight bytes. - * @see IndexInput#readLong() - */ - public void writeLong(long i) throws IOException { - writeInt((int) (i >> 32)); - writeInt((int) i); - } - - /** Writes an long in a variable-length format. Writes between one and five - * bytes. Smaller values take fewer bytes. Negative numbers are not - * supported. - * @see IndexInput#readVLong() - */ - public void writeVLong(long i) throws IOException { - while ((i & ~0x7F) != 0) { - writeByte((byte)((i & 0x7f) | 0x80)); - i >>>= 7; - } - writeByte((byte)i); - } - - /** Writes a string. - * @see IndexInput#readString() - */ - public void writeString(String s) throws IOException { - UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); - writeVInt(utf8Result.length); - writeBytes(utf8Result.result, 0, utf8Result.length); - } - - /** Writes a sub sequence of characters from s as the old - * format (modified UTF-8 encoded bytes). - * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes - * instead or use {@link #writeString} - */ - @Deprecated - public void writeChars(String s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = s.charAt(i); - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - /** Writes a sub sequence of characters from char[] as - * the old format (modified UTF-8 encoded bytes). 
- * @param s the source of the characters - * @param start the first character in the sequence - * @param length the number of characters in the sequence - * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} - */ - @Deprecated - public void writeChars(char[] s, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - final int code = s[i]; - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); - } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); - } - } - } - - private static int COPY_BUFFER_SIZE = 16384; - private byte[] copyBuffer; - - /** Copy numBytes bytes from input to ourself. */ - public void copyBytes(IndexInput input, long numBytes) throws IOException { - assert numBytes >= 0: "numBytes=" + numBytes; - long left = numBytes; - if (copyBuffer == null) - copyBuffer = new byte[COPY_BUFFER_SIZE]; - while(left > 0) { - final int toCopy; - if (left > COPY_BUFFER_SIZE) - toCopy = COPY_BUFFER_SIZE; - else - toCopy = (int) left; - input.readBytes(copyBuffer, 0, toCopy); - writeBytes(copyBuffer, 0, toCopy); - left -= toCopy; - } - } - /** Forces any buffered output to be written. */ public abstract void flush() throws IOException; @@ -211,16 +58,4 @@ * @param length file length */ public void setLength(long length) throws IOException {} - - public void writeStringStringMap(Map map) throws IOException { - if (map == null) { - writeInt(0); - } else { - writeInt(map.size()); - for(final Map.Entry entry: map.entrySet()) { - writeString(entry.getKey()); - writeString(entry.getValue()); - } - } - } } Index: src/java/org/apache/lucene/util/ArrayUtil.java =================================================================== --- src/java/org/apache/lucene/util/ArrayUtil.java (revision 931099) +++ src/java/org/apache/lucene/util/ArrayUtil.java (working copy) @@ -232,6 +232,29 @@ return currentSize; } + public static short[] grow(short[] array, int minSize) { + if (array.length < minSize) { + short[] newArray = new short[oversize(minSize, RamUsageEstimator.NUM_BYTES_SHORT)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static short[] grow(short[] array) { + return grow(array, 1 + array.length); + } + + public static short[] shrink(short[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT); + if (newSize != array.length) { + short[] newArray = new short[newSize]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + public static int[] grow(int[] array, int minSize) { if (array.length < minSize) { int[] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_INT)]; Property changes on: src\java\org\apache\lucene\util\automaton ___________________________________________________________________ Added: svn:ignore + moman Index: src/java/org/apache/lucene/util/BitVector.java =================================================================== --- src/java/org/apache/lucene/util/BitVector.java (revision 931099) +++ src/java/org/apache/lucene/util/BitVector.java (working copy) @@ -32,7 +32,7 @@
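The ArrayUtil hunk above adds short[] overloads of the existing grow/shrink helpers, which oversize on growth so that repeated appends stay amortized. A small usage sketch (the buffer class and append method are placeholders):

import org.apache.lucene.util.ArrayUtil;

public class ShortBufferSketch {
  private short[] buffer = new short[4];
  private int length;

  /** Appends one value, oversizing the backing array only when it is full. */
  void append(short value) {
    if (length == buffer.length) {
      buffer = ArrayUtil.grow(buffer, length + 1);  // copies into a larger, oversized array
    }
    buffer[length++] = value;
  }
}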
  • store and load, as bit set or d-gaps, depending on sparseness;
  • */ -public final class BitVector implements Cloneable { +public final class BitVector implements Cloneable, Bits { private byte[] bits; private int size; @@ -110,6 +110,11 @@ return size; } + // @Override -- not until Java 1.6 + public int length() { + return size; + } + /** Returns the total number of one bits in this vector. This is efficiently computed and cached, so that, if the vector is not changed, no recomputation is done for repeated calls. */ Index: src/java/org/apache/lucene/util/NumericUtils.java =================================================================== --- src/java/org/apache/lucene/util/NumericUtils.java (revision 931099) +++ src/java/org/apache/lucene/util/NumericUtils.java (working copy) @@ -22,6 +22,8 @@ import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.search.NumericRangeFilter; // for javadocs +// TODO: Remove the commented out methods before release! + /** * This is a helper class to generate prefix-encoded representations for numerical values * and supplies converters to represent float/double values as sortable integers/longs. @@ -32,10 +34,10 @@ * more exactly. This reduces the number of terms dramatically. * *
    This class generates terms to achieve this: First the numerical integer values need to - * be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned - * and the bits are converted to ASCII chars with each 7 bit. The resulting string is - * sortable like the original integer value. Each value is also prefixed - * (in the first char) by the shift value (number of bits removed) used + * be converted to bytes. For that integer values (32 bit or 64 bit) are made unsigned + * and the bits are converted to ASCII chars with each 7 bit. The resulting byte[] is + * sortable like the original integer value (even using UTF-8 sort order). Each value is also + * prefixed (in the first char) by the shift value (number of bits removed) used * during encoding. * *
    To also index floating point numbers, this class supplies two methods to convert them @@ -51,13 +53,12 @@ * {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part * for the same data types. * - *
    This class can also be used, to generate lexicographically sortable (according - * {@link String#compareTo(String)}) representations of numeric data types for other - * usages (e.g. sorting). + *
    This class can also be used, to generate lexicographically sortable (according to + * {@link BytesRef#getUTF8SortedAsUTF16Comparator()}) representations of numeric data + * types for other usages (e.g. sorting). * - * @lucene.experimental - * - * @since 2.9 + * @lucene.internal + * @since 2.9, API changed non backwards-compliant in 3.1 */ public final class NumericUtils { @@ -70,126 +71,150 @@ public static final int PRECISION_STEP_DEFAULT = 4; /** - * Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is - * stored as SHIFT_START_LONG+shift in the first character + * Longs are stored at lower precision by shifting off lower bits. The shift count is + * stored as SHIFT_START_LONG+shift in the first byte */ - public static final char SHIFT_START_LONG = (char)0x20; + public static final byte SHIFT_START_LONG = 0x20; /** - * Expert: The maximum term length (used for char[] buffer size) + * The maximum term length (used for byte[] buffer size) * for encoding long values. - * @see #longToPrefixCoded(long,int,char[]) + * @see #longToPrefixCoded(long,int,BytesRef) */ public static final int BUF_SIZE_LONG = 63/7 + 2; /** - * Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is - * stored as SHIFT_START_INT+shift in the first character + * Integers are stored at lower precision by shifting off lower bits. The shift count is + * stored as SHIFT_START_INT+shift in the first byte */ - public static final char SHIFT_START_INT = (char)0x60; + public static final byte SHIFT_START_INT = 0x60; /** - * Expert: The maximum term length (used for char[] buffer size) + * The maximum term length (used for byte[] buffer size) * for encoding int values. - * @see #intToPrefixCoded(int,int,char[]) + * @see #intToPrefixCoded(int,int,BytesRef) */ public static final int BUF_SIZE_INT = 31/7 + 2; /** - * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link NumericTokenStream}. * @param val the numeric value * @param shift how many bits to strip from the right - * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_LONG} - * length - * @return number of chars written to buffer + * @param bytes will contain the encoded value + * @return the hash code for indexing (TermsHash) */ - public static int longToPrefixCoded(final long val, final int shift, final char[] buffer) { + public static int longToPrefixCoded(final long val, final int shift, final BytesRef bytes) { if (shift>63 || shift<0) throw new IllegalArgumentException("Illegal shift value, must be 0..63"); - int nChars = (63-shift)/7 + 1, len = nChars+1; - buffer[0] = (char)(SHIFT_START_LONG + shift); + if (bytes.bytes == null) { + bytes.bytes = new byte[NumericUtils.BUF_SIZE_LONG]; + } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_LONG) { + bytes.grow(NumericUtils.BUF_SIZE_LONG); + } + int hash, nChars = (63-shift)/7 + 1; + bytes.length = nChars+1; + bytes.bytes[0] = (byte) (hash = (SHIFT_START_LONG + shift)); long sortableBits = val ^ 0x8000000000000000L; sortableBits >>>= shift; - while (nChars>=1) { - // Store 7 bits per character for good efficiency when UTF-8 encoding. - // The whole number is right-justified so that lucene can prefix-encode - // the terms more efficiently. 
- buffer[nChars--] = (char)(sortableBits & 0x7f); + while (nChars > 0) { + // Store 7 bits per byte for compatibility + // with UTF-8 encoding of terms + bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f); sortableBits >>>= 7; } - return len; + // calculate hash + for (int i = 1; i < bytes.length; i++) { + hash = 31*hash + bytes.bytes[i]; + } + return hash; } /** - * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link LongRangeBuilder}. * @param val the numeric value * @param shift how many bits to strip from the right - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String longToPrefixCoded(final long val, final int shift) { - final char[] buffer = new char[BUF_SIZE_LONG]; - final int len = longToPrefixCoded(val, shift, buffer); - return new String(buffer, 0, len); - } + final BytesRef buffer = new BytesRef(BUF_SIZE_LONG); + longToPrefixCoded(val, shift, buffer); + return buffer.utf8ToString(); + }*/ /** * This is a convenience method, that returns prefix coded bits of a long without * reducing the precision. It can be used to store the full precision value as a * stored field in index. *

    To decode, use {@link #prefixCodedToLong}. - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String longToPrefixCoded(final long val) { return longToPrefixCoded(val, 0); - } + }*/ /** - * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link NumericTokenStream}. * @param val the numeric value * @param shift how many bits to strip from the right - * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_INT} - * length - * @return number of chars written to buffer + * @param bytes will contain the encoded value + * @return the hash code for indexing (TermsHash) */ - public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) { + public static int intToPrefixCoded(final int val, final int shift, final BytesRef bytes) { if (shift>31 || shift<0) throw new IllegalArgumentException("Illegal shift value, must be 0..31"); - int nChars = (31-shift)/7 + 1, len = nChars+1; - buffer[0] = (char)(SHIFT_START_INT + shift); + if (bytes.bytes == null) { + bytes.bytes = new byte[NumericUtils.BUF_SIZE_INT]; + } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_INT) { + bytes.grow(NumericUtils.BUF_SIZE_INT); + } + int hash, nChars = (31-shift)/7 + 1; + bytes.length = nChars+1; + bytes.bytes[0] = (byte) (hash = (SHIFT_START_INT + shift)); int sortableBits = val ^ 0x80000000; sortableBits >>>= shift; - while (nChars>=1) { - // Store 7 bits per character for good efficiency when UTF-8 encoding. - // The whole number is right-justified so that lucene can prefix-encode - // the terms more efficiently. - buffer[nChars--] = (char)(sortableBits & 0x7f); + while (nChars > 0) { + // Store 7 bits per byte for compatibility + // with UTF-8 encoding of terms + bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f); sortableBits >>>= 7; } - return len; + // calculate hash + for (int i = 1; i < bytes.length; i++) { + hash = 31*hash + bytes.bytes[i]; + } + return hash; } /** - * Expert: Returns prefix coded bits after reducing the precision by shift bits. + * Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link IntRangeBuilder}. * @param val the numeric value * @param shift how many bits to strip from the right - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String intToPrefixCoded(final int val, final int shift) { - final char[] buffer = new char[BUF_SIZE_INT]; - final int len = intToPrefixCoded(val, shift, buffer); - return new String(buffer, 0, len); - } + final BytesRef buffer = new BytesRef(BUF_SIZE_INT); + intToPrefixCoded(val, shift, buffer); + return buffer.utf8ToString(); + }*/ /** * This is a convenience method, that returns prefix coded bits of an int without * reducing the precision. It can be used to store the full precision value as a * stored field in index. *

    To decode, use {@link #prefixCodedToInt}. - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String intToPrefixCoded(final int val) { return intToPrefixCoded(val, 0); - } + }*/ /** * Returns a long from prefixCoded characters. @@ -198,51 +223,97 @@ * @throws NumberFormatException if the supplied string is * not correctly prefix encoded. * @see #longToPrefixCoded(long) + * @deprecated This method is no longer needed! + * + @Deprecated + public static long prefixCodedToLong(final String prefixCoded) { + return prefixCodedToLong(new BytesRef(prefixCoded)); + }*/ + + /** + * Returns the shift value from a prefix encoded {@code long}. + * @throws NumberFormatException if the supplied {@link BytesRef} is + * not correctly prefix encoded. */ - public static long prefixCodedToLong(final String prefixCoded) { - final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG; - if (shift>63 || shift<0) - throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)"); + public static int getPrefixCodedLongShift(final BytesRef val) { + final int shift = val.bytes[val.offset] - SHIFT_START_LONG; + if (shift > 63 || shift < 0) + throw new NumberFormatException("Invalid shift value in prefixCoded bytes (is encoded value really an INT?)"); + return shift; + } + + /** + * Returns the shift value from a prefix encoded {@code int}. + * @throws NumberFormatException if the supplied {@link BytesRef} is + * not correctly prefix encoded. + */ + public static int getPrefixCodedIntShift(final BytesRef val) { + final int shift = val.bytes[val.offset] - SHIFT_START_INT; + if (shift > 31 || shift < 0) + throw new NumberFormatException("Invalid shift value in prefixCoded bytes (is encoded value really an INT?)"); + return shift; + } + + /** + * Returns a long from prefixCoded bytes. + * Rightmost bits will be zero for lower precision codes. + * This method can be used to decode a term's value. + * @throws NumberFormatException if the supplied {@link BytesRef} is + * not correctly prefix encoded. + * @see #longToPrefixCoded(long,int,BytesRef) + */ + public static long prefixCodedToLong(final BytesRef val) { long sortableBits = 0L; - for (int i=1, len=prefixCoded.length(); i0x7f) { + final byte b = val.bytes[i]; + if (b < 0) { throw new NumberFormatException( - "Invalid prefixCoded numerical value representation (char "+ - Integer.toHexString(ch)+" at position "+i+" is invalid)" + "Invalid prefixCoded numerical value representation (byte "+ + Integer.toHexString(b&0xff)+" at position "+(i-val.offset)+" is invalid)" ); } - sortableBits |= ch; + sortableBits |= b; } - return (sortableBits << shift) ^ 0x8000000000000000L; + return (sortableBits << getPrefixCodedLongShift(val)) ^ 0x8000000000000000L; } /** * Returns an int from prefixCoded characters. * Rightmost bits will be zero for lower precision codes. - * This method can be used to decode e.g. a stored field. + * This method can be used to decode a term's value. * @throws NumberFormatException if the supplied string is * not correctly prefix encoded. * @see #intToPrefixCoded(int) + * @deprecated This method is no longer needed! + * + @Deprecated + public static int prefixCodedToInt(final String prefixCoded) { + return prefixCodedToInt(new BytesRef(prefixCoded)); + }*/ + + /** + * Returns an int from prefixCoded bytes. + * Rightmost bits will be zero for lower precision codes. + * This method can be used to decode a term's value. 
+ * @throws NumberFormatException if the supplied {@link BytesRef} is + * not correctly prefix encoded. + * @see #intToPrefixCoded(int,int,BytesRef) */ - public static int prefixCodedToInt(final String prefixCoded) { - final int shift = prefixCoded.charAt(0)-SHIFT_START_INT; - if (shift>31 || shift<0) - throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)"); + public static int prefixCodedToInt(final BytesRef val) { int sortableBits = 0; - for (int i=1, len=prefixCoded.length(); i0x7f) { + final byte b = val.bytes[i]; + if (b < 0) { throw new NumberFormatException( - "Invalid prefixCoded numerical value representation (char "+ - Integer.toHexString(ch)+" at position "+i+" is invalid)" + "Invalid prefixCoded numerical value representation (byte "+ + Integer.toHexString(b&0xff)+" at position "+(i-val.offset)+" is invalid)" ); } - sortableBits |= ch; + sortableBits |= b; } - return (sortableBits << shift) ^ 0x80000000; + return (sortableBits << getPrefixCodedIntShift(val)) ^ 0x80000000; } /** @@ -261,10 +332,12 @@ /** * Convenience method: this just returns: * longToPrefixCoded(doubleToSortableLong(val)) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String doubleToPrefixCoded(double val) { return longToPrefixCoded(doubleToSortableLong(val)); - } + }*/ /** * Converts a sortable long back to a double. @@ -278,10 +351,12 @@ /** * Convenience method: this just returns: * sortableLongToDouble(prefixCodedToLong(val)) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static double prefixCodedToDouble(String val) { return sortableLongToDouble(prefixCodedToLong(val)); - } + }*/ /** * Converts a float value to a sortable signed int. @@ -299,10 +374,12 @@ /** * Convenience method: this just returns: * intToPrefixCoded(floatToSortableInt(val)) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static String floatToPrefixCoded(float val) { return intToPrefixCoded(floatToSortableInt(val)); - } + }*/ /** * Converts a sortable int back to a float. @@ -316,16 +393,18 @@ /** * Convenience method: this just returns: * sortableIntToFloat(prefixCodedToInt(val)) - */ + * @deprecated This method is no longer needed! + * + @Deprecated public static float prefixCodedToFloat(String val) { return sortableIntToFloat(prefixCodedToInt(val)); - } + }*/ /** - * Expert: Splits a long range recursively. + * Splits a long range recursively. * You may implement a builder that adds clauses to a * {@link org.apache.lucene.search.BooleanQuery} for each call to its - * {@link LongRangeBuilder#addRange(String,String)} + * {@link LongRangeBuilder#addRange(BytesRef,BytesRef)} * method. *

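As a point of reference, here is a minimal sketch of how the BytesRef-based encode/decode pair introduced above could be exercised; the value 1234567890L and the full-precision shift of 0 are arbitrary choices for illustration.

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.NumericUtils;

    public class PrefixCodedLongDemo {
      public static void main(String[] args) {
        final BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
        // Encode at full precision (shift = 0); the return value is the hash used for indexing (TermsHash).
        final int hash = NumericUtils.longToPrefixCoded(1234567890L, 0, bytes);
        // Decode the prefix-coded bytes back; for shift > 0 the stripped low bits come back as zero.
        final long decoded = NumericUtils.prefixCodedToLong(bytes);
        System.out.println("shift=" + NumericUtils.getPrefixCodedLongShift(bytes)
            + " decoded=" + decoded + " hash=" + hash);
      }
    }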
    This method is used by {@link NumericRangeQuery}. */ @@ -336,10 +415,10 @@ } /** - * Expert: Splits an int range recursively. + * Splits an int range recursively. * You may implement a builder that adds clauses to a * {@link org.apache.lucene.search.BooleanQuery} for each call to its - * {@link IntRangeBuilder#addRange(String,String)} + * {@link IntRangeBuilder#addRange(BytesRef,BytesRef)} * method. *

    This method is used by {@link NumericRangeQuery}. */ @@ -412,10 +491,10 @@ } /** - * Expert: Callback for {@link #splitLongRange}. + * Callback for {@link #splitLongRange}. * You need to overwrite only one of the methods. - *

    NOTE: This is a very low-level interface, - * the method signatures may change in later versions. + * @lucene.internal + * @since 2.9, API changed non backwards-compliant in 3.1 */ public static abstract class LongRangeBuilder { @@ -423,7 +502,7 @@ * Overwrite this method, if you like to receive the already prefix encoded range bounds. * You can directly build classical (inclusive) range queries from them. */ - public void addRange(String minPrefixCoded, String maxPrefixCoded) { + public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { throw new UnsupportedOperationException(); } @@ -432,16 +511,19 @@ * You can use this for e.g. debugging purposes (print out range bounds). */ public void addRange(final long min, final long max, final int shift) { - addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift)); + final BytesRef minBytes = new BytesRef(BUF_SIZE_LONG), maxBytes = new BytesRef(BUF_SIZE_LONG); + longToPrefixCoded(min, shift, minBytes); + longToPrefixCoded(max, shift, maxBytes); + addRange(minBytes, maxBytes); } } /** - * Expert: Callback for {@link #splitIntRange}. + * Callback for {@link #splitIntRange}. * You need to overwrite only one of the methods. - *

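A short sketch of a LongRangeBuilder that simply decodes and prints each sub-range it receives; the precision step of 4 and the bounds are made-up values, and the splitLongRange(builder, precisionStep, minBound, maxBound) signature is assumed to be unchanged by this patch.

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.NumericUtils;

    public class PrintLongRanges {
      public static void main(String[] args) {
        NumericUtils.splitLongRange(new NumericUtils.LongRangeBuilder() {
          @Override
          public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) {
            // Each callback delivers one prefix-coded sub-range of the requested bounds.
            System.out.println(NumericUtils.prefixCodedToLong(minPrefixCoded)
                + " .. " + NumericUtils.prefixCodedToLong(maxPrefixCoded));
          }
        }, 4, -1000L, 1000L);
      }
    }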
    NOTE: This is a very low-level interface, - * the method signatures may change in later versions. + * @lucene.internal + * @since 2.9, API changed non backwards-compliant in 3.1 */ public static abstract class IntRangeBuilder { @@ -449,7 +531,7 @@ * Overwrite this method, if you like to receive the already prefix encoded range bounds. * You can directly build classical range (inclusive) queries from them. */ - public void addRange(String minPrefixCoded, String maxPrefixCoded) { + public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { throw new UnsupportedOperationException(); } @@ -458,7 +540,10 @@ * You can use this for e.g. debugging purposes (print out range bounds). */ public void addRange(final int min, final int max, final int shift) { - addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift)); + final BytesRef minBytes = new BytesRef(BUF_SIZE_INT), maxBytes = new BytesRef(BUF_SIZE_INT); + intToPrefixCoded(min, shift, minBytes); + intToPrefixCoded(max, shift, maxBytes); + addRange(minBytes, maxBytes); } } Index: src/java/org/apache/lucene/util/OpenBitSet.java =================================================================== --- src/java/org/apache/lucene/util/OpenBitSet.java (revision 931099) +++ src/java/org/apache/lucene/util/OpenBitSet.java (working copy) @@ -75,7 +75,7 @@ */ -public class OpenBitSet extends DocIdSet implements Cloneable, Serializable { +public class OpenBitSet extends DocIdSet implements Bits, Cloneable, Serializable { protected long[] bits; protected int wlen; // number of words (elements) used in the array @@ -132,6 +132,11 @@ return capacity(); } + // @Override -- not until Java 1.6 + public int length() { + return bits.length << 6; + } + /** Returns true if there are no set bits */ public boolean isEmpty() { return cardinality()==0; } Index: src/java/org/apache/lucene/util/RamUsageEstimator.java =================================================================== --- src/java/org/apache/lucene/util/RamUsageEstimator.java (revision 931099) +++ src/java/org/apache/lucene/util/RamUsageEstimator.java (working copy) @@ -37,6 +37,16 @@ * @lucene.internal */ public final class RamUsageEstimator { + + public final static int NUM_BYTES_SHORT = 2; + public final static int NUM_BYTES_INT = 4; + public final static int NUM_BYTES_LONG = 8; + public final static int NUM_BYTES_FLOAT = 4; + public final static int NUM_BYTES_DOUBLE = 8; + public final static int NUM_BYTES_OBJ_HEADER = 8; + public final static int NUM_BYTES_OBJ_REF = Constants.JRE_IS_64BIT ? 8 : 4; + public final static int NUM_BYTES_ARRAY_HEADER = NUM_BYTES_OBJ_HEADER + NUM_BYTES_INT + NUM_BYTES_OBJ_REF; + private MemoryModel memoryModel; private final Map seen; @@ -47,11 +57,6 @@ public final static int NUM_BYTES_OBJECT_REF = Constants.JRE_IS_64BIT ? 
8 : 4; public final static int NUM_BYTES_CHAR = 2; - public final static int NUM_BYTES_SHORT = 2; - public final static int NUM_BYTES_INT = 4; - public final static int NUM_BYTES_LONG = 8; - public final static int NUM_BYTES_FLOAT = 4; - public final static int NUM_BYTES_DOUBLE = 8; private boolean checkInterned; Index: src/java/org/apache/lucene/util/ReaderUtil.java =================================================================== --- src/java/org/apache/lucene/util/ReaderUtil.java (revision 931099) +++ src/java/org/apache/lucene/util/ReaderUtil.java (working copy) @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.List; +import java.io.IOException; import org.apache.lucene.index.IndexReader; @@ -29,22 +30,85 @@ */ public class ReaderUtil { + public static class Slice { + public static final Slice[] EMPTY_ARRAY = new Slice[0]; + public final int start; + public final int length; + public final int readerIndex; + + public Slice(int start, int length, int readerIndex) { + this.start = start; + this.length = length; + this.readerIndex = readerIndex; + } + + public String toString() { + return "slice start=" + start + " length=" + length; + } + } + /** - * Gathers sub-readers from reader into a List. + * Gathers sub-readers from reader into a List. See + * {@link Gather} for are more general way to gather + * whatever you need to, per reader. + * + * @lucene.experimental * * @param allSubReaders * @param reader */ - public static void gatherSubReaders(List allSubReaders, IndexReader reader) { - IndexReader[] subReaders = reader.getSequentialSubReaders(); - if (subReaders == null) { - // Add the reader itself, and do not recurse - allSubReaders.add(reader); - } else { - for (int i = 0; i < subReaders.length; i++) { - gatherSubReaders(allSubReaders, subReaders[i]); + + public static void gatherSubReaders(final List allSubReaders, IndexReader reader) { + try { + new Gather(reader) { + @Override + protected void add(int base, IndexReader r) { + allSubReaders.add(r); + } + }.run(); + } catch (IOException ioe) { + // won't happen + throw new RuntimeException(ioe); + } + } + + /** Recursively visits all sub-readers of a reader. You + * should subclass this and override the add method to + * gather what you need. 
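As a rough illustration of the Gather callback defined just below, this sketch collects every atomic sub-reader along with its doc base; the already-open IndexReader passed in is an assumption of the example.

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.util.ReaderUtil;

    public class GatherLeaves {
      public static List<IndexReader> leaves(IndexReader reader) throws IOException {
        final List<IndexReader> result = new ArrayList<IndexReader>();
        // run() walks the reader tree and calls add() once per atomic (leaf) reader.
        new ReaderUtil.Gather(reader) {
          @Override
          protected void add(int base, IndexReader r) {
            System.out.println("leaf starts at docBase=" + base);
            result.add(r);
          }
        }.run();
        return result;
      }
    }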
+ * + * @lucene.experimental */ + public static abstract class Gather { + private final IndexReader topReader; + + public Gather(IndexReader r) { + topReader = r; + } + + public int run() throws IOException { + return run(0, topReader); + } + + public int run(int docBase) throws IOException { + return run(docBase, topReader); + } + + private int run(int base, IndexReader reader) throws IOException { + IndexReader[] subReaders = reader.getSequentialSubReaders(); + if (subReaders == null) { + // atomic reader + add(base, reader); + base += reader.maxDoc(); + } else { + // composite reader + for (int i = 0; i < subReaders.length; i++) { + base = run(base, subReaders[i]); + } } + + return base; } + + protected abstract void add(int base, IndexReader r) throws IOException; } /** Index: src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- src/java/org/apache/lucene/util/UnicodeUtil.java (revision 931099) +++ src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -74,30 +74,20 @@ /** * @lucene.internal */ - public static final class UTF8Result { - public byte[] result = new byte[10]; - public int length; - - public void setLength(int newLength) { - if (result.length < newLength) { - result = ArrayUtil.grow(result, newLength); - } - length = newLength; - } - } - - /** - * @lucene.internal - */ public static final class UTF16Result { public char[] result = new char[10]; public int[] offsets = new int[10]; public int length; + /* + public String toString() { + return new String(result, 0, length); + } + */ + public void setLength(int newLength) { - if (result.length < newLength) { + if (result.length < newLength) result = ArrayUtil.grow(result, newLength); - } length = newLength; } @@ -105,80 +95,89 @@ setLength(other.length); System.arraycopy(other.result, 0, result, 0, length); } + + public void copyText(String other) { + final int otherLength = other.length(); + setLength(otherLength); + other.getChars(0, otherLength, result, 0); + length = otherLength; + } } /** Encode characters from a char[] source, starting at - * offset and stopping when the character 0xffff is seen. - * Returns the number of bytes written to bytesOut. */ - public static void UTF16toUTF8(final char[] source, final int offset, UTF8Result result) { - + * offset for length chars. 
Returns a hash of the resulting bytes */ + public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) { + int hash = 0; int upto = 0; int i = offset; - byte[] out = result.result; + final int end = offset + length; + byte[] out = result.bytes; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) + out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)]; + result.offset = 0; - while(true) { + while(i < end) { final int code = (int) source[i++]; - if (upto+4 > out.length) { - out = result.result = ArrayUtil.grow(out, upto+4); - } - if (code < 0x80) - out[upto++] = (byte) code; - else if (code < 0x800) { - out[upto++] = (byte) (0xC0 | (code >> 6)); - out[upto++] = (byte)(0x80 | (code & 0x3F)); + if (code < 0x80) { + hash = 31*hash + (out[upto++] = (byte) code); + } else if (code < 0x800) { + hash = 31*hash + (out[upto++] = (byte) (0xC0 | (code >> 6))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | (code & 0x3F))); } else if (code < 0xD800 || code > 0xDFFF) { - if (code == 0xffff) - // END - break; - out[upto++] = (byte)(0xE0 | (code >> 12)); - out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); - out[upto++] = (byte)(0x80 | (code & 0x3F)); + hash = 31*hash + (out[upto++] = (byte)(0xE0 | (code >> 12))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | (code & 0x3F))); } else { // surrogate pair // confirm valid high surrogate - if (code < 0xDC00 && source[i] != 0xffff) { + if (code < 0xDC00 && i < end) { int utf32 = (int) source[i]; // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); i++; - out[upto++] = (byte)(0xF0 | (utf32 >> 18)); - out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); - out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); - out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); + hash = 31*hash + (out[upto++] = (byte)(0xF0 | (utf32 >> 18))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F))); + hash = 31*hash + (out[upto++] = (byte)(0x80 | (utf32 & 0x3F))); continue; } } // replace unpaired surrogate or out-of-order low surrogate // with substitution character - out[upto++] = (byte) 0xEF; - out[upto++] = (byte) 0xBF; - out[upto++] = (byte) 0xBD; + hash = 31*hash + (out[upto++] = (byte) 0xEF); + hash = 31*hash + (out[upto++] = (byte) 0xBF); + hash = 31*hash + (out[upto++] = (byte) 0xBD); } } - //assert matches(source, offset, i-offset-1, out, upto); + //assert matches(source, offset, length, out, upto); result.length = upto; + return hash; } /** Encode characters from a char[] source, starting at * offset for length chars. Returns the number of bytes * written to bytesOut. 
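A minimal sketch of the hashing conversion added above, assuming the BytesRef-based signature shown in this hunk; the sample text is arbitrary, and the plain UTF16toUTF8 overloads that follow are used the same way minus the returned hash.

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.UnicodeUtil;

    public class Utf8HashDemo {
      public static void main(String[] args) {
        final char[] chars = "bücher".toCharArray();
        final BytesRef utf8 = new BytesRef(10);
        // Encodes the char[] region into utf8 (grown to the 4-bytes-per-char worst case)
        // and returns the hash of the bytes it wrote.
        final int hash = UnicodeUtil.UTF16toUTF8WithHash(chars, 0, chars.length, utf8);
        System.out.println(utf8.length + " bytes, hash=" + hash);
      }
    }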
*/ - public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) { + public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) { int upto = 0; int i = offset; final int end = offset + length; - byte[] out = result.result; + byte[] out = result.bytes; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) + out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)]; + result.offset = 0; while(i < end) { final int code = (int) source[i++]; - if (upto+4 > out.length) { - out = result.result = ArrayUtil.grow(out, upto+4); - } if (code < 0x80) out[upto++] = (byte) code; else if (code < 0x800) { @@ -191,7 +190,7 @@ } else { // surrogate pair // confirm valid high surrogate - if (code < 0xDC00 && i < end && source[i] != 0xffff) { + if (code < 0xDC00 && i < end) { int utf32 = (int) source[i]; // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { @@ -218,18 +217,20 @@ /** Encode characters from this String, starting at offset * for length characters. Returns the number of bytes * written to bytesOut. */ - public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) { + public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) { final int end = offset + length; - byte[] out = result.result; + byte[] out = result.bytes; + result.offset = 0; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) + out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)]; int upto = 0; for(int i=offset;i out.length) { - out = result.result = ArrayUtil.grow(out, upto+4); - } if (code < 0x80) out[upto++] = (byte) code; else if (code < 0x800) { @@ -332,6 +333,71 @@ result.length = outUpto; } + /** + * Get the next valid UTF-16 String in UTF-16 order. + *

    + * If the input String is already valid, it is returned. + * Otherwise the next String in code unit order is returned. + *

    + * @param s input String (possibly with unpaired surrogates) + * @return next valid UTF-16 String in UTF-16 order + */ + public static String nextValidUTF16String(String s) { + if (validUTF16String(s)) + return s; + else { + UTF16Result chars = new UTF16Result(); + chars.copyText(s); + nextValidUTF16String(chars); + return new String(chars.result, 0, chars.length); + } + } + + public static void nextValidUTF16String(UTF16Result s) { + final int size = s.length; + for (int i = 0; i < size; i++) { + char ch = s.result[i]; + if (ch >= UnicodeUtil.UNI_SUR_HIGH_START + && ch <= UnicodeUtil.UNI_SUR_HIGH_END) { + if (i < size - 1) { + i++; + char nextCH = s.result[i]; + if (nextCH >= UnicodeUtil.UNI_SUR_LOW_START + && nextCH <= UnicodeUtil.UNI_SUR_LOW_END) { + // Valid surrogate pair + } else + // Unmatched high surrogate + if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) { // SMP not enumerated + s.setLength(i + 1); + s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START; + return; + } else { // SMP already enumerated + if (s.result[i - 1] == UnicodeUtil.UNI_SUR_HIGH_END) { + s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1); + s.setLength(i); + } else { + s.result[i - 1]++; + s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START; + s.setLength(i + 1); + } + return; + } + } else { + // Unmatched high surrogate in final position, SMP not yet enumerated + s.setLength(i + 2); + s.result[i + 1] = (char) UnicodeUtil.UNI_SUR_LOW_START; + return; + } + } else if (ch >= UnicodeUtil.UNI_SUR_LOW_START + && ch <= UnicodeUtil.UNI_SUR_LOW_END) { + // Unmatched low surrogate, SMP already enumerated + s.setLength(i + 1); + s.result[i] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1); + return; + } + } + } + // Only called from assert /* private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) { @@ -386,8 +452,8 @@ return false; } } - - public static final boolean validUTF16String(String s) { + */ + public static final boolean validUTF16String(CharSequence s) { final int size = s.length(); for(int i=0;iBooleanWeight2 (link goes to ViewVC BooleanQuery java code which contains the BooleanWeight2 inner class) or BooleanWeight - (link goes to ViewVC BooleanQuery java code, which contains the BooleanWeight inner class). + (link goes to ViewVC BooleanQuery java code, which contains the BooleanWeight inner class) from the 1.4 version of Lucene is used by default. + See CHANGES.txt under release 1.9 RC1 for more information on choosing which Scorer to use.
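For the surrogate handling introduced above, a brief sketch of the String overload of nextValidUTF16String; the unpaired high surrogate in the sample string is deliberate.

    import org.apache.lucene.util.UnicodeUtil;

    public class NextValidUtf16Demo {
      public static void main(String[] args) {
        // "a" followed by an unpaired high surrogate is not valid UTF-16.
        final String broken = "a\uD800";
        // Returns the next string in code unit order that is valid UTF-16;
        // for a trailing unpaired high surrogate a low surrogate is appended.
        final String fixed = UnicodeUtil.nextValidUTF16String(broken);
        System.out.println(UnicodeUtil.validUTF16String(fixed)); // prints true
      }
    }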
    ry#setUseScorer14(boolean) Assuming the use of the BooleanWeight2, a BooleanScorer2 is created by bringing together all of the Property changes on: src\test\org\apache\lucene\analysis\TestISOLatin1AccentFilter.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:r924732-924780,924782-925175,925463-925561 Reverse-merged /lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:r825998 Merged /lucene/java/branches/flex_1458/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java:r824912-931101 Index: src/test/org/apache/lucene/analysis/TestNumericTokenStream.java =================================================================== --- src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (revision 931099) +++ src/test/org/apache/lucene/analysis/TestNumericTokenStream.java (working copy) @@ -17,8 +17,9 @@ * limitations under the License. */ +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public class TestNumericTokenStream extends BaseTokenStreamTestCase { @@ -29,27 +30,47 @@ public void testLongStream() throws Exception { final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); // use getAttribute to test if attributes really exist, if not an IAE will be throwed - final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); + final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class); final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class); + final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class); + final BytesRef bytes = new BytesRef(); + stream.reset(); + assertEquals(64, numericAtt.getValueSize()); + assertEquals(lvalue, numericAtt.getRawValue()); for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { assertTrue("New token is available", stream.incrementToken()); - assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), termAtt.term()); - assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); + assertEquals("Shift value wrong", shift, numericAtt.getShift()); + final int hash = bytesAtt.toBytesRef(bytes); + assertEquals("Hash incorrect", bytes.hashCode(), hash); + assertEquals("Term is incorrectly encoded", lvalue & ~((1L << shift) - 1L), NumericUtils.prefixCodedToLong(bytes)); + assertEquals("Type incorrect", (shift == 0) ? 
NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); } - assertFalse("No more tokens available", stream.incrementToken()); + assertFalse("More tokens available", stream.incrementToken()); + stream.end(); + stream.close(); } public void testIntStream() throws Exception { final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); // use getAttribute to test if attributes really exist, if not an IAE will be throwed - final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); + final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class); final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class); + final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class); + final BytesRef bytes = new BytesRef(); + stream.reset(); + assertEquals(32, numericAtt.getValueSize()); + assertEquals(ivalue, numericAtt.getRawValue()); for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { assertTrue("New token is available", stream.incrementToken()); - assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), termAtt.term()); - assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); + assertEquals("Shift value wrong", shift, numericAtt.getShift()); + final int hash = bytesAtt.toBytesRef(bytes); + assertEquals("Hash incorrect", bytes.hashCode(), hash); + assertEquals("Term is incorrectly encoded", ivalue & ~((1 << shift) - 1), NumericUtils.prefixCodedToInt(bytes)); + assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type()); } - assertFalse("No more tokens available", stream.incrementToken()); + assertFalse("More tokens available", stream.incrementToken()); + stream.end(); + stream.close(); } public void testNotInitialized() throws Exception { Index: src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpls.java =================================================================== --- src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpls.java (revision 931099) +++ src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpls.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.AttributeSource.AttributeFactory; +@Deprecated public class TestSimpleAttributeImpls extends LuceneTestCase { public TestSimpleAttributeImpls(String name) { Index: src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java =================================================================== --- src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java (revision 931099) +++ src/test/org/apache/lucene/analysis/tokenattributes/TestTermAttributeImpl.java (working copy) @@ -107,10 +107,10 @@ char[] b = {'a', 'l', 'o', 'h', 'a'}; TermAttributeImpl t = new TermAttributeImpl(); t.setTermBuffer(b, 0, 5); - assertEquals("term=aloha", t.toString()); + assertEquals("aloha", t.toString()); t.setTermBuffer("hi there"); - assertEquals("term=hi there", t.toString()); + assertEquals("hi there", t.toString()); } public void testMixedStringArray() throws Exception { Index: src/test/org/apache/lucene/document/TestDateTools.java =================================================================== --- 
src/test/org/apache/lucene/document/TestDateTools.java (revision 931099) +++ src/test/org/apache/lucene/document/TestDateTools.java (working copy) @@ -197,4 +197,4 @@ } } -} +} \ No newline at end of file Property changes on: src\test\org\apache\lucene\document\TestDateTools.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/trunk/src/test/org/apache/lucene/document/TestDateTools.java:r924732-924780,924782-925175,925463-925561 Reverse-merged /lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/document/TestDateTools.java:r825998 Merged /lucene/java/branches/flex_1458/src/test/org/apache/lucene/document/TestDateTools.java:r824912-931101 Property changes on: src\test\org\apache\lucene\document\TestNumberTools.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/document/TestNumberTools.java:r825998 Reverse-merged /lucene/java/trunk/src/test/org/apache/lucene/document/TestNumberTools.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/flex_1458/src/test/org/apache/lucene/document/TestNumberTools.java:r824912-931101 Index: src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java =================================================================== --- src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java (revision 931099) +++ src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.search.PhraseQuery; @@ -47,6 +48,7 @@ addDocs(writer, 100); assertEquals(100, writer.maxDoc()); writer.close(); + _TestUtil.checkIndex(dir); writer = newWriter(aux, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT)).setOpenMode(OpenMode.CREATE)); ((LogMergePolicy) writer.getConfig().getMergePolicy()).setUseCompoundFile(false); // use one without a compound file @@ -68,6 +70,7 @@ writer.addIndexesNoOptimize(new Directory[] { aux, aux2 }); assertEquals(190, writer.maxDoc()); writer.close(); + _TestUtil.checkIndex(dir); // make sure the old index is correct verifyNumDocs(aux, 40); @@ -128,12 +131,13 @@ public void testWithPendingDeletes() throws IOException { // main directory - Directory dir = new RAMDirectory(); + Directory dir = new MockRAMDirectory(); // auxiliary directory - Directory aux = new RAMDirectory(); + Directory aux = new MockRAMDirectory(); setUpDirs(dir, aux); IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT)).setOpenMode(OpenMode.APPEND)); + writer.addIndexesNoOptimize(new Directory[] {aux}); // Adds 10 docs, then replaces them with another 10 Index: src/test/org/apache/lucene/index/TestAtomicUpdate.java =================================================================== --- src/test/org/apache/lucene/index/TestAtomicUpdate.java (revision 931099) +++ src/test/org/apache/lucene/index/TestAtomicUpdate.java (working copy) @@ -148,33 +148,34 @@ assertEquals(100, r.numDocs()); r.close(); + int upto = 0; + IndexerThread indexerThread = new IndexerThread(writer, threads); - threads[0] = indexerThread; + threads[upto++] = indexerThread; indexerThread.start(); - IndexerThread indexerThread2 = new IndexerThread(writer, threads); 
- threads[1] = indexerThread2; - indexerThread2.start(); + //IndexerThread indexerThread2 = new IndexerThread(writer, threads); + //threads[upto++] = indexerThread2; + //indexerThread2.start(); SearcherThread searcherThread1 = new SearcherThread(directory, threads); - threads[2] = searcherThread1; + threads[upto++] = searcherThread1; searcherThread1.start(); - SearcherThread searcherThread2 = new SearcherThread(directory, threads); - threads[3] = searcherThread2; - searcherThread2.start(); + //SearcherThread searcherThread2 = new SearcherThread(directory, threads); + //threads[upto++] = searcherThread2; + //searcherThread2.start(); - indexerThread.join(); - indexerThread2.join(); - searcherThread1.join(); - searcherThread2.join(); + for(int i=0;i= 3.0 + if (oldNames[i].compareTo("30.") < 0) continue; + + unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); + String fullPath = fullDir(oldNames[i]); + Directory dir = FSDirectory.open(new File(fullPath)); + IndexSearcher searcher = new IndexSearcher(dir, true); + + for (int id=10; id<15; id++) { + ScoreDoc[] hits = searcher.search(NumericRangeQuery.newIntRange("trieInt", 4, Integer.valueOf(id), Integer.valueOf(id), true, true), 100).scoreDocs; + assertEquals("wrong number of hits", 1, hits.length); + Document d = searcher.doc(hits[0].doc); + assertEquals(String.valueOf(id), d.get("id")); + + hits = searcher.search(NumericRangeQuery.newLongRange("trieLong", 4, Long.valueOf(id), Long.valueOf(id), true, true), 100).scoreDocs; + assertEquals("wrong number of hits", 1, hits.length); + d = searcher.doc(hits[0].doc); + assertEquals(String.valueOf(id), d.get("id")); + } + + // check that also lower-precision fields are ok + ScoreDoc[] hits = searcher.search(NumericRangeQuery.newIntRange("trieInt", 4, Integer.MIN_VALUE, Integer.MAX_VALUE, false, false), 100).scoreDocs; + assertEquals("wrong number of hits", 34, hits.length); + + hits = searcher.search(NumericRangeQuery.newLongRange("trieLong", 4, Long.MIN_VALUE, Long.MAX_VALUE, false, false), 100).scoreDocs; + assertEquals("wrong number of hits", 34, hits.length); + + // check decoding into field cache + int[] fci = FieldCache.DEFAULT.getInts(searcher.getIndexReader(), "trieInt"); + for (int val : fci) { + assertTrue("value in id bounds", val >= 0 && val < 35); + } + + long[] fcl = FieldCache.DEFAULT.getLongs(searcher.getIndexReader(), "trieLong"); + for (long val : fcl) { + assertTrue("value in id bounds", val >= 0L && val < 35L); + } + + searcher.close(); + dir.close(); + rmDir(oldNames[i]); + } + } + } Property changes on: src\test\org\apache\lucene\index\TestBackwardsCompatibility.java ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:r825998 Reverse-merged /lucene/java/trunk/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:r924732-924780,924782-925175,925463-925561 Merged /lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:r928290 Merged /lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:r824912-931101 Index: src/test/org/apache/lucene/index/TestDoc.java =================================================================== --- src/test/org/apache/lucene/index/TestDoc.java (revision 931099) +++ src/test/org/apache/lucene/index/TestDoc.java (working copy) @@ -36,6 +36,7 @@ import org.apache.lucene.store.Directory; import 
org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.index.codecs.CodecProvider; /** JUnit adaptation of an older test case DocTest. */ @@ -185,20 +186,24 @@ SegmentReader r1 = SegmentReader.get(true, si1, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); SegmentReader r2 = SegmentReader.get(true, si2, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); - SegmentMerger merger = new SegmentMerger(si1.dir, merged); + SegmentMerger merger = new SegmentMerger(si1.dir, IndexWriter.DEFAULT_TERM_INDEX_INTERVAL, merged, null, CodecProvider.getDefault()); merger.add(r1); merger.add(r2); merger.merge(); merger.closeReaders(); + final SegmentInfo info = new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, + useCompoundFile, true, -1, null, false, merger.hasProx(), + merger.getCodec()); + if (useCompoundFile) { - List filesToDelete = merger.createCompoundFile(merged + ".cfs"); + List filesToDelete = merger.createCompoundFile(merged + ".cfs", info); for (final String fileToDelete : filesToDelete) si1.dir.deleteFile(fileToDelete); } - return new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, useCompoundFile, true); + return info; } Index: src/test/org/apache/lucene/index/TestIndexReader.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexReader.java (revision 931099) +++ src/test/org/apache/lucene/index/TestIndexReader.java (working copy) @@ -21,7 +21,6 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; -import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; @@ -42,6 +41,7 @@ import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.SetBasedFieldSelector; import org.apache.lucene.index.IndexReader.FieldOption; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; @@ -881,15 +881,18 @@ d.add(new Field("id", Integer.toString(i), Field.Store.YES, Field.Index.NOT_ANALYZED)); d.add(new Field("content", "aaa " + i, Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(d); + if (0==i%10) + writer.commit(); } writer.close(); - long diskUsage = startDir.sizeInBytes(); - long diskFree = diskUsage+100; + long diskUsage = ((MockRAMDirectory) startDir).getRecomputedActualSizeInBytes(); + long diskFree = diskUsage+100; IOException err = null; boolean done = false; + boolean gotExc = false; // Iterate w/ ever increasing free disk space: while(!done) { @@ -946,7 +949,7 @@ int docId = 12; for(int i=0;i<13;i++) { reader.deleteDocument(docId); - reader.setNorm(docId, "contents", (float) 2.0); + reader.setNorm(docId, "content", (float) 2.0); docId += 12; } } @@ -961,6 +964,7 @@ e.printStackTrace(System.out); } err = e; + gotExc = true; if (1 == x) { e.printStackTrace(); fail(testName + " hit IOException after disk space was freed up"); @@ -973,30 +977,8 @@ // new IndexFileDeleter, have it delete // unreferenced files, then verify that in fact // no files were deleted: - String[] startFiles = dir.listAll(); - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); - String[] endFiles = dir.listAll(); + TestIndexWriter.assertNoUnreferencedFiles(dir, "reader.close() failed to delete unreferenced files"); - Arrays.sort(startFiles); - 
Arrays.sort(endFiles); - - //for(int i=0;i 0) { - s += "\n "; - } - s += l[i]; - } - return s; - } - public void testOpenReaderAfterDelete() throws IOException { File dirFile = new File(TEMP_DIR, "deletetest"); Directory dir = FSDirectory.open(dirFile); @@ -1410,7 +1383,7 @@ writer.close(); SegmentInfos sis = new SegmentInfos(); - sis.read(d); + sis.read(d, CodecProvider.getDefault()); IndexReader r = IndexReader.open(d, false); IndexCommit c = r.getIndexCommit(); @@ -1597,6 +1570,7 @@ // LUCENE-1579: Ensure that on a cloned reader, segments // reuse the doc values arrays in FieldCache public void testFieldCacheReuseAfterClone() throws Exception { + //Codec.DEBUG = true; Directory dir = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); Document doc = new Document(); @@ -1750,7 +1724,6 @@ } catch (IllegalStateException ise) { // expected } - assertFalse(((SegmentReader) r.getSequentialSubReaders()[0]).termsIndexLoaded()); assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor()); writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); @@ -1763,10 +1736,13 @@ IndexReader[] subReaders = r2.getSequentialSubReaders(); assertEquals(2, subReaders.length); for(int i=0;i<2;i++) { - assertFalse(((SegmentReader) subReaders[i]).termsIndexLoaded()); + try { + subReaders[i].docFreq(new Term("field", "f")); + fail("did not hit expected exception"); + } catch (IllegalStateException ise) { + // expected + } } - r2.close(); - dir.close(); } // LUCENE-2046 Index: src/test/org/apache/lucene/index/TestIndexReaderReopen.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexReaderReopen.java (revision 931099) +++ src/test/org/apache/lucene/index/TestIndexReaderReopen.java (working copy) @@ -863,6 +863,8 @@ assertReaderClosed(reader, true, true); assertReaderClosed(firstReader, true, true); + FlexTestUtil.verifyFlexVsPreFlex(rnd, dir); + dir.close(); } Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 931099) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -23,11 +23,13 @@ import java.io.PrintStream; import java.io.Reader; import java.io.StringReader; +import java.util.List; import java.util.ArrayList; import java.util.Arrays; +import java.util.Set; +import java.util.HashSet; import java.util.HashMap; import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.Random; import java.util.concurrent.atomic.AtomicBoolean; @@ -49,6 +51,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; @@ -72,6 +75,7 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; import org.apache.lucene.util.ThreadInterruptedException; +import org.apache.lucene.util.BytesRef; public class TestIndexWriter extends LuceneTestCase { public TestIndexWriter(String name) { @@ -525,7 +529,7 @@ String[] startFiles = dir.listAll(); SegmentInfos infos = new SegmentInfos(); 
infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); + new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null, CodecProvider.getDefault()); String[] endFiles = dir.listAll(); Arrays.sort(startFiles); @@ -544,13 +548,12 @@ IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT))); - char[] chars = new char[DocumentsWriter.CHAR_BLOCK_SIZE-1]; + char[] chars = new char[DocumentsWriter.MAX_TERM_LENGTH_UTF8]; Arrays.fill(chars, 'x'); Document doc = new Document(); final String bigTerm = new String(chars); - // Max length term is 16383, so this contents produces - // a too-long term: + // This produces a too-long term: String contents = "abc xyz x" + bigTerm + " another term"; doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(doc); @@ -3306,7 +3309,7 @@ // LUCENE-510 public void testAllUnicodeChars() throws Throwable { - UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); + BytesRef utf8 = new BytesRef(10); UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); char[] chars = new char[2]; for(int ch=0;ch<0x0010FFFF;ch++) { @@ -3326,16 +3329,16 @@ UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8); String s1 = new String(chars, 0, len); - String s2 = new String(utf8.result, 0, utf8.length, "UTF-8"); + String s2 = new String(utf8.bytes, 0, utf8.length, "UTF-8"); assertEquals("codepoint " + ch, s1, s2); - UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16); + UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16); assertEquals("codepoint " + ch, s1, new String(utf16.result, 0, utf16.length)); byte[] b = s1.getBytes("UTF-8"); assertEquals(utf8.length, b.length); for(int j=0;j allTerms, boolean isTop) throws IOException { + TermsEnum terms = MultiFields.getFields(r).terms("f").iterator(); + + char[] last = new char[2]; + int lastLength = 0; + + Set seenTerms = new HashSet(); + + UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + while(true) { + final BytesRef term = terms.next(); + if (term == null) { + break; + } + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + assertTrue(utf16.length <= 2); + + // Make sure last term comes before current one, in + // UTF16 sort order + int i = 0; + for(i=0;i it = seenTerms.iterator(); + while(it.hasNext()) { + BytesRef tr = new BytesRef(it.next()); + assertEquals("seek failed for term=" + termDesc(tr.utf8ToString()), + TermsEnum.SeekStatus.FOUND, + terms.seek(tr)); + } + } + + private final String asUnicodeChar(char c) { + return "U+" + Integer.toHexString(c); + } + + private final String termDesc(String s) { + final String s0; + assertTrue(s.length() <= 2); + if (s.length() == 1) { + s0 = asUnicodeChar(s.charAt(0)); + } else { + s0 = asUnicodeChar(s.charAt(0)) + "," + asUnicodeChar(s.charAt(1)); + } + return s0; + } + + // Make sure terms, including ones with surrogate pairs, + // sort in UTF16 sort order by default + public void testTermUTF16SortOrder() throws Throwable { + Directory dir = new MockRAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + Document d = new Document(); + // Single segment + Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED); + d.add(f); + char[] chars = new char[2]; + Random rnd = newRandom(); + final Set allTerms = new HashSet(); + + for(int i=0;i<200;i++) { + + final String s; + if 
(rnd.nextBoolean()) { + // Single char + if (rnd.nextBoolean()) { + // Above surrogates + chars[0] = (char) getInt(rnd, 1+UnicodeUtil.UNI_SUR_LOW_END, 0xffff); + } else { + // Below surrogates + chars[0] = (char) getInt(rnd, 0, UnicodeUtil.UNI_SUR_HIGH_START-1); + } + s = new String(chars, 0, 1); + } else { + // Surrogate pair + chars[0] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_HIGH_START, UnicodeUtil.UNI_SUR_HIGH_END); + assertTrue(((int) chars[0]) >= UnicodeUtil.UNI_SUR_HIGH_START && ((int) chars[0]) <= UnicodeUtil.UNI_SUR_HIGH_END); + chars[1] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_LOW_START, UnicodeUtil.UNI_SUR_LOW_END); + s = new String(chars, 0, 2); + } + allTerms.add(s); + f.setValue(s); + + //System.out.println("add " + termDesc(s)); + writer.addDocument(d); + + if ((1+i) % 42 == 0) { + writer.commit(); + } + } + + IndexReader r = writer.getReader(); + + // Test each sub-segment + final IndexReader[] subs = r.getSequentialSubReaders(); + assertEquals(5, subs.length); + for(int i=0;i 256 + for(int i=0;i<300;i++) { + s.append(' ').append(""+i); + } + Document d = new Document(); + Field f = new Field("field", s.toString(), Field.Store.NO, Field.Index.ANALYZED); + d.add(f); + w.addDocument(d); + IndexReader r = w.getReader(2).getSequentialSubReaders()[0]; + TermsEnum t = r.fields().terms("field").iterator(); + int count = 0; + while(t.next() != null) { + final DocsEnum docs = t.docs(null, null); + assertEquals(0, docs.nextDoc()); + assertEquals(DocsEnum.NO_MORE_DOCS, docs.nextDoc()); + count++; + } + assertEquals(300, count); + r.close(); + w.close(); + dir.close(); + } + public void testDeleteUnusedFiles() throws Exception { for(int iter=0;iter<2;iter++) { Index: src/test/org/apache/lucene/index/TestIndexWriterConfig.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriterConfig.java (revision 931099) +++ src/test/org/apache/lucene/index/TestIndexWriterConfig.java (working copy) @@ -30,6 +30,7 @@ import org.apache.lucene.index.DocumentsWriter.IndexingChain; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; @@ -81,6 +82,7 @@ assertEquals(IndexWriterConfig.DEFAULT_READER_POOLING, conf.getReaderPooling()); assertTrue(DocumentsWriter.defaultIndexingChain == conf.getIndexingChain()); assertNull(conf.getMergedSegmentWarmer()); + assertEquals(IndexWriterConfig.DEFAULT_CODEC_PROVIDER, CodecProvider.getDefault()); assertEquals(IndexWriterConfig.DEFAULT_MAX_THREAD_STATES, conf.getMaxThreadStates()); assertEquals(LogByteSizeMergePolicy.class, conf.getMergePolicy().getClass()); @@ -101,6 +103,7 @@ getters.add("getMaxBufferedDocs"); getters.add("getIndexingChain"); getters.add("getMergedSegmentWarmer"); + getters.add("getCodecProvider"); getters.add("getMergePolicy"); getters.add("getMaxThreadStates"); getters.add("getReaderPooling"); Index: src/test/org/apache/lucene/index/TestIndexWriterDelete.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriterDelete.java (revision 931099) +++ src/test/org/apache/lucene/index/TestIndexWriterDelete.java (working copy) @@ -18,7 +18,6 @@ */ import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.analysis.WhitespaceAnalyzer; import 
org.apache.lucene.document.Document; @@ -770,30 +769,22 @@ } } - String[] startFiles = dir.listAll(); - SegmentInfos infos = new SegmentInfos(); - infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); - String[] endFiles = dir.listAll(); + TestIndexWriter.assertNoUnreferencedFiles(dir, "docsWriter.abort() failed to delete unreferenced files"); + modifier.close(); + } - if (!Arrays.equals(startFiles, endFiles)) { - fail("docswriter abort() failed to delete unreferenced files:\n before delete:\n " - + arrayToString(startFiles) + "\n after delete:\n " - + arrayToString(endFiles)); + public void testDeleteNullQuery() throws IOException { + Directory dir = new MockRAMDirectory(); + IndexWriter modifier = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + + for (int i = 0; i < 5; i++) { + addDoc(modifier, i, 2*i); } + modifier.deleteDocuments(new TermQuery(new Term("nada", "nada"))); + modifier.commit(); + assertEquals(5, modifier.numDocs()); modifier.close(); - + dir.close(); } - - private String arrayToString(String[] l) { - String s = ""; - for (int i = 0; i < l.length; i++) { - if (i > 0) { - s += "\n "; - } - s += l[i]; - } - return s; - } } Index: src/test/org/apache/lucene/index/TestIndexWriterReader.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriterReader.java (revision 931099) +++ src/test/org/apache/lucene/index/TestIndexWriterReader.java (working copy) @@ -84,7 +84,6 @@ // get a reader IndexReader r1 = writer.getReader(); - assertTrue(r1.isCurrent()); String id10 = r1.document(10).getField("id").stringValue(); @@ -92,20 +91,15 @@ newDoc.removeField("id"); newDoc.add(new Field("id", Integer.toString(8000), Store.YES, Index.NOT_ANALYZED)); writer.updateDocument(new Term("id", id10), newDoc); - assertFalse(r1.isCurrent()); IndexReader r2 = writer.getReader(); - assertTrue(r2.isCurrent()); assertEquals(0, count(new Term("id", id10), r2)); assertEquals(1, count(new Term("id", Integer.toString(8000)), r2)); r1.close(); writer.close(); - assertTrue(r2.isCurrent()); IndexReader r3 = IndexReader.open(dir1, true); - assertTrue(r3.isCurrent()); - assertTrue(r2.isCurrent()); assertEquals(0, count(new Term("id", id10), r3)); assertEquals(1, count(new Term("id", Integer.toString(8000)), r3)); @@ -149,18 +143,9 @@ createIndexNoClose(!optimize, "index2", writer2); writer2.close(); - IndexReader r0 = writer.getReader(); - assertTrue(r0.isCurrent()); writer.addIndexesNoOptimize(new Directory[] { dir2 }); - assertFalse(r0.isCurrent()); - r0.close(); IndexReader r1 = writer.getReader(); - assertTrue(r1.isCurrent()); - - writer.commit(); - assertTrue(r1.isCurrent()); - assertEquals(200, r1.maxDoc()); int index2df = r1.docFreq(new Term("indexname", "index2")); Index: src/test/org/apache/lucene/index/TestLazyProxSkipping.java =================================================================== --- src/test/org/apache/lucene/index/TestLazyProxSkipping.java (revision 931099) +++ src/test/org/apache/lucene/index/TestLazyProxSkipping.java (working copy) @@ -48,7 +48,7 @@ @Override public IndexInput openInput(String name) throws IOException { IndexInput ii = super.openInput(name); - if (name.endsWith(".prx")) { + if (name.endsWith(".prx") || name.endsWith(".pos") ) { // we decorate the proxStream with a wrapper class that allows to count the number of calls of seek() ii = new SeeksCountingStream(ii); } @@ -107,7 +107,7 @@ // check if the number of calls 
of seek() does not exceed the number of hits assertTrue(this.seeksCounter > 0); - assertTrue(this.seeksCounter <= numHits + 1); + assertTrue("seeksCounter=" + this.seeksCounter + " numHits=" + numHits, this.seeksCounter <= numHits + 1); } public void testLazySkipping() throws IOException { Index: src/test/org/apache/lucene/index/TestMultiLevelSkipList.java =================================================================== --- src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (revision 931099) +++ src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (working copy) @@ -29,8 +29,9 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; import org.apache.lucene.util.LuceneTestCase; /** @@ -42,8 +43,18 @@ * */ public class TestMultiLevelSkipList extends LuceneTestCase { + + class CountingRAMDirectory extends MockRAMDirectory { + public IndexInput openInput(String fileName) throws IOException { + IndexInput in = super.openInput(fileName); + if (fileName.endsWith(".frq")) + in = new CountingStream(in); + return in; + } + } + public void testSimpleSkip() throws IOException { - RAMDirectory dir = new RAMDirectory(); + Directory dir = new CountingRAMDirectory(); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer())); Term term = new Term("test", "a"); for (int i = 0; i < 5000; i++) { @@ -56,9 +67,8 @@ writer.close(); IndexReader reader = SegmentReader.getOnlySegmentReader(dir); - SegmentTermPositions tp = (SegmentTermPositions) reader.termPositions(); - tp.freqStream = new CountingStream(tp.freqStream); - + TermPositions tp = reader.termPositions(); + for (int i = 0; i < 2; i++) { counter = 0; tp.seek(term); Index: src/test/org/apache/lucene/index/TestNorms.java =================================================================== --- src/test/org/apache/lucene/index/TestNorms.java (revision 931099) +++ src/test/org/apache/lucene/index/TestNorms.java (working copy) @@ -186,6 +186,7 @@ assertEquals("stored norm value of "+field+" for doc "+j+" is "+norm+" - a mismatch!", norm, norm1, 0.000001); } } + ir.close(); } private void addDocs(Directory dir, int ndocs, boolean compound) throws IOException { Index: src/test/org/apache/lucene/index/TestOmitTf.java =================================================================== --- src/test/org/apache/lucene/index/TestOmitTf.java (revision 931099) +++ src/test/org/apache/lucene/index/TestOmitTf.java (working copy) @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.Collection; +import java.util.Random; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -26,13 +27,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Searcher; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.Directory; import 
org.apache.lucene.store.MockRAMDirectory; @@ -85,20 +80,26 @@ // keep things constant d = new Document(); - // Reverese + // Reverse f1.setOmitTermFreqAndPositions(true); d.add(f1); f2.setOmitTermFreqAndPositions(false); d.add(f2); + Random rnd = newRandom(); + writer.addDocument(d); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); + // force merge writer.optimize(); // flush writer.close(); _TestUtil.checkIndex(ram); + FlexTestUtil.verifyFlexVsPreFlex(rnd, ram); + SegmentReader reader = SegmentReader.getOnlySegmentReader(ram); FieldInfos fi = reader.fieldInfos(); assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f1").omitTermFreqAndPositions); @@ -144,8 +145,12 @@ for(int i=0;i<30;i++) writer.addDocument(d); + Random rnd = newRandom(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); + // force merge writer.optimize(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); // flush writer.close(); @@ -289,6 +294,15 @@ TermQuery q3 = new TermQuery(c); TermQuery q4 = new TermQuery(d); + PhraseQuery pq = new PhraseQuery(); + pq.add(a); + pq.add(c); + try { + searcher.search(pq, 10); + fail("did not hit expected exception"); + } catch (IllegalStateException ise) { + // expected + } searcher.search(q1, new CountingHitCollector() { @@ -380,7 +394,7 @@ super.collect(doc); } }); - assertTrue(15 == CountingHitCollector.getCount()); + assertEquals(15, CountingHitCollector.getCount()); searcher.close(); dir.close(); Index: src/test/org/apache/lucene/index/TestPayloads.java =================================================================== --- src/test/org/apache/lucene/index/TestPayloads.java (revision 931099) +++ src/test/org/apache/lucene/index/TestPayloads.java (working copy) @@ -39,7 +39,8 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; @@ -99,7 +100,7 @@ // payload bit in the FieldInfo public void testPayloadFieldBit() throws Exception { rnd = newRandom(); - Directory ram = new RAMDirectory(); + Directory ram = new MockRAMDirectory(); PayloadAnalyzer analyzer = new PayloadAnalyzer(); IndexWriter writer = new IndexWriter(ram, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); Document d = new Document(); @@ -139,6 +140,9 @@ analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1); analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3); writer.addDocument(d); + + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); + // force merge writer.optimize(); // flush @@ -149,14 +153,15 @@ assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads); - reader.close(); + reader.close(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, ram); } // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory public void testPayloadsEncoding() throws Exception { rnd = newRandom(); // first perform the test using a RAMDirectory - Directory dir = new RAMDirectory(); + Directory dir = new MockRAMDirectory(); performTest(dir); // now use a FSDirectory and repeat same test @@ -215,7 +220,9 @@ writer.addDocument(d); } + 
FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); writer.optimize(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); // flush writer.close(); @@ -260,11 +267,17 @@ TermPositions tp = reader.termPositions(terms[0]); tp.next(); tp.nextPosition(); + // NOTE: prior rev of this test was failing to first + // call next here: + tp.next(); // now we don't read this payload tp.nextPosition(); assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); byte[] payload = tp.getPayload(null, 0); assertEquals(payload[0], payloadData[numTerms]); + // NOTE: prior rev of this test was failing to first + // call next here: + tp.next(); tp.nextPosition(); // we don't read this payload and skip to a different document @@ -321,7 +334,9 @@ writer.addDocument(d); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); writer.optimize(); + FlexTestUtil.verifyFlexVsPreFlex(rnd, writer); // flush writer.close(); @@ -469,7 +484,7 @@ final int numDocs = 50; final ByteArrayPool pool = new ByteArrayPool(numThreads, 5); - Directory dir = new RAMDirectory(); + Directory dir = new MockRAMDirectory(); final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); final String field = "test"; @@ -563,13 +578,13 @@ } } - private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); + private BytesRef utf8Result = new BytesRef(10); synchronized String bytesToString(byte[] bytes) { String s = new String(bytes); UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); try { - return new String(utf8Result.result, 0, utf8Result.length, "UTF-8"); + return new String(utf8Result.bytes, 0, utf8Result.length, "UTF-8"); } catch (UnsupportedEncodingException uee) { return null; } Index: src/test/org/apache/lucene/index/TestSegmentMerger.java =================================================================== --- src/test/org/apache/lucene/index/TestSegmentMerger.java (revision 931099) +++ src/test/org/apache/lucene/index/TestSegmentMerger.java (working copy) @@ -18,9 +18,11 @@ */ import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.document.Document; +import org.apache.lucene.index.codecs.CodecProvider; import java.io.IOException; import java.util.Collection; @@ -63,14 +65,16 @@ } public void testMerge() throws IOException { - SegmentMerger merger = new SegmentMerger(mergedDir, mergedSegment); + SegmentMerger merger = new SegmentMerger(mergedDir, IndexWriter.DEFAULT_TERM_INDEX_INTERVAL, mergedSegment, null, CodecProvider.getDefault()); merger.add(reader1); merger.add(reader2); int docsMerged = merger.merge(); merger.closeReaders(); assertTrue(docsMerged == 2); //Should be able to open a new SegmentReader against the new directory - SegmentReader mergedReader = SegmentReader.get(true, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, true), IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); + SegmentReader mergedReader = SegmentReader.get(false, mergedDir, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, true, + -1, null, false, merger.hasProx(), merger.getCodec()), BufferedIndexInput.BUFFER_SIZE, true, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, null); + assertTrue(mergedReader != null); assertTrue(mergedReader.numDocs() == 2); Document newDoc1 = mergedReader.document(0); Index: src/test/org/apache/lucene/index/TestSegmentReader.java 
=================================================================== --- src/test/org/apache/lucene/index/TestSegmentReader.java (revision 931099) +++ src/test/org/apache/lucene/index/TestSegmentReader.java (working copy) @@ -136,6 +136,9 @@ TermPositions positions = reader.termPositions(); assertTrue(positions != null); positions.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); + // NOTE: prior rev of this test was failing to first + // call next here: + assertTrue(positions.next()); assertTrue(positions.doc() == 0); assertTrue(positions.nextPosition() >= 0); } Index: src/test/org/apache/lucene/index/TestSegmentTermDocs.java =================================================================== --- src/test/org/apache/lucene/index/TestSegmentTermDocs.java (revision 931099) +++ src/test/org/apache/lucene/index/TestSegmentTermDocs.java (working copy) @@ -56,13 +56,13 @@ SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); assertEquals(indexDivisor, reader.getTermInfosIndexDivisor()); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - segTermDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); - if (segTermDocs.next() == true) - { - int docId = segTermDocs.doc(); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); + if (termDocs.next() == true) { + int docId = termDocs.doc(); assertTrue(docId == 0); - int freq = segTermDocs.freq(); + int freq = termDocs.freq(); assertTrue(freq == 3); } reader.close(); @@ -77,18 +77,20 @@ //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - segTermDocs.seek(new Term("textField2", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("textField2", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(true, info, indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - segTermDocs.seek(new Term("junk", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("junk", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } } Index: src/test/org/apache/lucene/index/TestSegmentTermEnum.java =================================================================== --- src/test/org/apache/lucene/index/TestSegmentTermEnum.java (revision 931099) +++ src/test/org/apache/lucene/index/TestSegmentTermEnum.java (working copy) @@ -67,14 +67,16 @@ addDoc(writer, "aaa bbb"); writer.close(); SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); - SegmentTermEnum termEnum = (SegmentTermEnum) reader.terms(); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.term().text()); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.prev().text()); - assertEquals("bbb", termEnum.term().text()); - assertFalse(termEnum.next()); - assertEquals("bbb", termEnum.prev().text()); + TermsEnum terms = reader.fields().terms("content").iterator(); + assertNotNull(terms.next()); + assertEquals("aaa", terms.term().utf8ToString()); + assertNotNull(terms.next()); + long ordB = terms.ord(); + 
assertEquals("bbb", terms.term().utf8ToString()); + assertNull(terms.next()); + + assertEquals(TermsEnum.SeekStatus.FOUND, terms.seek(ordB)); + assertEquals("bbb", terms.term().utf8ToString()); } private void verifyDocFreq() Index: src/test/org/apache/lucene/index/TestStressIndexing.java =================================================================== --- src/test/org/apache/lucene/index/TestStressIndexing.java (revision 931099) +++ src/test/org/apache/lucene/index/TestStressIndexing.java (working copy) @@ -26,7 +26,7 @@ import java.util.Random; import java.io.File; -public class TestStressIndexing extends LuceneTestCase { +public class TestStressIndexing extends MultiCodecTestCase { private Random RANDOM; private static abstract class TimedThread extends Thread { @@ -152,6 +152,8 @@ modifier.close(); + FlexTestUtil.verifyFlexVsPreFlex(RANDOM, directory); + for(int i=0;i docs = indexRandom(nThreads, iter, range, dir1, maxThreadStates, doReaderPooling); + //System.out.println("TEST: index serial"); indexSerial(docs, dir2); + //System.out.println("TEST: verify"); verifyEquals(dir1, dir2, "id"); + + FlexTestUtil.verifyFlexVsPreFlex(r, dir1); + FlexTestUtil.verifyFlexVsPreFlex(r, dir2); } } @@ -216,7 +225,7 @@ threads[i].join(); } - // w.optimize(); + //w.optimize(); w.close(); for (int i=0; i limit) break; - set.set(docId-docBase); + if (docId >= docBase) { + set.set(docId-docBase); + } } - docBase = limit; return set.isEmpty()?null:set; } public void reset(){ index = 0; - docBase = 0; } } Index: src/test/org/apache/lucene/search/TestFuzzyQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestFuzzyQuery.java (revision 931099) +++ src/test/org/apache/lucene/search/TestFuzzyQuery.java (working copy) @@ -23,17 +23,17 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockRAMDirectory; -import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; /** * Tests {@link FuzzyQuery}. 
@@ -378,5 +378,10 @@ doc.add(new Field("field", text, Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new FuzzyQuery(new Term("dummy", "dummy")).hasNewAPI); + } } Index: src/test/org/apache/lucene/search/TestMultiPhraseQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (revision 931099) +++ src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (working copy) @@ -22,14 +22,17 @@ import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; + import java.io.IOException; +import java.util.HashSet; import java.util.LinkedList; import java.util.Collections; @@ -45,7 +48,7 @@ } public void testPhrasePrefix() throws IOException { - RAMDirectory indexStore = new RAMDirectory(); + MockRAMDirectory indexStore = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(TEST_VERSION_CURRENT, new SimpleAnalyzer(TEST_VERSION_CURRENT))); add("blueberry pie", writer); add("blueberry strudel", writer); @@ -101,6 +104,7 @@ termsWithPrefix.add(te.term()); } } while (te.next()); + ir.close(); query3.add(termsWithPrefix.toArray(new Term[0])); query3.add(new Term("body", "pizza")); @@ -139,7 +143,7 @@ // and all terms required. // The contained PhraseMultiQuery must contain exactly one term array. 
- RAMDirectory indexStore = new RAMDirectory(); + MockRAMDirectory indexStore = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(TEST_VERSION_CURRENT, new SimpleAnalyzer(TEST_VERSION_CURRENT))); add("blueberry pie", writer); add("blueberry chewing gum", writer); @@ -164,10 +168,11 @@ assertEquals("Wrong number of hits", 2, hits.length); searcher.close(); + indexStore.close(); } public void testPhrasePrefixWithBooleanQuery() throws IOException { - RAMDirectory indexStore = new RAMDirectory(); + MockRAMDirectory indexStore = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig( TEST_VERSION_CURRENT, new StandardAnalyzer( TEST_VERSION_CURRENT, Collections.emptySet()))); @@ -190,7 +195,24 @@ ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs; assertEquals("Wrong number of hits", 0, hits.length); searcher.close(); + indexStore.close(); } + + public void testNoDocs() throws Exception { + MockRAMDirectory indexStore = new MockRAMDirectory(); + IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(Version.LUCENE_CURRENT, new HashSet(0)), true, IndexWriter.MaxFieldLength.LIMITED); + add("a note", "note", writer); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(indexStore, true); + + MultiPhraseQuery q = new MultiPhraseQuery(); + q.add(new Term("body", "a")); + q.add(new Term[] { new Term("body", "nope"), new Term("body", "nope") }); + assertEquals("Wrong number of hits", 0, searcher.search(q, null, 1).totalHits); + searcher.close(); + indexStore.close(); + } public void testHashCodeAndEquals(){ MultiPhraseQuery query1 = new MultiPhraseQuery(); Index: src/test/org/apache/lucene/search/TestMultiSearcher.java =================================================================== --- src/test/org/apache/lucene/search/TestMultiSearcher.java (revision 931099) +++ src/test/org/apache/lucene/search/TestMultiSearcher.java (working copy) @@ -403,22 +403,9 @@ // The scores from the IndexSearcher and Multisearcher should be the same // if the same similarity is used. - assertEquals("MultiSearcher score must be equal to single searcher score!", score1, scoreN, 1e-6); + assertEquals("MultiSearcher score must be equal to single esrcher score!", score1, scoreN, 1e-6); } - public void testDocFreq() throws IOException{ - RAMDirectory dir1 = new RAMDirectory(); - RAMDirectory dir2 = new RAMDirectory(); - - initIndex(dir1, 10, true, "x"); // documents with two tokens "doc0" and "x", "doc1" and x, etc... - initIndex(dir2, 5, true, "x"); // documents with two tokens "doc0" and "x", "doc1" and x, etc... 
- IndexSearcher searcher1 = new IndexSearcher(dir1, true); - IndexSearcher searcher2 = new IndexSearcher(dir2, true); - - MultiSearcher multiSearcher = getMultiSearcherInstance(new Searcher[]{searcher1, searcher2}); - assertEquals(15, multiSearcher.docFreq(new Term("contents","x"))); - } - public void testCreateDocFrequencyMap() throws IOException{ RAMDirectory dir1 = new RAMDirectory(); RAMDirectory dir2 = new RAMDirectory(); Index: src/test/org/apache/lucene/search/TestNumericRangeQuery32.java =================================================================== --- src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 931099) +++ src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (working copy) @@ -24,9 +24,11 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCaseJ4; import org.apache.lucene.util.NumericUtils; @@ -331,9 +333,15 @@ if (lower>upper) { int a=lower; lower=upper; upper=a; } + final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_INT), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_INT); + NumericUtils.intToPrefixCoded(lower, 0, lowerBytes); + NumericUtils.intToPrefixCoded(upper, 0, upperBytes); + // TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string! + final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString(); + // test inclusive range NumericRangeQuery tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, true, true); - TermRangeQuery cq=new TermRangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), true, true); + TermRangeQuery cq=new TermRangeQuery(field, lowerString, upperString, true, true); TopDocs tTopDocs = searcher.search(tq, 1); TopDocs cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -341,7 +349,7 @@ termCountC += cq.getTotalNumberOfTerms(); // test exclusive range tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, false, false); - cq=new TermRangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), false, false); + cq=new TermRangeQuery(field, lowerString, upperString, false, false); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -349,7 +357,7 @@ termCountC += cq.getTotalNumberOfTerms(); // test left exclusive range tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, false, true); - cq=new TermRangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), false, true); + cq=new TermRangeQuery(field, lowerString, upperString, false, true); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -357,7 +365,7 @@ termCountC += cq.getTotalNumberOfTerms(); // test right exclusive range 
tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, true, false); - cq=new TermRangeQuery(field, NumericUtils.intToPrefixCoded(lower), NumericUtils.intToPrefixCoded(upper), true, false); + cq=new TermRangeQuery(field, lowerString, upperString, true, false); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -549,23 +557,24 @@ } private void testEnum(int lower, int upper) throws Exception { - NumericRangeQuery q = NumericRangeQuery.newIntRange("field4", 4, lower, upper, true, true); - FilteredTermEnum termEnum = q.getEnum(searcher.getIndexReader()); - try { - int count = 0; - do { - final Term t = termEnum.term(); - if (t != null) { - final int val = NumericUtils.prefixCodedToInt(t.text()); - assertTrue("value not in bounds", val >= lower && val <= upper); - count++; - } else break; - } while (termEnum.next()); - assertFalse(termEnum.next()); - if (VERBOSE) System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + "] contained " + count + " terms."); - } finally { - termEnum.close(); - } + NumericRangeQuery q = NumericRangeQuery.newIntRange("field4", 4, + lower, upper, true, true); + TermsEnum termEnum = q.getTermsEnum(searcher.getIndexReader()); + int count = 0; + while (termEnum.next() != null) { + final BytesRef t = termEnum.term(); + if (t != null) { + final int val = NumericUtils.prefixCodedToInt(t); + assertTrue("value not in bounds " + val + " >= " + lower + " && " + + val + " <= " + upper, val >= lower && val <= upper); + count++; + } else + break; + } + assertNull(termEnum.next()); + if (VERBOSE) System.out.println("TermEnum on 'field4' for range [" + lower + "," + upper + + "] contained " + count + " terms."); + } @Test Index: src/test/org/apache/lucene/search/TestNumericRangeQuery64.java =================================================================== --- src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (revision 931099) +++ src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (working copy) @@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCaseJ4; import org.apache.lucene.util.NumericUtils; @@ -350,9 +351,15 @@ if (lower>upper) { long a=lower; lower=upper; upper=a; } + final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG); + NumericUtils.longToPrefixCoded(lower, 0, lowerBytes); + NumericUtils.longToPrefixCoded(upper, 0, upperBytes); + // TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string! 
+ final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString(); + // test inclusive range NumericRangeQuery tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, true); - TermRangeQuery cq=new TermRangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), true, true); + TermRangeQuery cq=new TermRangeQuery(field, lowerString, upperString, true, true); TopDocs tTopDocs = searcher.search(tq, 1); TopDocs cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -360,7 +367,7 @@ termCountC += cq.getTotalNumberOfTerms(); // test exclusive range tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, false, false); - cq=new TermRangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), false, false); + cq=new TermRangeQuery(field, lowerString, upperString, false, false); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -368,7 +375,7 @@ termCountC += cq.getTotalNumberOfTerms(); // test left exclusive range tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, false, true); - cq=new TermRangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), false, true); + cq=new TermRangeQuery(field, lowerString, upperString, false, true); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -376,7 +383,7 @@ termCountC += cq.getTotalNumberOfTerms(); // test right exclusive range tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, false); - cq=new TermRangeQuery(field, NumericUtils.longToPrefixCoded(lower), NumericUtils.longToPrefixCoded(upper), true, false); + cq=new TermRangeQuery(field, lowerString, upperString, true, false); tTopDocs = searcher.search(tq, 1); cTopDocs = searcher.search(cq, 1); assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits ); @@ -583,4 +590,9 @@ // difference to int range is tested in TestNumericRangeQuery32 } + @Test @Deprecated + public void testBackwardsLayer() { + assertTrue(NumericRangeQuery.newLongRange("dummy", null, null, true, true).hasNewAPI); + } + } Index: src/test/org/apache/lucene/search/TestPositionIncrement.java =================================================================== --- src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 931099) +++ src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy) @@ -22,6 +22,7 @@ import java.io.StringReader; import java.util.Collection; import java.util.Collections; +import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; @@ -61,6 +62,8 @@ */ public class TestPositionIncrement extends LuceneTestCase { + final static boolean VERBOSE = false; + public void testSetPosition() throws Exception { Analyzer analyzer = new Analyzer() { @Override @@ -242,8 +245,8 @@ IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( TEST_VERSION_CURRENT, new TestPayloadAnalyzer())); Document doc = new Document(); - doc.add(new Field("content", - new 
StringReader("a a b c d e a f g h i j a b k k"))); + doc.add(new Field("content", new StringReader( + "a a b c d e a f g h i j a b k k"))); writer.addDocument(doc); IndexReader r = writer.getReader(); @@ -271,31 +274,44 @@ count = 0; boolean sawZero = false; - //System.out.println("\ngetPayloadSpans test"); + if (VERBOSE) { + System.out.println("\ngetPayloadSpans test"); + } Spans pspans = snq.getSpans(is.getIndexReader()); while (pspans.next()) { - //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end()); + if (VERBOSE) { + System.out.println("doc " + pspans.doc() + ": span " + pspans.start() + + " to " + pspans.end()); + } Collection payloads = pspans.getPayload(); sawZero |= pspans.start() == 0; - count += payloads.size(); + for (@SuppressWarnings("unused") byte[] bytes : payloads) { + count++; + if (!VERBOSE) { + // do nothing + } else { + System.out.println(" payload: " + new String((byte[]) bytes)); + } + } } assertEquals(5, count); assertTrue(sawZero); - //System.out.println("\ngetSpans test"); + // System.out.println("\ngetSpans test"); Spans spans = snq.getSpans(is.getIndexReader()); count = 0; sawZero = false; while (spans.next()) { count++; sawZero |= spans.start() == 0; - //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end()); + // System.out.println(spans.doc() + " - " + spans.start() + " - " + + // spans.end()); } assertEquals(4, count); assertTrue(sawZero); - - //System.out.println("\nPayloadSpanUtil test"); + // System.out.println("\nPayloadSpanUtil test"); + sawZero = false; PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader()); Collection pls = psu.getPayloadsForQuery(snq); @@ -355,7 +371,9 @@ } posIncrAttr.setPositionIncrement(posIncr); pos += posIncr; - // System.out.println("term=" + termAttr.term() + " pos=" + pos); + if (TestPositionIncrement.VERBOSE) { + System.out.println("term=" + termAttr.term() + " pos=" + pos); + } i++; return true; } else { Index: src/test/org/apache/lucene/search/TestPrefixQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestPrefixQuery.java (revision 931099) +++ src/test/org/apache/lucene/search/TestPrefixQuery.java (working copy) @@ -53,5 +53,15 @@ query = new PrefixQuery(new Term("category", "/Computers/Mac")); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals("One in /Computers/Mac", 1, hits.length); + + query = new PrefixQuery(new Term("category", "")); + assertFalse(query.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("everything", 3, hits.length); } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new PrefixQuery(new Term("dummy", "dummy")).hasNewAPI); + } } Index: src/test/org/apache/lucene/search/TestSort.java =================================================================== --- src/test/org/apache/lucene/search/TestSort.java (revision 931099) +++ src/test/org/apache/lucene/search/TestSort.java (working copy) @@ -37,6 +37,7 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LogMergePolicy; import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.FieldValueHitQueue.Entry; @@ -277,7 +278,7 @@ sort.setSort( new SortField("string", SortField.STRING), new SortField("string2", SortField.STRING, true), 
- SortField.FIELD_DOC ); + SortField.FIELD_DOC); result = searcher.search(new MatchAllDocsQuery(), null, 500, sort).scoreDocs; @@ -336,56 +337,56 @@ FieldCache fc = FieldCache.DEFAULT; - sort.setSort (new SortField ("parser", new FieldCache.IntParser(){ - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.IntParser(){ + public final int parseInt(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 123456; } - }), SortField.FIELD_DOC ); + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " IntParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.FloatParser(){ - public final float parseFloat(final String val) { - return (float) Math.sqrt( val.charAt(0) ); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.FloatParser(){ + public final float parseFloat(final BytesRef term) { + return (float) Math.sqrt( term.bytes[term.offset] ); } - }), SortField.FIELD_DOC ); + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " FloatParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.LongParser(){ - public final long parseLong(final String val) { - return (val.charAt(0)-'A') * 1234567890L; + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.LongParser(){ + public final long parseLong(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 1234567890L; } - }), SortField.FIELD_DOC ); + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " LongParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.DoubleParser(){ - public final double parseDouble(final String val) { - return Math.pow( val.charAt(0), (val.charAt(0)-'A') ); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.DoubleParser(){ + public final double parseDouble(final BytesRef term) { + return Math.pow( term.bytes[term.offset], (term.bytes[term.offset]-'A') ); } - }), SortField.FIELD_DOC ); + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " DoubleParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.ByteParser(){ - public final byte parseByte(final String val) { - return (byte) (val.charAt(0)-'A'); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.ByteParser(){ + public final byte parseByte(final BytesRef term) { + return (byte) (term.bytes[term.offset]-'A'); } - }), SortField.FIELD_DOC ); + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " ByteParser"); fc.purgeAllCaches(); - sort.setSort (new SortField ("parser", new FieldCache.ShortParser(){ - public final short parseShort(final String val) { - return (short) (val.charAt(0)-'A'); + sort.setSort (new SortField[] { new SortField ("parser", new FieldCache.ShortParser(){ + public final short parseShort(final BytesRef term) { + return (short) (term.bytes[term.offset]-'A'); } - }), SortField.FIELD_DOC ); + }), SortField.FIELD_DOC }); assertMatches (full, queryA, sort, "JIHGFEDCBA"); assertSaneFieldCaches(getName() + " ShortParser"); fc.purgeAllCaches(); @@ -443,8 +444,8 @@ @Override public void setNextReader(IndexReader reader, int docBase) throws 
IOException { docValues = FieldCache.DEFAULT.getInts(reader, "parser", new FieldCache.IntParser() { - public final int parseInt(final String val) { - return (val.charAt(0)-'A') * 123456; + public final int parseInt(final BytesRef term) { + return (term.bytes[term.offset]-'A') * 123456; } }); } Index: src/test/org/apache/lucene/search/TestTermRangeQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestTermRangeQuery.java (revision 931099) +++ src/test/org/apache/lucene/search/TestTermRangeQuery.java (working copy) @@ -54,19 +54,11 @@ Query query = new TermRangeQuery("content", "A", "C", false, false); initializeIndex(new String[] {"A", "B", "C", "D"}); IndexSearcher searcher = new IndexSearcher(dir, true); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("A,B,C,D, only B in range", 1, hits.length); - searcher.close(); - initializeIndex(new String[] {"A", "B", "D"}); - searcher = new IndexSearcher(dir, true); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("A,B,D, only B in range", 1, hits.length); - searcher.close(); addDoc("C"); searcher = new IndexSearcher(dir, true); - hits = searcher.search(query, null, 1000).scoreDocs; + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals("C added, still only B in range", 1, hits.length); searcher.close(); } @@ -92,6 +84,25 @@ assertEquals("C added - A, B, C in range", 3, hits.length); searcher.close(); } + + public void testAllDocs() throws Exception { + initializeIndex(new String[]{"A", "B", "C", "D"}); + IndexSearcher searcher = new IndexSearcher(dir, true); + TermRangeQuery query = new TermRangeQuery("content", null, null, true, true); + assertFalse(query.getTermsEnum(searcher.getIndexReader()) instanceof TermRangeTermsEnum); + assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length); + query = new TermRangeQuery("content", null, null, false, false); + assertFalse(query.getTermsEnum(searcher.getIndexReader()) instanceof TermRangeTermsEnum); + assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length); + query = new TermRangeQuery("content", "", null, true, false); + assertFalse(query.getTermsEnum(searcher.getIndexReader()) instanceof TermRangeTermsEnum); + assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length); + // and now another one + query = new TermRangeQuery("content", "B", null, true, false); + assertTrue(query.getTermsEnum(searcher.getIndexReader()) instanceof TermRangeTermsEnum); + assertEquals(3, searcher.search(query, null, 1000).scoreDocs.length); + searcher.close(); + } /** This test should not be here, but it tests the fuzzy query rewrite mode (TOP_TERMS_SCORING_BOOLEAN_REWRITE) * with constant score and checks, that only the lower end of terms is put into the range */ @@ -402,4 +413,9 @@ //assertEquals("C added => A,B,,C in range", 3, hits.length()); searcher.close(); } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new TermRangeQuery("dummy", null, null, true, true).hasNewAPI); + } } Index: src/test/org/apache/lucene/search/TestTermScorer.java =================================================================== --- src/test/org/apache/lucene/search/TestTermScorer.java (revision 931099) +++ src/test/org/apache/lucene/search/TestTermScorer.java (working copy) @@ -71,9 +71,8 @@ Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), -
indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); //we have 2 documents with the term all in them, one document for all the other values final List docs = new ArrayList(); //must call next first @@ -137,9 +136,8 @@ Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertTrue("score is not correct", ts.score() == 1.6931472f); assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); @@ -147,16 +145,15 @@ assertTrue("next returned a doc and it should not have", ts.nextDoc() == DocIdSetIterator.NO_MORE_DOCS); } - public void testSkipTo() throws Exception { + public void testAdvance() throws Exception { Term allTerm = new Term(FIELD, "all"); TermQuery termQuery = new TermQuery(allTerm); Weight weight = termQuery.weight(indexSearcher); - TermScorer ts = new TermScorer(weight, - indexReader.termDocs(allTerm), indexSearcher.getSimilarity(), - indexReader.norms(FIELD)); + Scorer ts = weight.scorer(indexSearcher.getIndexReader(), + true, true); assertTrue("Didn't skip", ts.advance(3) != DocIdSetIterator.NO_MORE_DOCS); //The next doc should be doc 5 assertTrue("doc should be number 5", ts.docID() == 5); Index: src/test/org/apache/lucene/search/TestWildcard.java =================================================================== --- src/test/org/apache/lucene/search/TestWildcard.java (revision 931099) +++ src/test/org/apache/lucene/search/TestWildcard.java (working copy) @@ -24,6 +24,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.Index; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; @@ -121,30 +122,12 @@ MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*")); assertMatches(searcher, wq, 2); - MultiTermQuery expected = new PrefixQuery(new Term("field", "prefix")); - wq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - wq.setBoost(0.1F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); + assertTrue(wq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum); - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); - wq.setBoost(0.2F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); - wq.setBoost(0.3F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); - - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); - wq.setBoost(0.4F); - expected.setRewriteMethod(wq.getRewriteMethod()); - expected.setBoost(wq.getBoost()); - assertEquals(searcher.rewrite(expected), searcher.rewrite(wq)); + wq = new WildcardQuery(new Term("field", "*")); + assertMatches(searcher, wq, 2); + assertFalse(wq.getTermsEnum(searcher.getIndexReader()) instanceof PrefixTermsEnum); + 
assertFalse(wq.getTermsEnum(searcher.getIndexReader()) instanceof AutomatonTermsEnum); } /** @@ -326,5 +309,62 @@ searcher.close(); } + @Deprecated + private static final class OldWildcardQuery extends MultiTermQuery { + final Term term; + OldWildcardQuery(Term term) { + this.term = term; + } + + @Override + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new WildcardTermEnum(reader, term); + } + + @Override + public String toString(String field) { + return "OldWildcard(" + term.toString()+ ")"; + } + } + + @Deprecated + public void testDeprecatedTermEnum() throws Exception { + RAMDirectory indexStore = getIndexStore("body", new String[] + {"metal", "metals"}); + IndexSearcher searcher = new IndexSearcher(indexStore, true); + Query query1 = new TermQuery(new Term("body", "metal")); + Query query2 = new OldWildcardQuery(new Term("body", "metal*")); + Query query3 = new OldWildcardQuery(new Term("body", "m*tal")); + Query query4 = new OldWildcardQuery(new Term("body", "m*tal*")); + Query query5 = new OldWildcardQuery(new Term("body", "m*tals")); + + BooleanQuery query6 = new BooleanQuery(); + query6.add(query5, BooleanClause.Occur.SHOULD); + + BooleanQuery query7 = new BooleanQuery(); + query7.add(query3, BooleanClause.Occur.SHOULD); + query7.add(query5, BooleanClause.Occur.SHOULD); + + // Queries do not automatically lower-case search terms: + Query query8 = new OldWildcardQuery(new Term("body", "M*tal*")); + + assertMatches(searcher, query1, 1); + assertMatches(searcher, query2, 2); + assertMatches(searcher, query3, 1); + assertMatches(searcher, query4, 2); + assertMatches(searcher, query5, 1); + assertMatches(searcher, query6, 1); + assertMatches(searcher, query7, 2); + assertMatches(searcher, query8, 0); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tall")), 0); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal")), 1); + assertMatches(searcher, new OldWildcardQuery(new Term("body", "*tal*")), 2); + } + + @Deprecated + public void testBackwardsLayer() { + assertTrue(new WildcardQuery(new Term("body", "metal*")).hasNewAPI); + assertFalse(new OldWildcardQuery(new Term("body", "metal*")).hasNewAPI); + } } Index: src/test/org/apache/lucene/store/MockRAMDirectory.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMDirectory.java (revision 931099) +++ src/test/org/apache/lucene/store/MockRAMDirectory.java (working copy) @@ -205,8 +205,10 @@ if (crashed) throw new IOException("cannot createOutput after crash"); init(); - if (preventDoubleWrite && createdFiles.contains(name) && !name.equals("segments.gen")) - throw new IOException("file \"" + name + "\" was already written to"); + synchronized(this) { + if (preventDoubleWrite && createdFiles.contains(name) && !name.equals("segments.gen")) + throw new IOException("file \"" + name + "\" was already written to"); + } if (noDeleteOpenFile && openFiles.containsKey(name)) throw new IOException("MockRAMDirectory: file \"" + name + "\" is still open: cannot overwrite"); RAMFile file = new RAMFile(this); @@ -229,7 +231,7 @@ return new MockRAMOutputStream(this, file, name); } - + @Override public synchronized IndexInput openInput(String name) throws IOException { RAMFile file = fileMap.get(name); @@ -237,11 +239,11 @@ throw new FileNotFoundException(name); else { if (openFiles.containsKey(name)) { - Integer v = openFiles.get(name); + Integer v = (Integer) openFiles.get(name); v = Integer.valueOf(v.intValue()+1); 
openFiles.put(name, v); } else { - openFiles.put(name, Integer.valueOf(1)); + openFiles.put(name, Integer.valueOf(1)); } } return new MockRAMInputStream(this, name, file); @@ -277,7 +279,7 @@ if (noDeleteOpenFile && openFiles.size() > 0) { // RuntimeException instead of IOException because // super() does not throw IOException currently: - throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files: " + openFiles); + throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files"); } } Index: src/test/org/apache/lucene/store/MockRAMInputStream.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMInputStream.java (revision 931099) +++ src/test/org/apache/lucene/store/MockRAMInputStream.java (working copy) @@ -45,7 +45,7 @@ // all clones get closed: if (!isClone) { synchronized(dir) { - Integer v = dir.openFiles.get(name); + Integer v = (Integer) dir.openFiles.get(name); // Could be null when MockRAMDirectory.crash() was called if (v != null) { if (v.intValue() == 1) { Index: src/test/org/apache/lucene/TestDemo.java =================================================================== --- src/test/org/apache/lucene/TestDemo.java (revision 931099) +++ src/test/org/apache/lucene/TestDemo.java (working copy) @@ -24,11 +24,13 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; @@ -54,7 +56,8 @@ TEST_VERSION_CURRENT, analyzer).setMaxFieldLength(25000)); Document doc = new Document(); - String text = "This is the text to be indexed."; + String longTerm = "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; + String text = "This is the text to be indexed. 
" + longTerm; doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.ANALYZED)); iwriter.addDocument(doc); @@ -62,15 +65,17 @@ // Now search the index: IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only=true + + assertEquals(1, isearcher.search(new TermQuery(new Term("fieldname", longTerm)), 1).totalHits); // Parse a simple query that searches for "text": QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "fieldname", analyzer); Query query = parser.parse("text"); - ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(query, null, 1).scoreDocs; assertEquals(1, hits.length); // Iterate through the results: for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); - assertEquals("This is the text to be indexed.", hitDoc.get("fieldname")); + assertEquals(text, hitDoc.get("fieldname")); } isearcher.close(); directory.close(); Index: src/test/org/apache/lucene/TestSearchForDuplicates.java =================================================================== --- src/test/org/apache/lucene/TestSearchForDuplicates.java (revision 931099) +++ src/test/org/apache/lucene/TestSearchForDuplicates.java (working copy) @@ -89,6 +89,9 @@ for (int j = 0; j < MAX_DOCS; j++) { Document d = new Document(); d.add(new Field(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES, Field.Index.ANALYZED)); + + // NOTE: this ID_FIELD produces no tokens since + // SimpleAnalyzer discards numbers d.add(new Field(ID_FIELD, Integer.toString(j), Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(d); } Index: src/test/org/apache/lucene/util/TestAttributeSource.java =================================================================== --- src/test/org/apache/lucene/util/TestAttributeSource.java (revision 931099) +++ src/test/org/apache/lucene/util/TestAttributeSource.java (working copy) @@ -27,27 +27,27 @@ public void testCaptureState() { // init a first instance AttributeSource src = new AttributeSource(); - TermAttribute termAtt = src.addAttribute(TermAttribute.class); + CharTermAttribute termAtt = src.addAttribute(CharTermAttribute.class); TypeAttribute typeAtt = src.addAttribute(TypeAttribute.class); - termAtt.setTermBuffer("TestTerm"); + termAtt.append("TestTerm"); typeAtt.setType("TestType"); final int hashCode = src.hashCode(); AttributeSource.State state = src.captureState(); // modify the attributes - termAtt.setTermBuffer("AnotherTestTerm"); + termAtt.setEmpty().append("AnotherTestTerm"); typeAtt.setType("AnotherTestType"); assertTrue("Hash code should be different", hashCode != src.hashCode()); src.restoreState(state); - assertEquals("TestTerm", termAtt.term()); + assertEquals("TestTerm", termAtt.toString()); assertEquals("TestType", typeAtt.type()); assertEquals("Hash code should be equal after restore", hashCode, src.hashCode()); // restore into an exact configured copy AttributeSource copy = new AttributeSource(); - copy.addAttribute(TermAttribute.class); + copy.addAttribute(CharTermAttribute.class); copy.addAttribute(TypeAttribute.class); copy.restoreState(state); assertEquals("Both AttributeSources should have same hashCode after restore", src.hashCode(), copy.hashCode()); @@ -57,17 +57,17 @@ AttributeSource src2 = new AttributeSource(); typeAtt = src2.addAttribute(TypeAttribute.class); FlagsAttribute flagsAtt = src2.addAttribute(FlagsAttribute.class); - termAtt = src2.addAttribute(TermAttribute.class); + termAtt = src2.addAttribute(CharTermAttribute.class); flagsAtt.setFlags(12345); 
     src2.restoreState(state);
-    assertEquals("TestTerm", termAtt.term());
+    assertEquals("TestTerm", termAtt.toString());
     assertEquals("TestType", typeAtt.type());
     assertEquals("FlagsAttribute should not be touched", 12345, flagsAtt.getFlags());
     // init a third instance missing one Attribute
     AttributeSource src3 = new AttributeSource();
-    termAtt = src3.addAttribute(TermAttribute.class);
+    termAtt = src3.addAttribute(CharTermAttribute.class);
     try {
       src3.restoreState(state);
       fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
@@ -78,42 +78,42 @@
   public void testCloneAttributes() {
     final AttributeSource src = new AttributeSource();
-    final TermAttribute termAtt = src.addAttribute(TermAttribute.class);
+    final FlagsAttribute flagsAtt = src.addAttribute(FlagsAttribute.class);
     final TypeAttribute typeAtt = src.addAttribute(TypeAttribute.class);
-    termAtt.setTermBuffer("TestTerm");
+    flagsAtt.setFlags(1234);
     typeAtt.setType("TestType");
     final AttributeSource clone = src.cloneAttributes();
     final Iterator<Class<? extends Attribute>> it = clone.getAttributeClassesIterator();
-    assertEquals("TermAttribute must be the first attribute", TermAttribute.class, it.next());
+    assertEquals("FlagsAttribute must be the first attribute", FlagsAttribute.class, it.next());
     assertEquals("TypeAttribute must be the second attribute", TypeAttribute.class, it.next());
     assertFalse("No more attributes", it.hasNext());
-    final TermAttribute termAtt2 = clone.getAttribute(TermAttribute.class);
+    final FlagsAttribute flagsAtt2 = clone.getAttribute(FlagsAttribute.class);
     final TypeAttribute typeAtt2 = clone.getAttribute(TypeAttribute.class);
-    assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt);
+    assertNotSame("FlagsAttribute of original and clone must be different instances", flagsAtt2, flagsAtt);
     assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt);
-    assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt);
+    assertEquals("FlagsAttribute of original and clone must be equal", flagsAtt2, flagsAtt);
     assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt);
     // test copy back
-    termAtt2.setTermBuffer("OtherTerm");
+    flagsAtt2.setFlags(4711);
     typeAtt2.setType("OtherType");
     clone.copyTo(src);
-    assertEquals("TermAttribute of original must now contain updated term", "OtherTerm", termAtt.term());
+    assertEquals("FlagsAttribute of original must now contain updated term", 4711, flagsAtt.getFlags());
     assertEquals("TypeAttribute of original must now contain updated type", "OtherType", typeAtt.type());
     // verify again:
-    assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt);
+    assertNotSame("FlagsAttribute of original and clone must be different instances", flagsAtt2, flagsAtt);
     assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt);
-    assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt);
+    assertEquals("FlagsAttribute of original and clone must be equal", flagsAtt2, flagsAtt);
     assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt);
   }
   public void testToStringAndMultiAttributeImplementations() {
     AttributeSource src = new AttributeSource();
-    TermAttribute termAtt = src.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = src.addAttribute(CharTermAttribute.class);
     TypeAttribute typeAtt = src.addAttribute(TypeAttribute.class);
-    termAtt.setTermBuffer("TestTerm");
+    termAtt.append("TestTerm");
     typeAtt.setType("TestType");
     assertEquals("Attributes should appear in original order", "("+termAtt.toString()+","+typeAtt.toString()+")", src.toString());
     Iterator it = src.getAttributeImplsIterator();
@@ -125,23 +125,23 @@
     src = new AttributeSource();
     src.addAttributeImpl(new Token());
-    // this should not add a new attribute as Token implements TermAttribute, too
-    termAtt = src.addAttribute(TermAttribute.class);
-    assertTrue("TermAttribute should be implemented by Token", termAtt instanceof Token);
+    // this should not add a new attribute as Token implements CharTermAttribute, too
+    termAtt = src.addAttribute(CharTermAttribute.class);
+    assertTrue("CharTermAttribute should be implemented by Token", termAtt instanceof Token);
     // get the Token attribute and check, that it is the only one
     it = src.getAttributeImplsIterator();
     Token tok = (Token) it.next();
     assertFalse("There should be only one attribute implementation instance", it.hasNext());
-    termAtt.setTermBuffer("TestTerm");
+    termAtt.setEmpty().append("TestTerm");
     assertEquals("Token should only printed once", "("+tok.toString()+")", src.toString());
   }
   public void testDefaultAttributeFactory() throws Exception {
     AttributeSource src = new AttributeSource();
-    assertTrue("TermAttribute is not implemented by TermAttributeImpl",
-      src.addAttribute(TermAttribute.class) instanceof TermAttributeImpl);
+    assertTrue("CharTermAttribute is not implemented by CharTermAttributeImpl",
+      src.addAttribute(CharTermAttribute.class) instanceof CharTermAttributeImpl);
     assertTrue("OffsetAttribute is not implemented by OffsetAttributeImpl",
       src.addAttribute(OffsetAttribute.class) instanceof OffsetAttributeImpl);
     assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",

Property changes on: src\test\org\apache\lucene\util\TestAttributeSource.java
___________________________________________________________________
Modified: svn:mergeinfo
   Reverse-merged /lucene/java/trunk/src/test/org/apache/lucene/util/TestAttributeSource.java:r924732-924780,924782-925175,925463-925561
   Reverse-merged /lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/util/TestAttributeSource.java:r896850
   Merged /lucene/java/branches/flex_1458/src/test/org/apache/lucene/util/TestAttributeSource.java:r824912-931101

Index: src/test/org/apache/lucene/util/TestNumericUtils.java
===================================================================
--- src/test/org/apache/lucene/util/TestNumericUtils.java (revision 931099)
+++ src/test/org/apache/lucene/util/TestNumericUtils.java (working copy)
@@ -25,33 +25,37 @@
   public void testLongConversionAndOrdering() throws Exception {
     // generate a series of encoded longs, each numerical one bigger than the one before
-    String last=null;
+    BytesRef last=null, act=new BytesRef(NumericUtils.BUF_SIZE_LONG);
     for (long l=-100000L; l<100000L; l++) {
-      String act=NumericUtils.longToPrefixCoded(l);
+      NumericUtils.longToPrefixCoded(l, 0, act);
       if (last!=null) {
         // test if smaller
-        assertTrue("actual bigger than last", last.compareTo(act) < 0 );
+        assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
+        assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
       }
       // test is back and forward conversion works
       assertEquals("forward and back conversion should generate same long", l, NumericUtils.prefixCodedToLong(act));
       // next step
-      last=act;
+      last = act;
+      act = new BytesRef(NumericUtils.BUF_SIZE_LONG);
     }
   }
   public void testIntConversionAndOrdering() throws Exception {
     // generate a series of encoded ints, each numerical one bigger than the one before
-    String last=null;
+    BytesRef last=null, act=new BytesRef(NumericUtils.BUF_SIZE_INT);
     for (int i=-100000; i<100000; i++) {
-      String act=NumericUtils.intToPrefixCoded(i);
+      NumericUtils.intToPrefixCoded(i, 0, act);
       if (last!=null) {
         // test if smaller
-        assertTrue("actual bigger than last", last.compareTo(act) < 0 );
+        assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
+        assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
       }
       // test is back and forward conversion works
       assertEquals("forward and back conversion should generate same int", i, NumericUtils.prefixCodedToInt(act));
       // next step
       last=act;
+      act = new BytesRef(NumericUtils.BUF_SIZE_INT);
     }
   }
@@ -60,10 +64,11 @@
     Long.MIN_VALUE, Long.MIN_VALUE+1, Long.MIN_VALUE+2, -5003400000000L,
     -4000L, -3000L, -2000L, -1000L, -1L, 0L, 1L, 10L, 300L, 50006789999999999L,
     Long.MAX_VALUE-2, Long.MAX_VALUE-1, Long.MAX_VALUE
   };
-    String[] prefixVals=new String[vals.length];
+    BytesRef[] prefixVals=new BytesRef[vals.length];
     for (int i=0; i