Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java (revision 956375) +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java (working copy) @@ -179,7 +179,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override @@ -263,7 +263,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 956375) +++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -4621,38 +4621,22 @@ private void checkTermsOrder(IndexReader r, Set allTerms, boolean isTop) throws IOException { TermsEnum terms = MultiFields.getFields(r).terms("f").iterator(); - char[] last = new char[2]; - int lastLength = 0; + BytesRef last = new BytesRef(); Set seenTerms = new HashSet(); - UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); while(true) { final BytesRef term = terms.next(); if (term == null) { break; } - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); - assertTrue(utf16.length <= 2); - // Make sure last term comes before current one, in - // UTF16 sort order - int i = 0; - for(i=0;i pairs in a + Directory. A TermInfos can be written once, in order. */ + +final class TermInfosWriter { + /** The file format version, a negative number. 
*/ + public static final int FORMAT = -3; + + // Changed strings to true utf8 with length-in-bytes not + // length-in-chars + public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; + + // NOTE: always change this if you switch to a new format! + public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; + + private FieldInfos fieldInfos; + private IndexOutput output; + private TermInfo lastTi = new TermInfo(); + private long size; + + // TODO: the default values for these two parameters should be settable from + // IndexWriter. However, once that's done, folks will start setting them to + // ridiculous values and complaining that things don't work well, as with + // mergeFactor. So, let's wait until a number of folks find that alternate + // values work better. Note that both of these values are stored in the + // segment, so that it's safe to change these w/o rebuilding all indexes. + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + int indexInterval = 128; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + int skipInterval = 16; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. 
+ */ + int maxSkipLevels = 10; + + private long lastIndexPointer; + private boolean isIndex; + private byte[] lastTermBytes = new byte[10]; + private int lastTermBytesLength = 0; + private int lastFieldNumber = -1; + + private TermInfosWriter other; + private BytesRef utf8Result = new BytesRef(10); + + TermInfosWriter(Directory directory, String segment, FieldInfos fis, + int interval) + throws IOException { + initialize(directory, segment, fis, interval, false); + other = new TermInfosWriter(directory, segment, fis, interval, true); + other.other = this; + } + + private TermInfosWriter(Directory directory, String segment, FieldInfos fis, + int interval, boolean isIndex) throws IOException { + initialize(directory, segment, fis, interval, isIndex); + } + + private void initialize(Directory directory, String segment, FieldInfos fis, + int interval, boolean isi) throws IOException { + indexInterval = interval; + fieldInfos = fis; + isIndex = isi; + output = directory.createOutput(segment + (isIndex ? 
".tii" : ".tis")); + output.writeInt(FORMAT_CURRENT); // write format + output.writeLong(0); // leave space for size + output.writeInt(indexInterval); // write indexInterval + output.writeInt(skipInterval); // write skipInterval + output.writeInt(maxSkipLevels); // write maxSkipLevels + assert initUTF16Results(); + } + + void add(Term term, TermInfo ti) throws IOException { + UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result); + add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti); + } + + // Currently used only by assert statements + UnicodeUtil.UTF16Result utf16Result1; + UnicodeUtil.UTF16Result utf16Result2; + + // Currently used only by assert statements + private boolean initUTF16Results() { + utf16Result1 = new UnicodeUtil.UTF16Result(); + utf16Result2 = new UnicodeUtil.UTF16Result(); + return true; + } + + // Currently used only by assert statement + private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { + + if (lastFieldNumber != fieldNumber) { + final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); + // If there is a field named "" (empty string) then we + // will get 0 on this comparison, yet, it's "OK". But + // it's not OK if two different field numbers map to + // the same name. + if (cmp != 0 || lastFieldNumber != -1) + return cmp; + } + + UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); + UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); + final int len; + if (utf16Result1.length < utf16Result2.length) + len = utf16Result1.length; + else + len = utf16Result2.length; + + for(int i=0;i, TermInfo> pair to the set. + Term must be lexicographically greater than all previous Terms added. 
+ TermInfo pointers must be positive and greater than all previous.*/ + void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) + throws IOException { + + assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || + (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : + "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + + " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); + + assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; + assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; + + if (!isIndex && size % indexInterval == 0) + other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term + + writeTerm(fieldNumber, termBytes, termBytesLength); // write term + + output.writeVInt(ti.docFreq); // write doc freq + output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers + output.writeVLong(ti.proxPointer - lastTi.proxPointer); + + if (ti.docFreq >= skipInterval) { + output.writeVInt(ti.skipOffset); + } + + if (isIndex) { + output.writeVLong(other.output.getFilePointer() - lastIndexPointer); + lastIndexPointer = other.output.getFilePointer(); // write pointer + } + + lastFieldNumber = fieldNumber; + lastTi.set(ti); + size++; + } + + private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) + throws IOException { + + // TODO: UTF16toUTF8 could tell us this prefix + // Compute prefix in common with last term: + int start = 0; + final int limit = termBytesLength < lastTermBytesLength ? 
termBytesLength : lastTermBytesLength; + while(start < limit) { + if (termBytes[start] != lastTermBytes[start]) + break; + start++; + } + + final int length = termBytesLength - start; + output.writeVInt(start); // write shared prefix length + output.writeVInt(length); // write delta length + output.writeBytes(termBytes, start, length); // write delta bytes + output.writeVInt(fieldNumber); // write field num + if (lastTermBytes.length < termBytesLength) { + lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); + } + System.arraycopy(termBytes, start, lastTermBytes, start, length); + lastTermBytesLength = termBytesLength; + } + + /** Called to complete TermInfos creation. */ + void close() throws IOException { + output.seek(4); // write size after format + output.writeLong(size); + output.close(); + + if (!isIndex) + other.close(); + } + +} Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0) @@ -0,0 +1,206 @@ +package org.apache.lucene.index.codecs.preflex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.*; +import org.apache.lucene.index.*; +import org.apache.lucene.index.codecs.*; +import org.apache.lucene.util.*; + +import java.util.*; +import java.io.IOException; + +public class TestSurrogates extends LuceneTestCase { + + private static final boolean DEBUG = false; + + // like Term, but uses BytesRef for text + private static class FieldAndText implements Comparable { + String field; + BytesRef text; + + public FieldAndText(Term t) { + field = t.field(); + text = new BytesRef(t.text()); + } + + public int compareTo(FieldAndText other) { + if (other.field == field) { + return text.compareTo(other.text); + } else { + return field.compareTo(other.field); + } + } + } + + // chooses from a very limited alphabet to exacerbate the + // surrogate seeking required + private static String makeDifficultRandomUnicodeString(Random r) { + final int end = r.nextInt(20); + if (end == 0) { + // allow 0 length + return ""; + } + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + int t = r.nextInt(5); + + if (0 == t && i < end - 1) { + // hi + buffer[i++] = (char) 0xd800; + // lo + buffer[i] = (char) 0xdc00; + } else if (t <= 3) { + buffer[i] = 'a'; + } else if (4 == t) { + buffer[i] = 0xe000; + } + } + + return new String(buffer, 0, end); + } + + private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException { + + final int numField = _TestUtil.nextInt(r, 2, 5); + + List terms = new ArrayList(); + + int tc = 0; + + 
for(int f=0;f fieldTerms = new ArrayList(); + SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); + + // hack alert!! + int uniqueTermCount = si.docCount; + + FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1)); + assertNotNull(fields); + + if (DEBUG) { + System.out.println("\nTEST: now enum"); + } + FieldsEnum fieldsEnum = fields.iterator(); + String field; + UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + + int termCount = 0; + while((field = fieldsEnum.next()) != null) { + TermsEnum termsEnum = fieldsEnum.terms(); + BytesRef text; + BytesRef lastText = null; + while((text = termsEnum.next()) != null) { + UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); + if (DEBUG) { + System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length))); + System.out.println(); + } + if (lastText == null) { + lastText = new BytesRef(text); + } else { + assertTrue(lastText.compareTo(text) < 0); + lastText.copy(text); + } + assertEquals(fieldTerms.get(termCount).field, field); + assertEquals(fieldTerms.get(termCount).text, text); + termCount++; + } + if (DEBUG) { + System.out.println(" no more terms for field=" + field); + } + } + assertEquals(uniqueTermCount, termCount); + + fields.close(); + } +} Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java =================================================================== --- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 956375) +++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy) @@ -141,7 +141,7 @@ else if (t <= 1) buffer[i] = (char) r.nextInt(0x80); else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800); else if (3 == t) buffer[i] = 
(char) nextInt(r, 0x800, 0xd7ff); - else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff); + else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xfffe); } return new String(buffer, 0, end); } Index: lucene/src/test/org/apache/lucene/util/TestNumericUtils.java =================================================================== --- lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (revision 956375) +++ lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (working copy) @@ -30,7 +30,7 @@ NumericUtils.longToPrefixCoded(l, 0, act); if (last!=null) { // test if smaller - assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 ); + assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 ); assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); } // test is back and forward conversion works @@ -48,7 +48,7 @@ NumericUtils.intToPrefixCoded(i, 0, act); if (last!=null) { // test if smaller - assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 ); + assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 ); assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); } // test is back and forward conversion works @@ -84,7 +84,7 @@ // check sort order (prefixVals should be ascending) for (int i=1; i= 0) { - int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? 
c : transition.getMin(); + if (transition.getMax() >= c) { + int nextChar = Math.max(c, transition.getMin()); // append either the next sequential char, or the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; @@ -342,9 +338,9 @@ private boolean backtrack(int position) { while (position > 0) { int nextChar = seekBytesRef.bytes[position - 1] & 0xff; - // if a character is 0xef its a dead-end too, - // because there is no higher character in UTF-16 sort order. - nextChar = incrementUTF16(nextChar); + // if a character is 0xff it's a dead-end too, + // because there is no higher character in UTF-8 sort order. + nextChar = incrementUTF8(nextChar); if (nextChar != -1) { seekBytesRef.bytes[position - 1] = (byte) nextChar; seekBytesRef.length = position; @@ -355,34 +351,11 @@ return false; /* all solutions exhausted */ } - /* return the next utf8 byte in utf16 order, or -1 if exhausted */ - private final int incrementUTF16(int utf8) { + /* return the next utf8 byte in utf8 order, or -1 if exhausted */ + private final int incrementUTF8(int utf8) { switch(utf8) { - case 0xed: return 0xf0; - case 0xfd: return 0xee; - case 0xee: return 0xef; - case 0xef: return -1; + case 0xff: return -1; default: return utf8 + 1; } } - - int compareToUTF16(int aByte, int bByte) { - if (aByte != bByte) { - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - return 0; - } } Index: lucene/src/java/org/apache/lucene/index/FieldInfos.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldInfos.java (revision 956375) +++
lucene/src/java/org/apache/lucene/index/FieldInfos.java (working copy) @@ -53,7 +53,7 @@ private final HashMap byName = new HashMap(); private int format; - FieldInfos() { } + public FieldInfos() { } /** * Construct a FieldInfos object using the directory and the name of the file @@ -62,7 +62,7 @@ * @param name The name of the file to open the IndexInput from in the Directory * @throws IOException */ - FieldInfos(Directory d, String name) throws IOException { + public FieldInfos(Directory d, String name) throws IOException { IndexInput input = d.openInput(name); try { read(input, name); Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermsEnum.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/TermsEnum.java (working copy) @@ -144,8 +144,7 @@ @Override public Comparator getComparator() { - // return an unused dummy to prevent NPE - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return null; } @Override Index: lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy) @@ -130,7 +130,7 @@ // TODO: we may want to make this sort in same order // as Codec's terms dict? 
- final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator()); + final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator()); tvf.writeVInt(numPostings); byte bits = 0x0; Index: lucene/src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexWriter.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -3964,7 +3964,7 @@ // commit merged deletes SegmentReader reader = merge.readers[i] = readerPool.get(info, merge.mergeDocStores, MERGE_READ_BUFFER_SIZE, - -1); + -config.getReaderTermsIndexDivisor()); // We clone the segment readers because other // deletes may come in while we're merging so we Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java =================================================================== --- lucene/src/java/org/apache/lucene/index/CheckIndex.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/CheckIndex.java (working copy) @@ -32,7 +32,7 @@ import java.io.IOException; import java.io.File; import java.util.Collection; - +import java.util.Comparator; import java.util.List; import java.util.ArrayList; import java.util.Map; @@ -596,6 +596,10 @@ boolean hasOrd = true; final long termCountStart = status.termCount; + BytesRef lastTerm = null; + + Comparator termComp = terms.getComparator(); + while(true) { final BytesRef term = terms.next(); @@ -603,6 +607,17 @@ break; } + // make sure terms arrive in order according to + // the comp + if (lastTerm == null) { + lastTerm = new BytesRef(term); + } else { + if (termComp.compare(lastTerm, term) >= 0) { + throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term); + } + lastTerm.copy(term); + } + final int docFreq = terms.docFreq(); status.totFreq += docFreq; Index: 
lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (working copy) @@ -80,7 +80,7 @@ // Terms dict success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -111,7 +111,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUTF16Comparator()); + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -126,7 +126,7 @@ state.dir, state.fieldInfos, state.segmentInfo.name, pulsingReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (working copy) @@ -63,7 +63,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -95,7 +95,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUTF16Comparator()); + 
BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -111,7 +111,7 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (working copy) @@ -104,7 +104,7 @@ indexInterval = in.readInt(); this.indexDivisor = indexDivisor; - if (indexDivisor == -1) { + if (indexDivisor < 0) { totalIndexInterval = indexInterval; } else { // In case terms index gets loaded, later, on demand @@ -131,7 +131,7 @@ } success = true; } finally { - if (indexDivisor != -1) { + if (indexDivisor > 0) { in.close(); this.in = null; if (success) { Index: lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (working copy) @@ -58,7 +58,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -85,7 +85,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUTF16Comparator()); + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -101,7 
+101,7 @@ state.segmentInfo.name, postings, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (working copy) @@ -53,6 +53,7 @@ long indexPointer = 0; int indexInterval; int skipInterval; + int newSuffixStart; int maxSkipLevels; private int formatM1SkipInterval; @@ -136,6 +137,7 @@ prevBuffer.set(termBuffer); termBuffer.read(input, fieldInfos); + newSuffixStart = termBuffer.newSuffixStart; termInfo.docFreq = input.readVInt(); // read doc freq termInfo.freqPointer += input.readVLong(); // read freq pointer Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (working copy) @@ -19,7 +19,6 @@ import java.io.IOException; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.index.Term; @@ -34,6 +33,8 @@ private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); private BytesRef bytes = new BytesRef(10); + int newSuffixStart; + public final int compareTo(TermBuffer other) { if (field == other.field) // fields are interned return compareChars(text.result, text.length, other.text.result, other.text.length); @@ -60,23 +61,33 @@ int start = input.readVInt(); int length = input.readVInt(); int totalLength = start + length; + if 
(bytes.bytes.length < totalLength) { + bytes.grow(totalLength); + } if (dirty) { // Fully convert all bytes since bytes is dirty UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); - if (bytes.bytes.length < totalLength) - bytes.bytes = new byte[totalLength]; bytes.length = totalLength; input.readBytes(bytes.bytes, start, length); UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text); dirty = false; } else { // Incrementally convert only the UTF8 bytes that are new: - if (bytes.bytes.length < totalLength) - bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength); bytes.length = totalLength; input.readBytes(bytes.bytes, start, length); UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text); } + + while(true) { + newSuffixStart = text.offsets[start]; + if (newSuffixStart != -1) { + break; + } + if (--start == 0) { + newSuffixStart = 0; + break; + } + } this.field = fieldInfos.fieldName(input.readVInt()); } @@ -124,10 +135,11 @@ try { clone = (TermBuffer)super.clone(); } catch (CloneNotSupportedException e) {} - clone.dirty = true; clone.bytes = new BytesRef(10); clone.text = new UnicodeUtil.UTF16Result(); + clone.text.offsets = new int[text.offsets.length]; + System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length); clone.text.copyText(text); return clone; } Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 956375) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy) @@ -39,11 +39,15 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; /** Exposes flex API on a pre-flex index, as a codec. 
* @lucene.experimental */ public class PreFlexFields extends FieldsProducer { + private static final boolean DEBUG_SURROGATES = false; + public TermInfosReader tis; public final TermInfosReader tisNoIndex; @@ -60,6 +64,16 @@ throws IOException { si = info; + + // NOTE: we must always load terms index, even for + // "sequential" scan during merging, because what is + // sequential to merger may not be to TermInfosReader + // since we do the surrogates dance: + // nocommit -- how to pull right value from IW? + if (indexDivisor < 0) { + indexDivisor = -indexDivisor; + } + TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor); if (indexDivisor == -1) { tisNoIndex = r; @@ -174,7 +188,6 @@ private class PreFlexFieldsEnum extends FieldsEnum { final Iterator it; private final PreTermsEnum termsEnum; - private int count; FieldInfo current; public PreFlexFieldsEnum() throws IOException { @@ -185,7 +198,6 @@ @Override public String next() { if (it.hasNext()) { - count++; current = it.next(); return current.name; } else { @@ -195,7 +207,7 @@ @Override public TermsEnum terms() throws IOException { - termsEnum.reset(current, count == 1); + termsEnum.reset(current); return termsEnum; } } @@ -209,14 +221,15 @@ @Override public TermsEnum iterator() throws IOException { PreTermsEnum termsEnum = new PreTermsEnum(); - termsEnum.reset(fieldInfo, false); + termsEnum.reset(fieldInfo); return termsEnum; } @Override public Comparator getComparator() { - // Pre-flex indexes always sorted in UTF16 order - return BytesRef.getUTF8SortedAsUTF16Comparator(); + // Pre-flex indexes always sorted in UTF16 order, but + // we remap on-the-fly to unicode order + return BytesRef.getUTF8SortedAsUnicodeComparator(); } } @@ -227,37 +240,229 @@ private BytesRef current; private final BytesRef scratchBytesRef = new BytesRef(); - void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException { + private int[] surrogateSeekPending = new int[1]; + private 
boolean[] surrogateDidSeekBack = new boolean[1]; + private int surrogateSeekUpto; + private char[] pendingPrefix; + + private SegmentTermEnum seekTermEnum; + private Term protoTerm; + private int newSuffixStart; + + void reset(FieldInfo fieldInfo) throws IOException { this.fieldInfo = fieldInfo; + protoTerm = new Term(fieldInfo.name); if (termEnum == null) { - // First time reset is called - if (isFirstField) { - termEnum = getTermsDict().terms(); - skipNext = false; - } else { - termEnum = getTermsDict().terms(new Term(fieldInfo.name, "")); - skipNext = true; - } + termEnum = getTermsDict().terms(protoTerm); + seekTermEnum = getTermsDict().terms(protoTerm); } else { - final Term t = termEnum.term(); - if (t != null && t.field() == fieldInfo.name) { - // No need to seek -- we have already advanced onto - // this field. We must be @ first term because - // flex API will not advance this enum further, on - // seeing a different field. - } else { - assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned - final TermInfosReader tis = getTermsDict(); - tis.seekEnum(termEnum, new Term(fieldInfo.name, "")); + getTermsDict().seekEnum(termEnum, protoTerm); + } + skipNext = true; + + surrogateSeekUpto = 0; + newSuffixStart = 0; + + surrogatesDance(); + } + + private void surrogatesDance() throws IOException { + + // Tricky: prior to 4.0, Lucene index sorted terms in + // UTF16 order, but as of 4.0 we sort by Unicode code + // point order. These orders differ because of the + // surrogates; so we have to fix up our enum, here, by + // carefully first seeking past the surrogates and + // then back again at the end. The process is + // recursive, since any given term could have multiple + // new occurrences of surrogate pairs, so we use a + // stack to record the pending seek-backs. + if (DEBUG_SURROGATES) { + System.out.println(" dance start term=" + (termEnum.term() == null ?
null : UnicodeUtil.toHexString(termEnum.term().text()))); + } + + while(popPendingSeek()); + while(pushNewSurrogate()); + } + + // only for debugging + private String getStack() { + if (surrogateSeekUpto == 0) { + return "null"; + } else { + StringBuffer sb = new StringBuffer(); + for(int i=0;i 0) { + sb.append(' '); + } + sb.append(surrogateSeekPending[i]); } - skipNext = true; + sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1])); + return sb.toString(); } } + private boolean popPendingSeek() throws IOException { + if (DEBUG_SURROGATES) { + System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack()); + } + // if a .next() has advanced beyond the + // after-surrogates range we had last seeked to, we + // must seek back to the start and resume .next from + // there. this pops the pending seek off the stack. + final Term t = termEnum.term(); + if (surrogateSeekUpto > 0) { + final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1]; + if (DEBUG_SURROGATES) { + System.out.println(" seekPrefix=" + seekPrefix); + } + if (newSuffixStart < seekPrefix) { + assert pendingPrefix != null; + assert pendingPrefix.length > seekPrefix; + pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; + Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); + if (DEBUG_SURROGATES) { + System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); + } + getTermsDict().seekEnum(termEnum, t2); + surrogateDidSeekBack[surrogateSeekUpto-1] = true; + + // +2 because we don't want to re-check the + // surrogates we just seek'd back to + newSuffixStart = seekPrefix + 2; + return true; + } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) { + assert pendingPrefix != null; + assert pendingPrefix.length > seekPrefix; + 
pendingPrefix[seekPrefix] = 0xffff; + Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); + if (DEBUG_SURROGATES) { + System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); + } + getTermsDict().seekEnum(termEnum, t2); + if (DEBUG_SURROGATES) { + System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); + } + surrogateSeekUpto--; + + if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { + // force pop + newSuffixStart = -1; + } else { + newSuffixStart = termEnum.newSuffixStart; + } + + return true; + } + } + + return false; + } + + private boolean pushNewSurrogate() throws IOException { + if (DEBUG_SURROGATES) { + System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); + } + final Term t = termEnum.term(); + if (t == null || t.field() != fieldInfo.name) { + return false; + } + final String text = t.text(); + final int textLen = text.length(); + + for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { + + if (DEBUG_SURROGATES) { + System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i); + } + + // the next() that we just did read in a new + // suffix, containing a surrogate pair + + // seek forward to see if there are any terms with + // this same prefix, but with characters after the + // surrogate range; if so, we must first iterate + // them, then seek back to the surrogates + + char[] testPrefix = new char[i+1]; + for(int j=0;j CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle(); - - private static class UTF8InUTF16Order { - protected int compareCodePoint(int aByte, int bByte) { - if (aByte != bByte) { - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are 
not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - return 0; - } - } - - private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator { - public int compare(Transition t1, Transition t2) { - if (t1.to != t2.to) { - if (t1.to == null) return -1; - else if (t2.to == null) return 1; - else if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - int minComp = compareCodePoint(t1.min, t2.min); - if (minComp != 0) return minComp; - int maxComp = compareCodePoint(t1.max, t2.max); - if (maxComp != 0) return maxComp; - return 0; - } - } - - public static final Comparator CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle(); - - private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator { - public int compare(Transition t1, Transition t2) { - int minComp = compareCodePoint(t1.min, t2.min); - if (minComp != 0) return minComp; - int maxComp = compareCodePoint(t1.max, t2.max); - if (maxComp != 0) return maxComp; - if (t1.to != t2.to) { - if (t1.to == null) return -1; - else if (t2.to == null) return 1; - else if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - return 0; - } - } - - public static final Comparator CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle(); - - } Index: lucene/src/java/org/apache/lucene/util/ArrayUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/ArrayUtil.java (revision 956375) +++ lucene/src/java/org/apache/lucene/util/ArrayUtil.java (working copy) @@ -327,6 
+327,29 @@ return array; } + public static boolean[] grow(boolean[] array, int minSize) { + if (array.length < minSize) { + boolean[] newArray = new boolean[oversize(minSize, 1)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static boolean[] grow(boolean[] array) { + return grow(array, 1 + array.length); + } + + public static boolean[] shrink(boolean[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize, 1); + if (newSize != array.length) { + boolean[] newArray = new boolean[newSize]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + public static char[] grow(char[] array, int minSize) { if (array.length < minSize) { char[] newArray = new char[oversize(minSize, RamUsageEstimator.NUM_BYTES_CHAR)]; Index: lucene/src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 956375) +++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -217,14 +217,7 @@ bytes = ArrayUtil.grow(bytes, newLength); } - private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); - - public static Comparator getUTF8SortedAsUTF16Comparator() { - return utf8SortedAsUTF16SortOrder; - } - /** Unsigned byte order comparison */ - /* public int compareTo(BytesRef other) { if (this == other) return 0; @@ -245,52 +238,18 @@ // One is a prefix of the other, or, they are equal: return this.length - other.length; } - */ - /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change - * in the future to unsigned byte comparison. 
*/ - public int compareTo(BytesRef other) { - if (this == other) return 0; + private final static Comparator utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator(); - final byte[] aBytes = this.bytes; - int aUpto = this.offset; - final byte[] bBytes = other.bytes; - int bUpto = other.offset; - - final int aStop = aUpto + Math.min(this.length, other.length); - - while(aUpto < aStop) { - int aByte = aBytes[aUpto++] & 0xff; - int bByte = bBytes[bUpto++] & 0xff; - if (aByte != bByte) { - - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - } - - // One is a prefix of the other, or, they are equal: - return this.length - other.length; + public static Comparator getUTF8SortedAsUnicodeComparator() { + return utf8SortedAsUnicodeSortOrder; } - private static class UTF8SortedAsUTF16Comparator implements Comparator { + private static class UTF8SortedAsUnicodeComparator implements Comparator { // Only singleton - private UTF8SortedAsUTF16Comparator() {}; + private UTF8SortedAsUnicodeComparator() {}; public int compare(BytesRef a, BytesRef b) { - final byte[] aBytes = a.bytes; int aUpto = a.offset; final byte[] bBytes = b.bytes; @@ -307,32 +266,15 @@ int aByte = aBytes[aUpto++] & 0xff; int bByte = bBytes[bUpto++] & 0xff; - if (aByte != bByte) { - - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 
0x10; - } - } - return aByte - bByte; + int diff = aByte - bByte; + if (diff != 0) { + return diff; } } // One is a prefix of the other, or, they are equal: return a.length - b.length; - } - - public boolean equals(Object other) { - return this == other; - } + } } public void writeExternal(ObjectOutput out) Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 956375) +++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -358,7 +358,6 @@ out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START); } } - offsets[upto] = outUpto; result.length = outUpto; } @@ -483,7 +482,7 @@ } } */ - public static final boolean validUTF16String(CharSequence s) { + public static boolean validUTF16String(CharSequence s) { final int size = s.length(); for(int i=0;i= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { @@ -559,7 +558,7 @@ /** Returns the number of code points in this utf8 * sequence. 
Behavior is undefined if the utf8 sequence * is invalid.*/ - public static final int codePointCount(BytesRef utf8) { + public static int codePointCount(BytesRef utf8) { int upto = utf8.offset; final int limit = utf8.offset + utf8.length; final byte[] bytes = utf8.bytes; @@ -673,4 +672,33 @@ } return new String(chars, 0, w); } + + // for debugging + public static String toHexString(String s) { + StringBuilder sb = new StringBuilder(); + for(int i=0;i 0) { + sb.append(' '); + } + if (ch < 128) { + sb.append(ch); + } else { + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + sb.append("H:"); + } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + sb.append("L:"); + } else if (ch > UNI_SUR_LOW_END) { + if (ch == 0xffff) { + sb.append("F:"); + } else { + sb.append("E:"); + } + } + + sb.append("0x" + Integer.toHexString(ch)); + } + } + return sb.toString(); + } } Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (revision 956375) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (working copy) @@ -426,7 +426,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } }; } Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (revision 956375) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (working copy) @@ -123,7 +123,7 @@ @Override public Comparator getComparator() { - return 
BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } } Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java =================================================================== --- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 956375) +++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy) @@ -808,7 +808,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override @@ -903,7 +903,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } }