Index: lucene/CHANGES.txt --- lucene/CHANGES.txt Tue Jun 21 04:53:16 2011 -0400 +++ lucene/CHANGES.txt Fri Jun 24 06:43:32 2011 -0400 @@ -281,6 +281,13 @@ * LUCENE-2953: In addition to changes in 3.x, PriorityQueue#initialize(int) function was moved into the ctor. (Uwe Schindler, Yonik Seeley) +* LUCENE-3225: Add TermsEnum.seekExact for faster seeking when you + don't need the ceiling term; renamed existing seek methods to either + seekCeil or seekExact; changed seekExact(ord) to return no value. + Fixed MemoryCodec and SimpleTextCodec to optimize the seekExact + case, and fixed places in Lucene to use seekExact when possible. + (Mike McCandless) + New features * LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java Fri Jun 24 06:43:32 2011 -0400 @@ -44,7 +44,20 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) { + public boolean seekExact(BytesRef text, boolean useCache) { + final Term t = new Term(field, text); + int loc = Arrays.binarySearch(terms, t, InstantiatedTerm.termComparator); + if (loc < 0) { + return false; + } else { + upto = loc; + br.copy(text); + return true; + } + } + + @Override + public SeekStatus seekCeil(BytesRef text, boolean useCache) { final Term t = new Term(field, text); int loc = Arrays.binarySearch(terms, t, InstantiatedTerm.termComparator); if (loc < 0) { @@ -63,19 +76,10 @@ } @Override - public SeekStatus seek(long ord) { + public void seekExact(long ord) { + assert (start + (int) ord) < terms.length; upto = start + (int) ord; - if (upto >= terms.length) { - return SeekStatus.END; - } - - if (terms[upto].field() == field) { - return 
SeekStatus.FOUND; - } else { - // make sure field was interned - assert !terms[upto].field().equals(field); - return SeekStatus.END; - } + assert field.equals(terms[upto].field()); } @Override @@ -144,9 +148,9 @@ } @Override - public void seek(BytesRef term, TermState state) throws IOException { + public void seekExact(BytesRef term, TermState state) throws IOException { assert state != null && state instanceof OrdTermState; - seek(((OrdTermState)state).ord); // just use the ord for simplicity + seekExact(((OrdTermState)state).ord); // just use the ord for simplicity } } Index: lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java --- lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java Fri Jun 24 06:43:32 2011 -0400 @@ -133,9 +133,9 @@ Term t = new Term("c", "danny"); TermsEnum aprioriTermEnum = MultiFields.getTerms(aprioriReader, t.field()).iterator(); - aprioriTermEnum.seek(new BytesRef(t.text())); + aprioriTermEnum.seekCeil(new BytesRef(t.text())); TermsEnum testTermEnum = MultiFields.getTerms(testReader, t.field()).iterator(); - testTermEnum.seek(new BytesRef(t.text())); + testTermEnum.seekCeil(new BytesRef(t.text())); assertEquals(aprioriTermEnum.term(), testTermEnum.term()); DocsEnum aprioriTermDocs = aprioriTermEnum.docs(MultiFields.getDeletedDocs(aprioriReader), null); Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java --- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Fri Jun 24 06:43:32 2011 -0400 @@ -860,7 +860,18 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) { + public boolean seekExact(BytesRef text, boolean useCache) { + termUpto = 
Arrays.binarySearch(info.sortedTerms, text, termComparator); + if (termUpto >= 0) { + br.copy(info.sortedTerms[termUpto].getKey()); + return true; + } else { + return false; + } + } + + @Override + public SeekStatus seekCeil(BytesRef text, boolean useCache) { termUpto = Arrays.binarySearch(info.sortedTerms, text, termComparator); if (termUpto < 0) { // not found; choose successor termUpto = -termUpto -1; @@ -877,13 +888,9 @@ } @Override - public SeekStatus seek(long ord) { + public void seekExact(long ord) { + assert ord < info.sortedTerms.length; termUpto = (int) ord; - if (ord < info.sortedTerms.length) { - return SeekStatus.FOUND; - } else { - return SeekStatus.END; - } } @Override @@ -939,9 +946,9 @@ } @Override - public void seek(BytesRef term, TermState state) throws IOException { + public void seekExact(BytesRef term, TermState state) throws IOException { assert state != null; - this.seek(((OrdTermState)state).ord); + this.seekExact(((OrdTermState)state).ord); } @Override Index: lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java --- lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java Fri Jun 24 06:43:32 2011 -0400 @@ -186,7 +186,7 @@ } TermsEnum termsEnum = terms.iterator(); - if (termsEnum.seek(termText) != TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekCeil(termText) != TermsEnum.SeekStatus.FOUND) { return 0; } Index: lucene/contrib/misc/src/test/org/apache/lucene/index/TestMultiPassIndexSplitter.java --- lucene/contrib/misc/src/test/org/apache/lucene/index/TestMultiPassIndexSplitter.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/misc/src/test/org/apache/lucene/index/TestMultiPassIndexSplitter.java Fri Jun 24 06:43:32 2011 -0400 @@ -73,7 +73,7 @@ Document doc = ir.document(0); assertEquals("0", doc.get("id")); TermsEnum te = MultiFields.getTerms(ir, "id").iterator(); - 
assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(new BytesRef("1"))); + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef("1"))); assertNotSame("1", te.term().utf8ToString()); ir.close(); ir = IndexReader.open(dirs[1], true); @@ -81,7 +81,7 @@ doc = ir.document(0); assertEquals("1", doc.get("id")); te = MultiFields.getTerms(ir, "id").iterator(); - assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(new BytesRef("0"))); + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef("0"))); assertNotSame("0", te.term().utf8ToString()); ir.close(); @@ -91,10 +91,10 @@ assertEquals("2", doc.get("id")); te = MultiFields.getTerms(ir, "id").iterator(); - assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(new BytesRef("1"))); + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef("1"))); assertNotSame("1", te.term()); - assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(new BytesRef("0"))); + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef("0"))); assertNotSame("0", te.term().utf8ToString()); ir.close(); for (Directory d : dirs) @@ -132,7 +132,7 @@ // make sure the deleted doc is not here TermsEnum te = MultiFields.getTerms(ir, "id").iterator(); Term t = new Term("id", (NUM_DOCS - 1) + ""); - assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(new BytesRef(t.text()))); + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef(t.text()))); assertNotSame(t.text(), te.term().utf8ToString()); ir.close(); for (Directory d : dirs) Index: lucene/contrib/misc/src/test/org/apache/lucene/index/TestNRTManager.java --- lucene/contrib/misc/src/test/org/apache/lucene/index/TestNRTManager.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/misc/src/test/org/apache/lucene/index/TestNRTManager.java Fri Jun 24 06:43:32 2011 -0400 @@ -526,7 +526,7 @@ //System.out.println("trigger " + trigger); shift = random.nextInt(trigger); } - termsEnum.seek(new BytesRef("")); + termsEnum.seekCeil(new 
BytesRef("")); continue; } seenTermCount++; Index: lucene/contrib/misc/src/test/org/apache/lucene/index/codecs/appending/TestAppendingCodec.java --- lucene/contrib/misc/src/test/org/apache/lucene/index/codecs/appending/TestAppendingCodec.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/misc/src/test/org/apache/lucene/index/codecs/appending/TestAppendingCodec.java Fri Jun 24 06:43:32 2011 -0400 @@ -154,14 +154,14 @@ Terms terms = fields.terms("f"); assertNotNull(terms); TermsEnum te = terms.iterator(); - assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("quick"))); - assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("brown"))); - assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("fox"))); - assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("jumped"))); - assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("over"))); - assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("lazy"))); - assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("dog"))); - assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("the"))); + assertEquals(SeekStatus.FOUND, te.seekCeil(new BytesRef("quick"))); + assertEquals(SeekStatus.FOUND, te.seekCeil(new BytesRef("brown"))); + assertEquals(SeekStatus.FOUND, te.seekCeil(new BytesRef("fox"))); + assertEquals(SeekStatus.FOUND, te.seekCeil(new BytesRef("jumped"))); + assertEquals(SeekStatus.FOUND, te.seekCeil(new BytesRef("over"))); + assertEquals(SeekStatus.FOUND, te.seekCeil(new BytesRef("lazy"))); + assertEquals(SeekStatus.FOUND, te.seekCeil(new BytesRef("dog"))); + assertEquals(SeekStatus.FOUND, te.seekCeil(new BytesRef("the"))); DocsEnum de = te.docs(null, null); assertTrue(de.advance(0) != DocsEnum.NO_MORE_DOCS); assertEquals(2, de.freq()); Index: lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java --- lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java Fri Jun 24 06:43:32 2011 
-0400 @@ -79,7 +79,7 @@ if (terms != null) { br.copy(term.bytes()); - if (termsEnum.seek(br) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) { docs = termsEnum.docs(delDocs, docs); while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) { result.set(docs.docID()); Index: lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java --- lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java Fri Jun 24 06:43:32 2011 -0400 @@ -63,7 +63,7 @@ TermsEnum termsEnum = terms.iterator(); boolean skip = false; - TermsEnum.SeekStatus status = termsEnum.seek(new BytesRef(getPrefix())); + TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getPrefix())); if (status == TermsEnum.SeekStatus.FOUND) { mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName)); } else if (status == TermsEnum.SeekStatus.NOT_FOUND) { Index: lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java --- lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTermQuery.java Fri Jun 24 06:43:32 2011 -0400 @@ -53,7 +53,7 @@ if (terms != null) { TermsEnum termsEnum = terms.iterator(); - TermsEnum.SeekStatus status = termsEnum.seek(new BytesRef(getTermText())); + TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getTermText())); if (status == TermsEnum.SeekStatus.FOUND) { mtv.visitMatchingTerm(getLuceneTerm(fieldName)); } Index: lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java --- lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java Tue 
Jun 21 04:53:16 2011 -0400 +++ lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java Fri Jun 24 06:43:32 2011 -0400 @@ -95,7 +95,7 @@ try { TermsEnum termsEnum = terms.iterator(); - TermsEnum.SeekStatus status = termsEnum.seek(prefixRef); + TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef); BytesRef text; if (status == TermsEnum.SeekStatus.FOUND) { text = prefixRef; Index: lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java --- lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java Fri Jun 24 06:43:32 2011 -0400 @@ -398,7 +398,7 @@ // System.out.println(" term=" + term); - if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekExact(term.bytes(), false)) { DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); if (docsEnum != null) { Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java --- lucene/src/java/org/apache/lucene/index/CheckIndex.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/CheckIndex.java Fri Jun 24 06:43:32 2011 -0400 @@ -847,7 +847,7 @@ // Test seek to last term: if (lastTerm != null) { - if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) { + if (terms.seekCeil(lastTerm) != TermsEnum.SeekStatus.FOUND) { throw new RuntimeException("seek to last term " + lastTerm + " failed"); } @@ -874,14 +874,14 @@ // Seek by ord for(int i=seekCount-1;i>=0;i--) { long ord = i*(termCount/seekCount); - terms.seek(ord); + terms.seekExact(ord); seekTerms[i] = new BytesRef(terms.term()); } // Seek by term long totDocCount = 0; for(int i=seekCount-1;i>=0;i--) { - if (terms.seek(seekTerms[i]) != TermsEnum.SeekStatus.FOUND) { + if (terms.seekCeil(seekTerms[i]) != TermsEnum.SeekStatus.FOUND) { throw new RuntimeException("seek to existing term " + seekTerms[i] + " failed"); } Index: 
lucene/src/java/org/apache/lucene/index/DocTermOrds.java --- lucene/src/java/org/apache/lucene/index/DocTermOrds.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/DocTermOrds.java Fri Jun 24 06:43:32 2011 -0400 @@ -237,7 +237,7 @@ final TermsEnum te = terms.iterator(); final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef(); //System.out.println("seekStart=" + seekStart.utf8ToString()); - if (te.seek(seekStart) == TermsEnum.SeekStatus.END) { + if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) { // No terms match return; } @@ -693,7 +693,7 @@ } @Override - public SeekStatus seek(BytesRef target, boolean useCache) throws IOException { + public SeekStatus seekCeil(BytesRef target, boolean useCache) throws IOException { // already here if (term != null && term.equals(target)) { @@ -704,7 +704,7 @@ if (startIdx >= 0) { // we hit the term exactly... lucky us! - TermsEnum.SeekStatus seekStatus = termsEnum.seek(target); + TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target); assert seekStatus == TermsEnum.SeekStatus.FOUND; ord = startIdx << indexIntervalBits; setTerm(); @@ -717,7 +717,7 @@ if (startIdx == 0) { // our target occurs *before* the first term - TermsEnum.SeekStatus seekStatus = termsEnum.seek(target); + TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target); assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND; ord = 0; setTerm(); @@ -733,7 +733,7 @@ // so we don't need to seek. 
} else { // seek to the right block - TermsEnum.SeekStatus seekStatus = termsEnum.seek(indexedTermsArray[startIdx]); + TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]); assert seekStatus == TermsEnum.SeekStatus.FOUND; ord = startIdx << indexIntervalBits; setTerm(); @@ -754,16 +754,16 @@ } @Override - public SeekStatus seek(long targetOrd) throws IOException { + public void seekExact(long targetOrd) throws IOException { int delta = (int) (targetOrd - ordBase - ord); - //System.out.println(" seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord); + //System.out.println(" seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord + " ii=" + indexInterval); if (delta < 0 || delta > indexInterval) { final int idx = (int) (targetOrd >>> indexIntervalBits); final BytesRef base = indexedTermsArray[idx]; //System.out.println(" do seek term=" + base.utf8ToString()); ord = idx << indexIntervalBits; delta = (int) (targetOrd - ord); - final TermsEnum.SeekStatus seekStatus = termsEnum.seek(base, true); + final TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(base, true); assert seekStatus == TermsEnum.SeekStatus.FOUND; } else { //System.out.println("seek w/in block"); @@ -772,15 +772,14 @@ while (--delta >= 0) { BytesRef br = termsEnum.next(); if (br == null) { - term = null; - return null; + assert false; + return; } ord++; } setTerm(); - return term == null ? 
SeekStatus.END : SeekStatus.FOUND; - //System.out.println(" return term=" + term.utf8ToString()); + assert term != null; } private BytesRef setTerm() throws IOException { @@ -794,8 +793,7 @@ } public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException { - TermsEnum.SeekStatus status = termsEnum.seek(ord); - assert status == TermsEnum.SeekStatus.FOUND; + termsEnum.seekExact(ord); return termsEnum.term(); } } Index: lucene/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java --- lucene/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java Fri Jun 24 06:43:32 2011 -0400 @@ -99,7 +99,7 @@ final long ram = flushBytes + activeBytes; final long ramBufferBytes = (long) (maxConfiguredRamBuffer * 1024 * 1024); // take peakDelta into account - worst case is that all flushing, pending and blocked DWPT had maxMem and the last doc had the peakDelta - final long expected = (long)(2 * (ramBufferBytes)) + ((numPending + numFlushingDWPT() + numBlockedFlushes()) * peakDelta); + final long expected = (2 * (ramBufferBytes)) + ((numPending + numFlushingDWPT() + numBlockedFlushes()) * peakDelta); if (peakDelta < (ramBufferBytes >> 1)) { /* * if we are indexing with very low maxRamBuffer like 0.1MB memory can Index: lucene/src/java/org/apache/lucene/index/FilterIndexReader.java --- lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Fri Jun 24 06:43:32 2011 -0400 @@ -132,13 +132,18 @@ public FilterTermsEnum(TermsEnum in) { this.in = in; } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { - return in.seek(text, useCache); + public boolean seekExact(BytesRef text, boolean useCache) throws IOException { + return in.seekExact(text, useCache); } @Override - public SeekStatus seek(long ord) throws 
IOException { - return in.seek(ord); + public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException { + return in.seekCeil(text, useCache); + } + + @Override + public void seekExact(long ord) throws IOException { + in.seekExact(ord); } @Override @@ -182,8 +187,8 @@ } @Override - public void seek(BytesRef term, TermState state) throws IOException { - in.seek(term, state); + public void seekExact(BytesRef term, TermState state) throws IOException { + in.seekExact(term, state); } @Override Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java --- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Fri Jun 24 06:43:32 2011 -0400 @@ -139,7 +139,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public boolean seekExact(BytesRef term, boolean useCache) throws IOException { queue.clear(); numTop = 0; @@ -147,6 +147,56 @@ if (lastSeek != null && termComp.compare(lastSeek, term) <= 0) { seekOpt = true; } + + lastSeek = null; + + for(int i=0;i 0; + } + + @Override + public SeekStatus seekCeil(BytesRef term, boolean useCache) throws IOException { + queue.clear(); + numTop = 0; + + boolean seekOpt = false; + if (lastSeek != null && termComp.compare(lastSeek, term) <= 0) { + seekOpt = true; + } + lastSeekScratch.copy(term); lastSeek = lastSeekScratch; @@ -167,25 +217,27 @@ } else if (cmp < 0) { status = SeekStatus.NOT_FOUND; } else { - status = currentSubs[i].terms.seek(term, useCache); + status = currentSubs[i].terms.seekCeil(term, useCache); } } else { status = SeekStatus.END; } } else { - status = currentSubs[i].terms.seek(term, useCache); + status = currentSubs[i].terms.seekCeil(term, useCache); } if (status == SeekStatus.FOUND) { top[numTop++] = currentSubs[i]; current = currentSubs[i].current = currentSubs[i].terms.term(); - } else if (status == SeekStatus.NOT_FOUND) { - currentSubs[i].current 
= currentSubs[i].terms.term(); - assert currentSubs[i].current != null; - queue.add(currentSubs[i]); } else { - // enum exhausted - currentSubs[i].current = null; + if (status == SeekStatus.NOT_FOUND) { + currentSubs[i].current = currentSubs[i].terms.term(); + assert currentSubs[i].current != null; + queue.add(currentSubs[i]); + } else { + // enum exhausted + currentSubs[i].current = null; + } } } @@ -204,7 +256,7 @@ } @Override - public SeekStatus seek(long ord) throws IOException { + public void seekExact(long ord) throws IOException { throw new UnsupportedOperationException(); } Index: lucene/src/java/org/apache/lucene/index/TermState.java --- lucene/src/java/org/apache/lucene/index/TermState.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/TermState.java Fri Jun 24 06:43:32 2011 -0400 @@ -21,7 +21,7 @@ * Encapsulates all required internal state to position the associated * {@link TermsEnum} without re-seeking. * - * @see TermsEnum#seek(org.apache.lucene.util.BytesRef, TermState) + * @see TermsEnum#seekExact(org.apache.lucene.util.BytesRef, TermState) * @see TermsEnum#termState() * @lucene.experimental */ Index: lucene/src/java/org/apache/lucene/index/Terms.java --- lucene/src/java/org/apache/lucene/index/Terms.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/Terms.java Fri Jun 24 06:43:32 2011 -0400 @@ -50,7 +50,7 @@ * exist. */ public int docFreq(BytesRef text) throws IOException { final TermsEnum termsEnum = getThreadTermsEnum(); - if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekExact(text, true)) { return termsEnum.docFreq(); } else { return 0; @@ -62,7 +62,7 @@ * exist. 
*/ public long totalTermFreq(BytesRef text) throws IOException { final TermsEnum termsEnum = getThreadTermsEnum(); - if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekExact(text, true)) { return termsEnum.totalTermFreq(); } else { return 0; @@ -73,7 +73,7 @@ * method may return null if the term does not exist. */ public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException { final TermsEnum termsEnum = getThreadTermsEnum(); - if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekExact(text, true)) { return termsEnum.docs(skipDocs, reuse); } else { return null; @@ -85,7 +85,7 @@ * exists, or positions were not indexed. */ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef text, DocsAndPositionsEnum reuse) throws IOException { final TermsEnum termsEnum = getThreadTermsEnum(); - if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekExact(text, true)) { return termsEnum.docsAndPositions(skipDocs, reuse); } else { return null; @@ -97,10 +97,10 @@ * This method may return null if the term does not exist. * * @see TermsEnum#termState() - * @see TermsEnum#seek(BytesRef, TermState) */ + * @see TermsEnum#seekExact(BytesRef, TermState) */ public DocsEnum docs(Bits skipDocs, BytesRef term, TermState termState, DocsEnum reuse) throws IOException { final TermsEnum termsEnum = getThreadTermsEnum(); - termsEnum.seek(term, termState); + termsEnum.seekExact(term, termState); return termsEnum.docs(skipDocs, reuse); } @@ -110,10 +110,10 @@ * not indexed. 
* * @see TermsEnum#termState() - * @see TermsEnum#seek(BytesRef, TermState) */ + * @see TermsEnum#seekExact(BytesRef, TermState) */ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef term, TermState termState, DocsAndPositionsEnum reuse) throws IOException { final TermsEnum termsEnum = getThreadTermsEnum(); - termsEnum.seek(term, termState); + termsEnum.seekExact(term, termState); return termsEnum.docsAndPositions(skipDocs, reuse); } Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java --- lucene/src/java/org/apache/lucene/index/TermsEnum.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/TermsEnum.java Fri Jun 24 06:43:32 2011 -0400 @@ -24,18 +24,20 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; -/** Iterator to seek ({@link #seek}) or step through ({@link - * #next} terms, obtain frequency information ({@link - * #docFreq}), and obtain a {@link DocsEnum} or {@link +/** Iterator to seek ({@link #seekCeil(BytesRef)}, {@link + * #seekExact(BytesRef,boolean)}) or step through ({@link + * #next}) terms to obtain frequency information ({@link + * #docFreq}), {@link DocsEnum} or {@link * DocsAndPositionsEnum} for the current term ({@link * #docs}. * *

Term enumerations are always ordered by * {@link #getComparator}. Each term in the enumeration is - * greater than all that precede it.

+ * greater than the one before it.

* - *

On obtaining a TermsEnum, you must first call - * {@link #next} or {@link #seek}. + *

The TermsEnum is unpositioned when you first obtain it + * and you must first successfully call {@link #next} or one + * of the seek methods. * * @lucene.experimental */ public abstract class TermsEnum { @@ -48,31 +50,41 @@ return atts; } - /** Represents returned result from {@link #seek}. + /** Represents returned result from {@link #seekCeil}. * If status is FOUND, then the precise term was found. * If status is NOT_FOUND, then a different term was * found. If the status is END, the end of the iteration * was hit. */ public static enum SeekStatus {END, FOUND, NOT_FOUND}; - /** Expert: just like {@link #seek(BytesRef)} but allows + /** Attempts to seek to the exact term, returning + * true if the term is found. If this returns false, the + * enum is unpositioned. For some codecs, seekExact may + * be substantially faster than {@link #seekCeil}. */ + public boolean seekExact(BytesRef text, boolean useCache) throws IOException { + return seekCeil(text, useCache) == SeekStatus.FOUND; + } + + /** Expert: just like {@link #seekCeil(BytesRef)} but allows * you to control whether the implementation should * attempt to use its term cache (if it uses one). */ - public abstract SeekStatus seek(BytesRef text, boolean useCache) throws IOException; + public abstract SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException; - /** Seeks to the specified term. Returns SeekStatus to + /** Seeks to the specified term, if it exists, or to the + * next (ceiling) term. Returns SeekStatus to * indicate whether exact term was found, a different * term was found, or EOF was hit. The target term may - * be before or after the current term. */ - public final SeekStatus seek(BytesRef text) throws IOException { - return seek(text, true); + * be before or after the current term. If this returns + * SeekStatus.END, the enum is unpositioned.
*/ + public final SeekStatus seekCeil(BytesRef text) throws IOException { + return seekCeil(text, true); } /** Seeks to the specified term by ordinal (position) as * previously returned by {@link #ord}. The target ord - * may be before or after the current ord. See {@link - * #seek(BytesRef)}. */ - public abstract SeekStatus seek(long ord) throws IOException; + * may be before or after the current ord, and must be + * within bounds. */ + public abstract void seekExact(long ord) throws IOException; /** * Expert: Seeks a specific position by {@link TermState} previously obtained @@ -82,8 +94,7 @@ *

* Seeking by {@link TermState} should only be used iff the enum the state was * obtained from and the enum the state is used for seeking are obtained from - * the same {@link IndexReader}, otherwise a {@link #seek(BytesRef, TermState)} call can - * leave the enum in undefined state. + * the same {@link IndexReader}. *

* NOTE: Using this method with an incompatible {@link TermState} might leave * this {@link TermsEnum} in undefined state. On a segment level @@ -97,32 +108,30 @@ * @param term the term the TermState corresponds to * @param state the {@link TermState} * */ - public void seek(BytesRef term, TermState state) throws IOException { - seek(term); + public void seekExact(BytesRef term, TermState state) throws IOException { + if (!seekExact(term, true)) { + throw new IllegalArgumentException("term=" + term + " does not exist"); + } } - /** Increments the enumeration to the next element. + /** Increments the enumeration to the next term. * Returns the resulting term, or null if the end was - * hit. The returned BytesRef may be re-used across calls - * to next. */ + * hit (which means the enum is unpositioned). The + * returned BytesRef may be re-used across calls to next. */ public abstract BytesRef next() throws IOException; - /** Returns current term. Do not call this before calling - * next() for the first time, after next() returns null - * or after seek returns {@link SeekStatus#END}.*/ + /** Returns current term. Do not call this when the enum + * is unpositioned. */ public abstract BytesRef term() throws IOException; /** Returns ordinal position for current term. This is an * optional method (the codec may throw {@link * UnsupportedOperationException}). Do not call this - * before calling {@link #next} for the first time or after - * {@link #next} returns null or {@link #seek} returns - * END; */ + * when the enum is unpositioned. */ public abstract long ord() throws IOException; /** Returns the number of documents containing the current - * term. Do not call this before calling next() for the - * first time, after next() returns null or seek returns + * term. Do not call this when the enum is unpositioned. 
* {@link SeekStatus#END}.*/ public abstract int docFreq() throws IOException; @@ -135,9 +144,8 @@ public abstract long totalTermFreq() throws IOException; /** Get {@link DocsEnum} for the current term. Do not - * call this before calling {@link #next} or {@link - * #seek} for the first time. This method will not - * return null. + * call this when the enum is unpositioned. This method + * will not return null. * * @param skipDocs set bits are documents that should not * be returned @@ -145,10 +153,9 @@ public abstract DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException; /** Get {@link DocsAndPositionsEnum} for the current term. - * Do not call this before calling {@link #next} or - * {@link #seek} for the first time. This method will - * only return null if positions were not indexed into - * the postings by this codec. */ + * Do not call this when the enum is unpositioned. + * This method will only return null if positions were + * not indexed into the postings by this codec. 
*/ public abstract DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException; /** @@ -160,7 +167,7 @@ * {@link AttributeSource} states separately * * @see TermState - * @see #seek(BytesRef, TermState) + * @see #seekExact(BytesRef, TermState) */ public TermState termState() throws IOException { return new TermState() { @@ -186,10 +193,10 @@ */ public static final TermsEnum EMPTY = new TermsEnum() { @Override - public SeekStatus seek(BytesRef term, boolean useCache) { return SeekStatus.END; } + public SeekStatus seekCeil(BytesRef term, boolean useCache) { return SeekStatus.END; } @Override - public SeekStatus seek(long ord) { return SeekStatus.END; } + public void seekExact(long ord) {} @Override public BytesRef term() { @@ -242,7 +249,7 @@ } @Override - public void seek(BytesRef term, TermState state) throws IOException { + public void seekExact(BytesRef term, TermState state) throws IOException { throw new IllegalStateException("this method should never be called"); } }; Index: lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java --- lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java Fri Jun 24 06:43:32 2011 -0400 @@ -302,7 +302,7 @@ only valid if didIndexNext is true: */ private BytesRef nextIndexTerm; - /* True after seek(TermState), do defer seeking. If the app then + /* True after seekExact(TermState), do defer seeking. 
If the app then calls next() (which is not "typical"), then we'll do the real seek */ private boolean seekPending; @@ -348,7 +348,7 @@ // return NOT_FOUND so it's a waste for us to fill in // the term that was actually NOT_FOUND @Override - public SeekStatus seek(final BytesRef target, final boolean useCache) throws IOException { + public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException { if (indexEnum == null) { throw new IllegalStateException("terms index was not loaded"); @@ -376,7 +376,7 @@ if (cachedState != null) { seekPending = true; //System.out.println(" cached!"); - seek(target, cachedState); + seekExact(target, cachedState); //System.out.println(" term=" + term.utf8ToString()); return SeekStatus.FOUND; } @@ -711,7 +711,7 @@ } @Override - public void seek(BytesRef target, TermState otherState) throws IOException { + public void seekExact(BytesRef target, TermState otherState) throws IOException { //System.out.println("BTR.seek termState target=" + target.utf8ToString() + " " + target + " this=" + this); assert otherState != null && otherState instanceof BlockTermState; assert !doOrd || ((BlockTermState) otherState).ord < numTerms; @@ -731,16 +731,13 @@ } @Override - public SeekStatus seek(long ord) throws IOException { + public void seekExact(long ord) throws IOException { //System.out.println("BTR.seek by ord ord=" + ord); if (indexEnum == null) { throw new IllegalStateException("terms index was not loaded"); } - if (ord >= numTerms) { - state.ord = numTerms-1; - return SeekStatus.END; - } + assert ord < numTerms; // TODO: if ord is in same terms block and // after current ord, we should avoid this seek just @@ -768,9 +765,6 @@ left--; assert indexIsCurrent; } - - // always found - return SeekStatus.FOUND; } @Override Index: lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java --- lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java Tue Jun 21 04:53:16 2011 -0400 +++ 
lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java Fri Jun 24 06:43:32 2011 -0400 @@ -561,7 +561,15 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException { + public boolean seekExact(BytesRef text, boolean useCache /* ignored */) throws IOException { + if (VERBOSE) System.out.println("te.seekExact text=" + field.name + ":" + text.utf8ToString() + " this=" + this); + current = fstEnum.seekExact(text); + didDecode = false; + return current != null; + } + + @Override + public SeekStatus seekCeil(BytesRef text, boolean useCache /* ignored */) throws IOException { if (VERBOSE) System.out.println("te.seek text=" + field.name + ":" + text.utf8ToString() + " this=" + this); current = fstEnum.seekCeil(text); if (current == null) { @@ -656,7 +664,7 @@ } @Override - public SeekStatus seek(long ord) { + public void seekExact(long ord) { // NOTE: we could add this... throw new UnsupportedOperationException(); } Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Fri Jun 24 06:43:32 2011 -0400 @@ -745,7 +745,7 @@ } @Override - public SeekStatus seek(long ord) throws IOException { + public void seekExact(long ord) throws IOException { throw new UnsupportedOperationException(); } @@ -755,7 +755,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public SeekStatus seekCeil(BytesRef term, boolean useCache) throws IOException { if (DEBUG_SURROGATES) { System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString())); } Index: lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java --- lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Tue Jun 21 04:53:16 2011 -0400 +++ 
lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Fri Jun 24 06:43:32 2011 -0400 @@ -131,7 +131,23 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException { + public boolean seekExact(BytesRef text, boolean useCache /* ignored */) throws IOException { + + final BytesRefFSTEnum.InputOutput>> result = fstEnum.seekExact(text); + if (result != null) { + PairOutputs.Pair> pair1 = result.output; + PairOutputs.Pair pair2 = pair1.output2; + docsStart = pair1.output1; + docFreq = pair2.output1.intValue(); + totalTermFreq = pair2.output2; + return true; + } else { + return false; + } + } + + @Override + public SeekStatus seekCeil(BytesRef text, boolean useCache /* ignored */) throws IOException { //System.out.println("seek to text=" + text.utf8ToString()); final BytesRefFSTEnum.InputOutput>> result = fstEnum.seekCeil(text); @@ -183,7 +199,7 @@ } @Override - public SeekStatus seek(long ord) { + public void seekExact(long ord) { throw new UnsupportedOperationException(); } Index: lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java --- lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/search/FilteredTermsEnum.java Fri Jun 24 06:43:32 2011 -0400 @@ -135,7 +135,7 @@ * @throws UnsupportedOperationException */ @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + public boolean seekExact(BytesRef term, boolean useCache) throws IOException { throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); } @@ -143,7 +143,15 @@ * @throws UnsupportedOperationException */ @Override - public SeekStatus seek(long ord) throws IOException { + public SeekStatus seekCeil(BytesRef term, boolean useCache) throws IOException { + throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); + } + + /** This enum does not 
support seeking! + * @throws UnsupportedOperationException + */ + @Override + public void seekExact(long ord) throws IOException { throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); } @@ -166,7 +174,7 @@ * @throws UnsupportedOperationException */ @Override - public void seek(BytesRef term, TermState state) throws IOException { + public void seekExact(BytesRef term, TermState state) throws IOException { throw new UnsupportedOperationException(getClass().getName()+" does not support seeking"); } @@ -189,7 +197,7 @@ final BytesRef t = nextSeekTerm(actualTerm); // Make sure we always seek forward: assert actualTerm == null || t == null || getComparator().compare(t, actualTerm) > 0: "curTerm=" + actualTerm + " seekTerm=" + t; - if (t == null || tenum.seek(t, false) == SeekStatus.END) { + if (t == null || tenum.seekCeil(t, false) == SeekStatus.END) { // no more terms to seek to or enum exhausted return null; } Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Fri Jun 24 06:43:32 2011 -0400 @@ -263,8 +263,8 @@ } @Override - public void seek(BytesRef term, TermState state) throws IOException { - actualEnum.seek(term, state); + public void seekExact(BytesRef term, TermState state) throws IOException { + actualEnum.seekExact(term, state); } @Override @@ -283,13 +283,18 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { - return actualEnum.seek(text, useCache); + public boolean seekExact(BytesRef text, boolean useCache) throws IOException { + return actualEnum.seekExact(text, useCache); + } + + @Override + public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException { + return actualEnum.seekCeil(text, useCache); } @Override - public SeekStatus seek(long ord) throws IOException { - return 
actualEnum.seek(ord); + public void seekExact(long ord) throws IOException { + actualEnum.seekExact(ord); } @Override Index: lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java --- lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java Fri Jun 24 06:43:32 2011 -0400 @@ -241,13 +241,13 @@ } @Override - public SeekStatus seek(BytesRef text, boolean useCache) throws IOException { + public SeekStatus seekCeil(BytesRef text, boolean useCache /* ignored */) throws IOException { int low = 1; int high = numOrd-1; while (low <= high) { int mid = (low + high) >>> 1; - seek(mid); + seekExact(mid); int cmp = term.compareTo(text); if (cmp < 0) @@ -261,19 +261,17 @@ if (low == numOrd) { return SeekStatus.END; } else { - seek(low); + seekExact(low); return SeekStatus.NOT_FOUND; } } - @Override - public SeekStatus seek(long ord) throws IOException { + public void seekExact(long ord) throws IOException { assert(ord >= 0 && ord <= numOrd); // TODO: if gap is small, could iterate from current position? Or let user decide that? 
currentBlockNumber = bytes.fillAndGetIndex(term, termOrdToBytesOffset.get((int)ord)); end = blockEnds[currentBlockNumber]; currentOrd = (int)ord; - return SeekStatus.FOUND; } @Override @@ -341,9 +339,9 @@ } @Override - public void seek(BytesRef term, TermState state) throws IOException { + public void seekExact(BytesRef term, TermState state) throws IOException { assert state != null && state instanceof OrdTermState; - this.seek(((OrdTermState)state).ord); + this.seekExact(((OrdTermState)state).ord); } @Override Index: lucene/src/java/org/apache/lucene/util/PerReaderTermState.java --- lucene/src/java/org/apache/lucene/util/PerReaderTermState.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/PerReaderTermState.java Fri Jun 24 06:43:32 2011 -0400 @@ -90,7 +90,7 @@ final Terms terms = fields.terms(field); if (terms != null) { final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share! - if (SeekStatus.FOUND == termsEnum.seek(bytes, cache)) { + if (termsEnum.seekExact(bytes, cache)) { final TermState termState = termsEnum.termState(); perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq()); } Index: lucene/src/java/org/apache/lucene/util/fst/BytesRefFSTEnum.java --- lucene/src/java/org/apache/lucene/util/fst/BytesRefFSTEnum.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/fst/BytesRefFSTEnum.java Fri Jun 24 06:43:32 2011 -0400 @@ -71,6 +71,21 @@ return setResult(); } + /** Seeks to exactly this term, returning null if the term + * doesn't exist. This is faster than using {@link + * #seekFloor} or {@link #seekCeil} because it + * short-circuits as soon as the match is not found. 
*/ + public InputOutput seekExact(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + if (super.doSeekExact()) { + assert upto == 1+target.length; + return setResult(); + } else { + return null; + } + } + @Override protected int getTargetLabel() { if (upto-1 == target.length) { Index: lucene/src/java/org/apache/lucene/util/fst/FST.java --- lucene/src/java/org/apache/lucene/util/fst/FST.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/fst/FST.java Fri Jun 24 06:43:32 2011 -0400 @@ -704,6 +704,12 @@ if (labelToMatch == END_LABEL) { if (follow.isFinal()) { + if (follow.target <= 0) { + arc.flags = BIT_LAST_ARC; + } else { + arc.flags = 0; + arc.nextArc = follow.target; + } arc.output = follow.nextFinalOutput; arc.label = END_LABEL; return arc; Index: lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java --- lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java Fri Jun 24 06:43:32 2011 -0400 @@ -73,6 +73,7 @@ final int cmp = getCurrentLabel() - getTargetLabel(); if (cmp < 0) { // seek forward + //System.out.println(" seek fwd"); break; } else if (cmp > 0) { // seek backwards -- reset this arc to the first arc @@ -83,6 +84,7 @@ } upto++; } + //System.out.println(" fall through upto=" + upto); } protected void doNext() throws IOException { @@ -352,7 +354,7 @@ //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1)); fst.readNextRealArc(arc); assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel; - assert arc.label < targetLabel; + assert arc.label < targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel; pushLast(); return; } @@ -410,6 +412,48 @@ } } + /** Seeks to exactly target term. */ + protected boolean doSeekExact() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + //System.out.println("FE: seek exact upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + //System.out.println("FE: after rewind upto=" + upto); + FST.Arc arc = getArc(upto-1); + int targetLabel = getTargetLabel(); + + while(true) { + //System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel)); + final FST.Arc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto)); + if (nextArc == null) { + // short circuit + //upto--; + //upto = 0; + fst.readFirstTargetArc(arc, getArc(upto)); + //System.out.println(" no match upto=" + upto); + return false; + } + // Match -- recurse: + output[upto] = fst.outputs.add(output[upto-1], nextArc.output); + if (targetLabel == FST.END_LABEL) { + //System.out.println(" return found; upto=" + upto + " output=" + output[upto] + " nextArc=" + nextArc.isLast()); + return true; + } + setCurrentLabel(targetLabel); + incr(); + targetLabel = getTargetLabel(); + arc = nextArc; + } + } + private void incr() { upto++; grow(); Index: lucene/src/java/org/apache/lucene/util/fst/IntsRefFSTEnum.java --- lucene/src/java/org/apache/lucene/util/fst/IntsRefFSTEnum.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/fst/IntsRefFSTEnum.java Fri Jun 24 06:43:32 2011 -0400 @@ -71,6 +71,21 @@ return setResult(); } + /** Seeks to exactly this term, returning null if the term + * doesn't exist. This is faster than using {@link + * #seekFloor} or {@link #seekCeil} because it + * short-circuits as soon as the match is not found. 
*/ + public InputOutput seekExact(IntsRef target) throws IOException { + this.target = target; + targetLength = target.length; + if (super.doSeekExact()) { + assert upto == 1+target.length; + return setResult(); + } else { + return null; + } + } + @Override protected int getTargetLabel() { if (upto-1 == target.length) { Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java Fri Jun 24 06:43:32 2011 -0400 @@ -305,7 +305,7 @@ } @Override - public SeekStatus seek(BytesRef term, boolean useCache) { + public SeekStatus seekCeil(BytesRef term, boolean useCache) { current = term.utf8ToString(); it = null; if (ramField.termToDocs.containsKey(current)) { @@ -320,7 +320,7 @@ } @Override - public SeekStatus seek(long ord) { + public void seekExact(long ord) { throw new UnsupportedOperationException(); } Index: lucene/src/test/org/apache/lucene/index/Test2BTerms.java --- lucene/src/test/org/apache/lucene/index/Test2BTerms.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/Test2BTerms.java Fri Jun 24 06:43:32 2011 -0400 @@ -255,7 +255,7 @@ final long t1 = System.currentTimeMillis(); System.out.println(" took " + (t1-t0) + " millis"); - TermsEnum.SeekStatus result = termsEnum.seek(term); + TermsEnum.SeekStatus result = termsEnum.seekCeil(term); if (result != TermsEnum.SeekStatus.FOUND) { if (result == TermsEnum.SeekStatus.END) { System.out.println(" FAILED: got END"); Index: lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java --- lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java Fri Jun 24 06:43:32 2011 -0400 @@ -632,24 +632,24 @@ // should be found exactly assertEquals(TermsEnum.SeekStatus.FOUND, - terms.seek(aaaTerm)); + 
terms.seekCeil(aaaTerm)); assertEquals(35, countDocs(terms.docs(null, null))); assertNull(terms.next()); // should hit end of field assertEquals(TermsEnum.SeekStatus.END, - terms.seek(new BytesRef("bbb"))); + terms.seekCeil(new BytesRef("bbb"))); assertNull(terms.next()); // should seek to aaa assertEquals(TermsEnum.SeekStatus.NOT_FOUND, - terms.seek(new BytesRef("a"))); + terms.seekCeil(new BytesRef("a"))); assertTrue(terms.term().bytesEquals(aaaTerm)); assertEquals(35, countDocs(terms.docs(null, null))); assertNull(terms.next()); assertEquals(TermsEnum.SeekStatus.FOUND, - terms.seek(aaaTerm)); + terms.seekCeil(aaaTerm)); assertEquals(35, countDocs(terms.docs(null, null))); assertNull(terms.next()); Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java --- lucene/src/test/org/apache/lucene/index/TestCodecs.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java Fri Jun 24 06:43:32 2011 -0400 @@ -266,7 +266,7 @@ assertNull(termsEnum.next()); for(int i=0;i=0;i--) { - assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(new BytesRef(field.terms[i].text2))); + assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(field.terms[i].text2))); assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); } // Seek to each term by ord, backwards for(int i=field.terms.length-1;i>=0;i--) { try { - assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); + termsEnum.seekExact(i); assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); } catch (UnsupportedOperationException uoe) { @@ -515,7 +516,7 @@ } // Seek to 
non-existent empty-string term - status = termsEnum.seek(new BytesRef("")); + status = termsEnum.seekCeil(new BytesRef("")); assertNotNull(status); //assertEquals(TermsEnum.SeekStatus.NOT_FOUND, status); @@ -523,7 +524,7 @@ assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[0].text2))); // Test docs enum - termsEnum.seek(new BytesRef("")); + termsEnum.seekCeil(new BytesRef("")); upto = 0; do { term = field.terms[upto]; Index: lucene/src/test/org/apache/lucene/index/TestDirectoryReader.java --- lucene/src/test/org/apache/lucene/index/TestDirectoryReader.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestDirectoryReader.java Fri Jun 24 06:43:32 2011 -0400 @@ -166,14 +166,14 @@ // test mixing up TermDocs and TermEnums from different readers. TermsEnum te2 = MultiFields.getTerms(mr2, "body").iterator(); - te2.seek(new BytesRef("wow")); + te2.seekCeil(new BytesRef("wow")); DocsEnum td = MultiFields.getTermDocsEnum(mr2, MultiFields.getDeletedDocs(mr2), "body", te2.term()); TermsEnum te3 = MultiFields.getTerms(mr3, "body").iterator(); - te3.seek(new BytesRef("wow")); + te3.seekCeil(new BytesRef("wow")); td = te3.docs(MultiFields.getDeletedDocs(mr3), td); Index: lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java --- lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java Tue Jun 21 04:53:16 2011 -0400 +++ lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java Fri Jun 24 06:43:32 2011 -0400 @@ -479,7 +479,7 @@ Terms terms = MultiFields.getTerms(r, "field"); if (terms != null) { TermsEnum termsEnum = terms.iterator(); - TermsEnum.SeekStatus result = termsEnum.seek(prefixRef, false); + TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef, false); if (result != TermsEnum.SeekStatus.END) { assertFalse("term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(), termsEnum.term().startsWith(prefixRef)); } else { @@ -494,7 +494,7 @@ if (VERBOSE) { System.out.println("TEST: 
TERMS:"); - te.seek(0); + te.seekExact(0); while(true) { System.out.println(" ord=" + te.ord() + " term=" + te.term().utf8ToString()); if (te.next() == null) { @@ -515,7 +515,7 @@ while(true) { final int chunk = iter.read(buffer); for(int idx=0;idx seekResult; - if (random.nextBoolean()) { + if (random.nextInt(3) == 0) { + if (VERBOSE) { + System.out.println(" do non-exist seekExact term=" + inputToString(inputMode, term)); + } + seekResult = fstEnum.seekExact(term); + pos = -1; + } else if (random.nextBoolean()) { if (VERBOSE) { System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term)); } @@ -625,7 +631,12 @@ // seek to term that does exist: InputOutput pair = pairs.get(random.nextInt(pairs.size())); final IntsRefFSTEnum.InputOutput seekResult; - if (random.nextBoolean()) { + if (random.nextInt(3) == 2) { + if (VERBOSE) { + System.out.println(" do exists seekExact term=" + inputToString(inputMode, pair.input)); + } + seekResult = fstEnum.seekExact(pair.input); + } else if (random.nextBoolean()) { if (VERBOSE) { System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input)); } @@ -1061,7 +1072,7 @@ System.out.println("TEST: seek " + randomTerm.utf8ToString() + " " + randomTerm); } - final TermsEnum.SeekStatus seekResult = termsEnum.seek(randomTerm); + final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm); final BytesRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(randomTerm); if (seekResult == TermsEnum.SeekStatus.END) { Index: modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java --- modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java Tue Jun 21 04:53:16 2011 -0400 +++ modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java Fri Jun 24 06:43:32 2011 -0400 @@ -539,7 +539,7 @@ // we have a non-empty index, check if the term exists currentTerm.copy(word); for (TermsEnum te : termsEnums) { - if (te.seek(currentTerm, false) == 
TermsEnum.SeekStatus.FOUND) { + if (te.seekExact(currentTerm, false)) { continue terms; } } Index: solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java --- solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java Tue Jun 21 04:53:16 2011 -0400 +++ solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java Fri Jun 24 06:43:32 2011 -0400 @@ -342,7 +342,7 @@ Terms terms = MultiFields.getTerms(reader, currentTerm.field()); if (terms != null) { TermsEnum termsEnum = terms.iterator(); - if (termsEnum.seek(term) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seekExact(term, true)) { result = termsEnum.docFreq(); } } Index: solr/src/java/org/apache/solr/handler/component/TermsComponent.java --- solr/src/java/org/apache/solr/handler/component/TermsComponent.java Tue Jun 21 04:53:16 2011 -0400 +++ solr/src/java/org/apache/solr/handler/component/TermsComponent.java Fri Jun 24 06:43:32 2011 -0400 @@ -162,7 +162,7 @@ BytesRef term = null; if (lowerBytes != null) { - if (termsEnum.seek(lowerBytes, true) == TermsEnum.SeekStatus.END) { + if (termsEnum.seekCeil(lowerBytes, true) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); Index: solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java --- solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java Tue Jun 21 04:53:16 2011 -0400 +++ solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java Fri Jun 24 06:43:32 2011 -0400 @@ -157,7 +157,7 @@ } if (seg.pos < seg.endTermIndex) { seg.tenum = seg.si.getTermsEnum(); - seg.tenum.seek(seg.pos); + seg.tenum.seekExact(seg.pos); seg.tempBR = seg.tenum.term(); queue.add(seg); } Index: solr/src/java/org/apache/solr/request/SimpleFacets.java --- solr/src/java/org/apache/solr/request/SimpleFacets.java Tue Jun 21 04:53:16 2011 -0400 +++ solr/src/java/org/apache/solr/request/SimpleFacets.java Fri Jun 24 06:43:32 2011 -0400 @@ -641,7 +641,7 @@ // facet.offset when 
sorting by index order. if (startTermBytes != null) { - if (termsEnum.seek(startTermBytes, true) == TermsEnum.SeekStatus.END) { + if (termsEnum.seekCeil(startTermBytes, true) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); Index: solr/src/java/org/apache/solr/request/UnInvertedField.java --- solr/src/java/org/apache/solr/request/UnInvertedField.java Tue Jun 21 04:53:16 2011 -0400 +++ solr/src/java/org/apache/solr/request/UnInvertedField.java Fri Jun 24 06:43:32 2011 -0400 @@ -228,13 +228,13 @@ TermsEnum te = getOrdTermsEnum(searcher.getIndexReader()); if (prefix != null && prefix.length() > 0) { final BytesRef prefixBr = new BytesRef(prefix); - if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) { + if (te.seekCeil(prefixBr, true) == TermsEnum.SeekStatus.END) { startTerm = numTermsInField; } else { startTerm = (int) te.ord(); } prefixBr.append(UnicodeUtil.BIG_TERM); - if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) { + if (te.seekCeil(prefixBr, true) == TermsEnum.SeekStatus.END) { endTerm = numTermsInField; } else { endTerm = (int) te.ord(); Index: solr/src/java/org/apache/solr/search/JoinQParserPlugin.java --- solr/src/java/org/apache/solr/search/JoinQParserPlugin.java Tue Jun 21 04:53:16 2011 -0400 +++ solr/src/java/org/apache/solr/search/JoinQParserPlugin.java Fri Jun 24 06:43:32 2011 -0400 @@ -281,7 +281,7 @@ if (prefix == null) { term = termsEnum.next(); } else { - if (termsEnum.seek(prefix, true) != TermsEnum.SeekStatus.END) { + if (termsEnum.seekCeil(prefix, true) != TermsEnum.SeekStatus.END) { term = termsEnum.term(); } } @@ -366,7 +366,7 @@ if (intersects) { fromTermHits++; fromTermHitsTotalDf++; - TermsEnum.SeekStatus status = toTermsEnum.seek(term); + TermsEnum.SeekStatus status = toTermsEnum.seekCeil(term); if (status == TermsEnum.SeekStatus.END) break; if (status == TermsEnum.SeekStatus.FOUND) { toTermHits++; Index: solr/src/java/org/apache/solr/search/function/FileFloatSource.java --- 
solr/src/java/org/apache/solr/search/function/FileFloatSource.java Tue Jun 21 04:53:16 2011 -0400 +++ solr/src/java/org/apache/solr/search/function/FileFloatSource.java Fri Jun 24 06:43:32 2011 -0400 @@ -268,7 +268,7 @@ continue; // go to next line in file.. leave values as default. } - if (termsEnum.seek(internalKey, false) != TermsEnum.SeekStatus.FOUND) { + if (!termsEnum.seekExact(internalKey, false)) { if (notFoundCount<10) { // collect first 10 not found for logging notFound.add(key); } Index: solr/src/test/org/apache/solr/request/TestFaceting.java --- solr/src/test/org/apache/solr/request/TestFaceting.java Tue Jun 21 04:53:16 2011 -0400 +++ solr/src/test/org/apache/solr/request/TestFaceting.java Fri Jun 24 06:43:32 2011 -0400 @@ -87,7 +87,7 @@ if (te == null) { br = null; } else { - TermsEnum.SeekStatus status = te.seek(new BytesRef(s)); + TermsEnum.SeekStatus status = te.seekCeil(new BytesRef(s)); if (status == TermsEnum.SeekStatus.END) { br = null; } else { @@ -103,7 +103,7 @@ // test seeking before term if (size>0) { - assertEquals(size>0, te.seek(new BytesRef("000"), true) != TermsEnum.SeekStatus.END); + assertEquals(size>0, te.seekCeil(new BytesRef("000"), true) != TermsEnum.SeekStatus.END); assertEquals(0, te.ord()); assertEquals(t(0), te.term().utf8ToString()); } @@ -113,7 +113,7 @@ for (int i=0; i