Index: lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat =================================================================== --- lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (revision 1362238) +++ lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (working copy) @@ -17,3 +17,4 @@ org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat org.apache.lucene.codecs.memory.MemoryPostingsFormat +org.apache.lucene.codecs.memory.DirectPostingsFormat Index: lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java (revision 1362238) +++ lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java (working copy) @@ -976,7 +976,7 @@ // Don't proceed if picked Codec is in the list of illegal ones. final String format = _TestUtil.getPostingsFormat("f"); assumeFalse("Format: " + format + " does not support ReaderTermsIndexDivisor!", - (format.equals("SimpleText") || format.equals("Memory"))); + (format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct"))); Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, conf); Index: lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java (revision 1362238) +++ lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java (working copy) @@ -132,8 +132,9 @@ public void testLazySkipping() throws IOException { final String fieldFormat = _TestUtil.getPostingsFormat(this.field); - assumeFalse("This test cannot run with Memory codec", fieldFormat.equals("Memory")); - assumeFalse("This test cannot run with SimpleText codec", fieldFormat.equals("SimpleText")); + assumeFalse("This test cannot run with Memory postings format", fieldFormat.equals("Memory")); + assumeFalse("This test cannot run with Direct postings format", fieldFormat.equals("Direct")); + assumeFalse("This test cannot run with SimpleText postings format", fieldFormat.equals("SimpleText")); // test whether only the minimum amount of seeks() // are performed Index: lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java (working copy) @@ -0,0 +1,2195 @@ +package org.apache.lucene.codecs.memory; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.OrdTermState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.Transition; + +// NOTE: 2B max term bytes + +// perf +// - build depth-N prefix hash? +// - or: longer dense skip lists than just next byte? + +// nocommit explain how this does impl ord... +// nocommit random test; mixup minSkipCount too... + +// nocommit can we pass/wrap arbitrary PF? + +// nocommit if io context is merge don't cache? + +/** Wraps {@link Lucene40PostingsFormat} format for on-disk + * storage, but then at read time loads and stores all + * terms & postings directly in RAM as java objects. 
*/ + +public class DirectPostingsFormat extends PostingsFormat { + + private final int minSkipCount; + private final int lowFreqCutoff; + + // nocommit randomize these during tests...: + private final static int DEFAULT_MIN_SKIP_COUNT = 8; + private final static int DEFAULT_LOW_FREQ_CUTOFF = 32; + + private static final boolean DEBUG = false; + private static final boolean DEBUG2 = false; + + public DirectPostingsFormat() { + this(DEFAULT_MIN_SKIP_COUNT, DEFAULT_LOW_FREQ_CUTOFF); + } + + public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) { + super("Direct"); + this.minSkipCount = minSkipCount; + this.lowFreqCutoff = lowFreqCutoff; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return PostingsFormat.forName("Lucene40").fieldsConsumer(state); + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + FieldsProducer postings = PostingsFormat.forName("Lucene40").fieldsProducer(state); + FieldsProducer loadedPostings; + try { + loadedPostings = new DirectFields(state, postings, minSkipCount, lowFreqCutoff); + } finally { + postings.close(); + } + return loadedPostings; + } + + private static final class DirectFields extends FieldsProducer { + private final Map<String,DirectField> fields = new TreeMap<String,DirectField>(); + + public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff) throws IOException { + FieldsEnum fieldsEnum = fields.iterator(); + String field; + while ((field = fieldsEnum.next()) != null) { + this.fields.put(field, new DirectField(state, field, fieldsEnum.terms(), minSkipCount, lowFreqCutoff)); + } + } + + @Override + public FieldsEnum iterator() { + + final Iterator<Map.Entry<String,DirectField>> iter = fields.entrySet().iterator(); + + return new FieldsEnum() { + Map.Entry<String,DirectField> current; + + @Override + public String next() { + if (iter.hasNext()) { + current = iter.next(); + return current.getKey(); + } else { + return null; + } + } + + @Override + public Terms terms() { + return current.getValue(); + } + }; + } + + @Override + public Terms terms(String field) { + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + + @Override + public long getUniqueTermCount() { + long numTerms = 0; + for(DirectField field : fields.values()) { + numTerms += field.terms.length; + } + return numTerms; + } + + @Override + public void close() { + } + } + + private final static class DirectField extends Terms { + + private static abstract class TermAndSkip { + public int[] skips; + } + + private static final class LowFreqTerm extends TermAndSkip { + public final int[] postings; + public final byte[] payloads; + public final int docFreq; + public final int totalTermFreq; + + public LowFreqTerm(int[] postings, byte[] payloads, int docFreq, int totalTermFreq) { + this.postings = postings; + this.payloads = payloads; + this.docFreq = docFreq; + this.totalTermFreq = totalTermFreq; + } + } + + // nocommit specialize into prx/no-prx/no-frq?
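+ // Terms with docFreq > lowFreqCutoff are instead held as parallel arrays, one + // entry per matching doc: docIDs[i] and freqs[i], plus a per-doc int[] of + // positions (each position followed by its startOffset/endOffset when offsets + // are indexed) and a per-position byte[] payload: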
+ private static final class HighFreqTerm extends TermAndSkip { + public final long totalTermFreq; + public final int[] docIDs; + public final int[] freqs; + public final int[][] positions; + public final byte[][][] payloads; + + public HighFreqTerm(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads, long totalTermFreq) { + this.docIDs = docIDs; + this.freqs = freqs; + this.positions = positions; + this.payloads = payloads; + this.totalTermFreq = totalTermFreq; + } + } + + // nocommit may need an index (hashmap) of first N bytes + // too? + + private final byte[] termBytes; + private final int[] termOffsets; + + private final int[] skips; + private final int[] skipOffsets; + + private final TermAndSkip[] terms; + private final boolean hasFreq; + private final boolean hasPos; + private final boolean hasOffsets; + private final boolean hasPayloads; + private final long sumTotalTermFreq; + private final int docCount; + private final long sumDocFreq; + + // nocommit only used on init ... pull into builder? + private int count; + private int[] sameCounts = new int[10]; + private final int minSkipCount; + + private int skipCount; + + private final static class IntArrayWriter { + private int[] ints = new int[10]; + private int upto; + + public void add(int value) { + if (ints.length == upto) { + ints = ArrayUtil.grow(ints); + } + ints[upto++] = value; + } + + public int[] get() { + final int[] arr = new int[upto]; + System.arraycopy(ints, 0, arr, 0, upto); + upto = 0; + return arr; + } + } + + public DirectField(SegmentReadState state, String field, Terms termsIn, int minSkipCount, int lowFreqCutoff) throws IOException { + final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); + + sumTotalTermFreq = termsIn.getSumTotalTermFreq(); + sumDocFreq = termsIn.getSumDocFreq(); + docCount = termsIn.getDocCount(); + + final int numTerms = (int) termsIn.size(); + if (numTerms == -1) { + throw new IllegalArgumentException("codec does not provide Terms.size()"); + } + terms = new TermAndSkip[numTerms]; + termOffsets = new int[1+numTerms]; + + byte[] termBytes = new byte[1024]; + + // nocommit pull into a builder? 
only need at build time: + this.minSkipCount = minSkipCount; + + hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_ONLY) > 0; + hasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) > 0; + hasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0; + hasPayloads = fieldInfo.hasPayloads(); + + BytesRef term; + DocsEnum docsEnum = null; + DocsAndPositionsEnum docsAndPositionsEnum = null; + final TermsEnum termsEnum = termsIn.iterator(null); + int termOffset = 0; + + final IntArrayWriter scratch = new IntArrayWriter(); + + // Used for payloads, if any: + final RAMOutputStream ros = new RAMOutputStream(); + + if (DEBUG) { + System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + " hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" + hasPayloads); + } + + while ((term = termsEnum.next()) != null) { + final int docFreq = termsEnum.docFreq(); + final long totalTermFreq = termsEnum.totalTermFreq(); + if (false && DEBUG) { + System.out.println(" ADD term=" + term.utf8ToString() + " dF=" + docFreq); + } + + termOffsets[count] = termOffset; + + if (termBytes.length < (termOffset + term.length)) { + termBytes = ArrayUtil.grow(termBytes, termOffset + term.length); + } + System.arraycopy(term.bytes, term.offset, termBytes, termOffset, term.length); + termOffset += term.length; + termOffsets[count+1] = termOffset; + + if (hasPos) { + docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum, hasOffsets); + } else { + docsEnum = termsEnum.docs(null, docsEnum, hasFreq); + } + + final TermAndSkip ent; + + final DocsEnum docsEnum2; + if (hasPos) { + docsEnum2 = docsAndPositionsEnum; + } else { + docsEnum2 = docsEnum; + } + + int docID; + + if (docFreq <= lowFreqCutoff) { + + ros.reset(); + + // Pack postings for low-freq terms into a single int[]: + while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS) { + scratch.add(docID); + if (hasFreq) { + final int freq = docsEnum2.freq(); + scratch.add(freq); + if (hasPos) { + for(int pos=0;pos 0) { + final int lastTermLength = termOffsets[termOrd] - termOffsets[termOrd-1]; + final int limit = Math.min(termLength, lastTermLength); + + int lastTermOffset = termOffsets[termOrd-1]; + int termOffset = termOffsets[termOrd]; + + int i = 0; + for(;i= minSkipCount) { + // Go back and add a skip pointer: + saveSkip(termOrd, sameCounts[i]); + } + sameCounts[i] = 1; + } + break; + } + } + + for(;i= minSkipCount) { + // Go back and add a skip pointer: + saveSkip(termOrd, sameCounts[i]); + } + sameCounts[i] = 0; + } + for(int j=limit;j 1) { + for(int pos=0;pos getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + private final class DirectTermsEnum extends TermsEnum { + + private final BytesRef scratch = new BytesRef(); + private int termOrd; + + // nocommit silliness + private final TermAndSkip[] termsSav; + + public DirectTermsEnum() { + this.termsSav = terms; + } + + private BytesRef setTerm() { + scratch.bytes = termBytes; + scratch.offset = termOffsets[termOrd]; + scratch.length = termOffsets[termOrd+1] - termOffsets[termOrd]; + return scratch; + } + + public void reset() { + termOrd = -1; + } + + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public BytesRef next() { + termOrd++; + if (termOrd < terms.length) { + return setTerm(); + } else { + return null; + } + } + + @Override + public TermState termState() { + 
OrdTermState state = new OrdTermState(); + state.ord = termOrd; + return state; + } + + // If non-negative, exact match; else, -ord-1, where ord + // is where you would insert the term. + private int findTerm(BytesRef term) { + + // Just do binary search: should be (constant factor) + // faster than using the skip list: + int low = 0; + int high = terms.length-1; + + while (low <= high) { + int mid = (low + high) >>> 1; + int cmp = compare(mid, term); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; // key found + } + } + + return -(low + 1); // key not found. + } + + @Override + public SeekStatus seekCeil(BytesRef term, boolean useCache) { + // nocommit can we leverage current state? use the + // skip pointers? + final int ord = findTerm(term); + if (DEBUG) { + System.out.println(" find term=" + term.utf8ToString() + " ord=" + ord); + } + if (ord >= 0) { + termOrd = ord; + setTerm(); + return SeekStatus.FOUND; + } else if (ord == -terms.length-1) { + // nocommit must i return null from term() after this....? + return SeekStatus.END; + } else { + termOrd = -ord - 1; + setTerm(); + return SeekStatus.NOT_FOUND; + } + } + + // nocommit seekExact? leverage current state? + + @Override + public boolean seekExact(BytesRef term, boolean useCache) { + // nocommit can we leverage current state? + final int ord = findTerm(term); + if (DEBUG) { + System.out.println(" find term=" + term.utf8ToString() + " ord=" + ord); + } + if (ord >= 0) { + termOrd = ord; + setTerm(); + return true; + } else { + return false; + } + } + + @Override + public void seekExact(long ord) { + termOrd = (int) ord; + setTerm(); + } + + @Override + public void seekExact(BytesRef term, TermState state) throws IOException { + termOrd = (int) ((OrdTermState) state).ord; + setTerm(); + assert term.equals(scratch); + } + + @Override + public BytesRef term() { + return scratch; + } + + @Override + public long ord() { + return termOrd; + } + + @Override + public int docFreq() { + if (terms[termOrd] instanceof LowFreqTerm) { + return ((LowFreqTerm) terms[termOrd]).docFreq; + } else { + return ((HighFreqTerm) terms[termOrd]).docIDs.length; + } + } + + @Override + public long totalTermFreq() { + if (terms[termOrd] instanceof LowFreqTerm) { + return ((LowFreqTerm) terms[termOrd]).totalTermFreq; + } else { + return ((HighFreqTerm) terms[termOrd]).totalTermFreq; + } + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs) { + if (needsFreqs && !hasFreq) { + return null; + } + + // nocommit need tricky reuse logic so we can pair up + // low/high freq enums... 
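+ + // A LowFreqTerm packs all of its postings into one int[]: for each doc, the + // docID, then (when freqs are indexed) the freq, then that many positions; a + // position occupies one int, or three when offsets are stored, plus one more + // int (the payload length) when payloads are present -- this is the posLen + // computed below.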
+ + if (terms[termOrd] instanceof LowFreqTerm) { + final int[] postings = ((LowFreqTerm) terms[termOrd]).postings; + if (hasFreq) { + if (hasPos) { + int posLen; + if (hasOffsets) { + posLen = 3; + } else { + posLen = 1; + } + if (hasPayloads) { + posLen++; + } + return new LowFreqDocsEnum(liveDocs, posLen).reset(postings); + } else { + return new LowFreqDocsEnumNoPos(liveDocs).reset(postings); + } + } else { + return new LowFreqDocsEnumNoTF(liveDocs).reset(postings); + } + } else { + final HighFreqTerm term = (HighFreqTerm) terms[termOrd]; + //System.out.println(" DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() + ": " + term.docIDs.length + " docs"); + return new HighFreqDocsEnum(liveDocs).reset(term.docIDs, term.freqs); + } + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) { + if (!hasPos) { + return null; + } + if (needsOffsets && !hasOffsets) { + return null; + } + + // nocommit need tricky reuse logic so we can pair up + // low/high freq enums... + + if (terms[termOrd] instanceof LowFreqTerm) { + final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]); + final int[] postings = term.postings; + final byte[] payloads = term.payloads; + return new LowFreqDocsAndPositionsEnum(liveDocs, hasOffsets, hasPayloads).reset(postings, payloads); + } else { + final HighFreqTerm term = (HighFreqTerm) terms[termOrd]; + return new HighFreqDocsAndPositionsEnum(liveDocs, hasOffsets).reset(term.docIDs, term.freqs, term.positions, term.payloads); + } + } + } + + private final class DirectIntersectTermsEnum extends TermsEnum { + private final RunAutomaton runAutomaton; + private final CompiledAutomaton compiledAutomaton; + private int termOrd; + private final BytesRef scratch = new BytesRef(); + + private final class State { + int changeOrd; + int state; + Transition[] transitions; + int transitionUpto; + int transitionMax; + int transitionMin; + } + + private State[] states; + private int stateUpto; + + public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) { + runAutomaton = compiled.runAutomaton; + compiledAutomaton = compiled; + termOrd = -1; + states = new State[1]; + states[0] = new State(); + states[0].changeOrd = terms.length; + states[0].state = runAutomaton.getInitialState(); + states[0].transitions = compiledAutomaton.sortedTransitions[states[0].state]; + // nocommit if no transitions... we are done? 
states[0].transitionUpto = -1; + states[0].transitionMax = -1; + + //System.out.println("IE.init startTerm=" + startTerm); + + if (startTerm != null) { + int skipUpto = 0; + if (startTerm.length == 0) { + if (terms.length > 0 && termOffsets[1] == 0) { + termOrd = 0; + } + } else { + termOrd++; + + nextLabel: + for(int i=0;i<startTerm.length;i++) { + final int label = startTerm.bytes[startTerm.offset+i] & 0xFF; + + while (label > states[i].transitionMax) { + states[i].transitionUpto++; + assert states[i].transitionUpto < states[i].transitions.length; + states[i].transitionMin = states[i].transitions[states[i].transitionUpto].getMin(); + states[i].transitionMax = states[i].transitions[states[i].transitionUpto].getMax(); + assert states[i].transitionMin >= 0; + assert states[i].transitionMin <= 255; + assert states[i].transitionMax >= 0; + assert states[i].transitionMax <= 255; + } + + // Skip forwards until we find a term matching + // the label at this position: + while (termOrd < terms.length) { + final int skipOffset = skipOffsets[termOrd]; + final int numSkips = skipOffsets[termOrd+1] - skipOffset; + final int termOffset = termOffsets[termOrd]; + final int termLength = termOffsets[1+termOrd] - termOffset; + + if (DEBUG) { + //System.out.println(" check termOrd=" + termOrd + " term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips) + " i=" + i); + } + + if (termOrd == states[stateUpto].changeOrd) { + if (DEBUG) { + System.out.println(" end push return"); + } + stateUpto--; + termOrd--; + return; + } + + if (termLength == i) { + termOrd++; + skipUpto = 0; + if (DEBUG) { + System.out.println(" term too short; next term"); + } + } else if (label < (termBytes[termOffset+i] & 0xFF)) { + termOrd--; + if (DEBUG) { + System.out.println(" no match; already beyond; return termOrd=" + termOrd); + } + stateUpto -= skipUpto; + assert stateUpto >= 0; + return; + } else if (label == (termBytes[termOffset+i] & 0xFF)) { + if (DEBUG) { + System.out.println(" label[" + i + "] matches"); + } + if (skipUpto < numSkips) { + grow(); + + final int nextState = runAutomaton.step(states[stateUpto].state, label); + + // Automaton is required to accept startTerm: + assert nextState != -1; + + stateUpto++; + states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; + states[stateUpto].state = nextState; + states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; + states[stateUpto].transitionUpto = -1; + states[stateUpto].transitionMax = -1; + //System.out.println(" push " + states[stateUpto].transitions.length + " trans"); + + if (DEBUG) { + System.out.println(" push skip; changeOrd=" + states[stateUpto].changeOrd); + } + + // Match next label at this same term: + continue nextLabel; + } else { + if (DEBUG) { + System.out.println(" linear scan"); + } + // Index exhausted: just scan now (the + // number of scans required will be less + // than the minSkipCount): + final int startTermOrd = termOrd; + while (termOrd < terms.length && compare(termOrd, startTerm) <= 0) { + assert termOrd == startTermOrd || skipOffsets[termOrd] == skipOffsets[termOrd+1]; + termOrd++; + } + assert termOrd - startTermOrd < minSkipCount; + termOrd--; + stateUpto -= skipUpto; + if (DEBUG) { + System.out.println(" end termOrd=" + termOrd); + } + return; + } + } else { + if (skipUpto < numSkips) { + termOrd = skips[skipOffset + skipUpto]; + if (DEBUG) { + System.out.println(" no match; skip to termOrd=" + termOrd); + } + } else { + if (DEBUG) { + System.out.println(" no match; next term"); + } + termOrd++; + } + skipUpto = 0; + } + } + + // startTerm is >= last term so
enum will not + // return any terms: + termOrd--; + if (DEBUG) { + System.out.println(" beyond end; no terms will match"); + } + return; + } + } + + final int termOffset = termOffsets[termOrd]; + final int termLen = termOffsets[1+termOrd] - termOffset; + + if (termOrd >= 0 && !startTerm.equals(new BytesRef(termBytes, termOffset, termLen))) { + stateUpto -= skipUpto; + termOrd--; + } + if (DEBUG) { + System.out.println(" loop end; return termOrd=" + termOrd + " stateUpto=" + stateUpto); + } + } + } + + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + private void grow() { + if (states.length == 1+stateUpto) { + final State[] newStates = new State[states.length+1]; + System.arraycopy(states, 0, newStates, 0, states.length); + newStates[states.length] = new State(); + states = newStates; + } + } + + @Override + public BytesRef next() { + if (DEBUG || DEBUG2) { + System.out.println("\nIE.next"); + } + + termOrd++; + int skipUpto = 0; + + // nocommit + int incCount = 0; + + if (termOrd == 0 && termOffsets[1] == 0) { + // Special-case empty string: + assert stateUpto == 0; + if (DEBUG) { + System.out.println(" visit empty string"); + } + if (runAutomaton.isAccept(states[0].state)) { + scratch.bytes = termBytes; + scratch.offset = 0; + scratch.length = 0; + return scratch; + } + termOrd++; + } + + nextTerm: + + while (true) { + if (DEBUG) { + System.out.println(" cycle termOrd=" + termOrd + " stateUpto=" + stateUpto + " skipUpto=" + skipUpto); + } + if (termOrd == terms.length) { + if (DEBUG) { + System.out.println(" return END"); + } + return null; + } + + final State state = states[stateUpto]; + if (termOrd == state.changeOrd) { + // Pop: + if (DEBUG) { + System.out.println(" pop stateUpto=" + stateUpto); + } + stateUpto--; + if (DEBUG2) { + /* + try { + //System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)).utf8ToString()); + System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length))); + } catch (ArrayIndexOutOfBoundsException aioobe) { + System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length))); + } + */ + } + + continue; + } + + final int termOffset = termOffsets[termOrd]; + final int termLength = termOffsets[termOrd+1] - termOffset; + final int skipOffset = skipOffsets[termOrd]; + final int numSkips = skipOffsets[termOrd+1] - skipOffset; + + if (DEBUG) { + //System.out.println(" term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips)); + } + + assert termOrd < state.changeOrd; + + assert stateUpto <= termLength: "term.length=" + termLength + "; stateUpto=" + stateUpto; + final int label = termBytes[termOffset+stateUpto] & 0xFF; + + while (label > state.transitionMax) { + //System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length); + state.transitionUpto++; + if (state.transitionUpto == state.transitions.length) { + // We've exhausted transitions leaving this + // state; force pop+next/skip now: + //System.out.println("forcepop: stateUpto=" + stateUpto); + if (stateUpto == 0) { + termOrd = terms.length; + return null; + } else { + assert state.changeOrd > termOrd; + if (DEBUG2) { + System.out.println(" jumpend " + (state.changeOrd - termOrd)); + } + //System.out.println(" jump to termOrd=" + 
states[stateUpto].changeOrd + " vs " + termOrd); + termOrd = states[stateUpto].changeOrd; + incCount = 0; + skipUpto = 0; + stateUpto--; + } + continue nextTerm; + } + assert state.transitionUpto < state.transitions.length: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.length; + state.transitionMin = state.transitions[state.transitionUpto].getMin(); + state.transitionMax = state.transitions[state.transitionUpto].getMax(); + assert state.transitionMin >= 0; + assert state.transitionMin <= 255; + assert state.transitionMax >= 0; + assert state.transitionMax <= 255; + } + + if (DEBUG2) { + /* + System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + (char) label + "(" + label + ") term=" + new BytesRef(terms[termOrd].term).utf8ToString() + " trans " + + (char) state.transitionMin + "(" + state.transitionMin + ")" + "-" + (char) state.transitionMax + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips))); + System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + Integer.toHexString(label) + "(" + label + ") term=" + new BytesRef(termBytes, termOffset, termLength) + " trans " + + Integer.toHexString(state.transitionMin) + "(" + state.transitionMin + ")" + "-" + Integer.toHexString(state.transitionMax) + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips))); + */ + } + + final int targetLabel = state.transitionMin; + + if ((termBytes[termOffset+stateUpto] & 0xFF) < targetLabel) { + if (DEBUG2) { + System.out.println(" do bin search"); + } + int startTermOrd = termOrd; + int low = termOrd+1; + int high = state.changeOrd-1; + while (true) { + if (low > high) { + // Label not found + termOrd = low; + if (DEBUG2) { + System.out.println(" advanced by " + (termOrd - startTermOrd)); + } + //System.out.println(" jump " + (termOrd - startTermOrd)); + skipUpto = 0; + continue nextTerm; + } + int mid = (low + high) >>> 1; + int cmp = (termBytes[termOffsets[mid] + stateUpto] & 0xFF) - targetLabel; + if (DEBUG2) { + System.out.println(" bin: check label=" + (char) (termBytes[termOffsets[low] + stateUpto] & 0xFF) + " ord=" + mid); + } + if (cmp < 0) { + low = mid+1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // Label found; walk backwards to first + // occurrence: + while (mid > termOrd && (termBytes[termOffsets[mid-1] + stateUpto] & 0xFF) == targetLabel) { + mid--; + } + termOrd = mid; + if (DEBUG2) { + System.out.println(" advanced by " + (termOrd - startTermOrd)); + } + //System.out.println(" jump " + (termOrd - startTermOrd)); + skipUpto = 0; + continue nextTerm; + } + } + } + + int nextState = runAutomaton.step(states[stateUpto].state, label); + + if (nextState == -1) { + // Skip + if (DEBUG) { + System.out.println(" automaton doesn't accept; skip"); + } + if (skipUpto < numSkips) { + if (DEBUG2) { + System.out.println(" jump " + (skips[skipOffset+skipUpto]-1 - termOrd)); + } + termOrd = skips[skipOffset+skipUpto]; + incCount = 0; + } else { + if (DEBUG2) { + System.out.println(" inc " + (incCount++)); + } + termOrd++; + } + skipUpto = 0; + } else if (skipUpto < numSkips) { + // Push: + if (DEBUG) { + System.out.println(" push"); + } + /* + if (DEBUG2) { + try { + //System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1).utf8ToString()); + System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1)); + } catch 
(ArrayIndexOutOfBoundsException aioobe) { + System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1)); + } + } + */ + + grow(); + stateUpto++; + states[stateUpto].state = nextState; + states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; + // nocommit if transitions.length is 0 don't push...? + states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; + states[stateUpto].transitionUpto = -1; + states[stateUpto].transitionMax = -1; + + if (stateUpto == termLength) { + if (DEBUG) { + System.out.println(" term ends after push"); + } + if (runAutomaton.isAccept(nextState)) { + if (DEBUG) { + System.out.println(" automaton accepts: return"); + } + scratch.bytes = termBytes; + scratch.offset = termOffsets[termOrd]; + scratch.length = termOffsets[1+termOrd] - scratch.offset; + if (DEBUG2) { + System.out.println(" ret " + scratch.utf8ToString()); + } + return scratch; + } else { + if (DEBUG) { + System.out.println(" automaton rejects: nextTerm"); + } + if (DEBUG2) { + System.out.println(" inc " + (incCount++)); + } + termOrd++; + skipUpto = 0; + } + } + } else { + // Run the non-indexed tail of this term: + + // nocommit add assert that we don't inc too + // many times + + // nocommit check common suffix here: + + if (compiledAutomaton.commonSuffixRef != null) { + //System.out.println("suffix " + compiledAutomaton.commonSuffixRef.utf8ToString()); + assert compiledAutomaton.commonSuffixRef.offset == 0; + if (termLength < compiledAutomaton.commonSuffixRef.length) { + termOrd++; + skipUpto = 0; + continue nextTerm; + } + int offset = termOffset + termLength - compiledAutomaton.commonSuffixRef.length; + for(int suffix=0;suffix 0; + return postings[upto]; + } + } else { + while (upto < postings.length) { + freq = postings[upto+1]; + assert freq > 0; + if (liveDocs.get(postings[upto])) { + return postings[upto]; + } + upto += 2 + freq*posMult; + } + } + return NO_MORE_DOCS; + } + + @Override + public int docID() { + // nocommit store docID member? + if (upto < 0) { + return -1; + } else if (upto < postings.length) { + return postings[upto]; + } else { + return NO_MORE_DOCS; + } + } + + @Override + public int freq() { + // nocomit can't I do postings[upto+1]? 
+ return freq; + } + + @Override + public int advance(int target) { + // Linear scan, but this is low-freq term so it won't + // be costly: + while(nextDoc() < target) { + } + return docID(); + } + } + + private final static class LowFreqDocsAndPositionsEnum extends DocsAndPositionsEnum { + private int[] postings; + private final Bits liveDocs; + private final int posMult; + private final boolean hasOffsets; + private final boolean hasPayloads; + private final BytesRef payload = new BytesRef(); + private int upto; + private int docID; + private int freq; + private int skipPositions; + private int startOffset; + private int endOffset; + private int payloadOffset; + private int payloadLength; + + public LowFreqDocsAndPositionsEnum(Bits liveDocs, boolean hasOffsets, boolean hasPayloads) { + this.liveDocs = liveDocs; + this.hasOffsets = hasOffsets; + this.hasPayloads = hasPayloads; + if (hasOffsets) { + if (hasPayloads) { + posMult = 4; + } else { + posMult = 3; + } + } else if (hasPayloads) { + posMult = 2; + } else { + posMult = 1; + } + } + + public DocsAndPositionsEnum reset(int[] postings, byte[] payloadBytes) { + this.postings = postings; + upto = 0; + skipPositions = 0; + startOffset = -1; + endOffset = -1; + docID = -1; + payloadLength = 0; + payload.bytes = payloadBytes; + return this; + } + + @Override + public int nextDoc() { + if (hasPayloads) { + for(int i=0;i 0; + skipPositions--; + final int pos = postings[upto++]; + if (hasOffsets) { + startOffset = postings[upto++]; + endOffset = postings[upto++]; + } + if (hasPayloads) { + payloadLength = postings[upto++]; + payload.offset = payloadOffset; + payloadOffset += payloadLength; + } + return pos; + } + + @Override + public int startOffset() { + return startOffset; + } + + @Override + public int endOffset() { + return endOffset; + } + + @Override + public int advance(int target) { + // Linear scan, but this is low-freq term so it won't + // be costly: + while (nextDoc() < target) { + } + return docID; + } + + @Override + public boolean hasPayload() { + return payloadLength > 0; + } + + @Override + public BytesRef getPayload() { + if (payloadLength > 0) { + payload.length = payloadLength; + payloadLength = 0; + return payload; + } else { + return null; + } + } + } + + // Docs + freqs: + public final static class HighFreqDocsEnum extends DocsEnum { + private int[] docIDs; + private int[] freqs; + private final Bits liveDocs; + private int upto; + private int docID = -1; + + public HighFreqDocsEnum(Bits liveDocs) { + this.liveDocs = liveDocs; + } + + public int[] getDocIDs() { + return docIDs; + } + + public int[] getFreqs() { + return freqs; + } + + public DocsEnum reset(int[] docIDs, int[] freqs) { + this.docIDs = docIDs; + this.freqs = freqs; + upto = -1; + return this; + } + + @Override + public int nextDoc() { + upto++; + if (liveDocs == null) { + try { + return docID = docIDs[upto]; + } catch (ArrayIndexOutOfBoundsException e) { + } + } else { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + return docID = docIDs[upto]; + } + upto++; + } + } + return docID = NO_MORE_DOCS; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int freq() { + return freqs[upto]; + } + + @Override + public int advance(int target) { + /* + upto++; + if (upto == docIDs.length) { + return docID = NO_MORE_DOCS; + } + final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target); + if (index < 0) { + upto = -index - 1; + } else { + upto = index; + } + if (liveDocs != null) { + while (upto < 
docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + if (upto == docIDs.length) { + return NO_MORE_DOCS; + } else { + return docID = docIDs[upto]; + } + */ + + //System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + " of " + docIDs.length); + if (DEBUG) { + System.out.println("advance target=" + target + " len=" + docIDs.length); + } + upto++; + if (upto == docIDs.length) { + return docID = NO_MORE_DOCS; + } + + // First "grow" outwards, since most advances are to + // nearby docs: + int inc = 10; + int nextUpto = upto+10; + int low; + int high; + while (true) { + //System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc); + if (nextUpto >= docIDs.length) { + low = nextUpto-inc; + high = docIDs.length-1; + break; + } + //System.out.println(" docID=" + docIDs[nextUpto]); + + if (target <= docIDs[nextUpto]) { + low = nextUpto-inc; + high = nextUpto; + break; + } + inc *= 2; + nextUpto += inc; + } + + // Now do normal binary search + //System.out.println(" after fwd: low=" + low + " high=" + high); + + while (true) { + + if (low > high) { + // Not exactly found + //System.out.println(" break: no match"); + upto = low; + break; + } + + int mid = (low + high) >>> 1; + int cmp = docIDs[mid] - target; + //System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // Found target + upto = mid; + //System.out.println(" break: match"); + break; + } + } + + //System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? NO_MORE_DOCS : docIDs[upto])); + + if (liveDocs != null) { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + if (upto == docIDs.length) { + //System.out.println(" return END"); + return docID = NO_MORE_DOCS; + } else { + //System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto); + return docID = docIDs[upto]; + } + } + } + + // nocommit specialize offsets and not + public final static class HighFreqDocsAndPositionsEnum extends DocsAndPositionsEnum { + // nocommit maybe append NO_MORE_DOCS on end? + // nocommit make these final...? + private int[] docIDs; + private int[] freqs; + private int[][] positions; + private byte[][][] payloads; + private final Bits liveDocs; + private final boolean hasOffsets; + private final int posJump; + private int upto; + private int docID = -1; + private int posUpto; + private boolean gotPayload; + private int[] curPositions; + + public HighFreqDocsAndPositionsEnum(Bits liveDocs, boolean hasOffsets) { + this.liveDocs = liveDocs; + this.hasOffsets = hasOffsets; + posJump = hasOffsets ? 
3 : 1; + } + + public int[] getDocIDs() { + return docIDs; + } + + public int[][] getPositions() { + return positions; + } + + public int getPosJump() { + return posJump; + } + + public Bits getLiveDocs() { + return liveDocs; + } + + public DocsAndPositionsEnum reset(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads) { + this.docIDs = docIDs; + this.freqs = freqs; + this.positions = positions; + this.payloads = payloads; + upto = -1; + return this; + } + + @Override + public int nextDoc() { + upto++; + if (liveDocs == null) { + if (upto < docIDs.length) { + posUpto = -posJump; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + } else { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + posUpto = -posJump; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + upto++; + } + } + + return docID = NO_MORE_DOCS; + } + + @Override + public int freq() { + return freqs[upto]; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextPosition() { + posUpto += posJump; + gotPayload = false; + return curPositions[posUpto]; + } + + @Override + public int startOffset() { + if (hasOffsets) { + return curPositions[posUpto+1]; + } else { + return -1; + } + } + + @Override + public int endOffset() { + if (hasOffsets) { + return curPositions[posUpto+2]; + } else { + return -1; + } + } + + @Override + public int advance(int target) { + + /* + upto++; + if (upto == docIDs.length) { + return NO_MORE_DOCS; + } + final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target); + if (index < 0) { + upto = -index - 1; + } else { + upto = index; + } + if (liveDocs != null) { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + posUpto = hasOffsets ? -3 : -1; + if (upto == docIDs.length) { + return NO_MORE_DOCS; + } else { + return docID(); + } + */ + + //System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + " of " + docIDs.length); + if (DEBUG) { + System.out.println("advance target=" + target + " len=" + docIDs.length); + } + upto++; + if (upto == docIDs.length) { + return docID = NO_MORE_DOCS; + } + + // First "grow" outwards, since most advances are to + // nearby docs: + int inc = 10; + int nextUpto = upto+10; + int low; + int high; + while (true) { + //System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc); + if (nextUpto >= docIDs.length) { + low = nextUpto-inc; + high = docIDs.length-1; + break; + } + //System.out.println(" docID=" + docIDs[nextUpto]); + + if (target <= docIDs[nextUpto]) { + low = nextUpto-inc; + high = nextUpto; + break; + } + inc *= 2; + nextUpto += inc; + } + + // Now do normal binary search + //System.out.println(" after fwd: low=" + low + " high=" + high); + + while (true) { + + if (low > high) { + // Not exactly found + //System.out.println(" break: no match"); + upto = low; + break; + } + + int mid = (low + high) >>> 1; + int cmp = docIDs[mid] - target; + //System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // Found target + upto = mid; + //System.out.println(" break: match"); + break; + } + } + + //System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? 
NO_MORE_DOCS : docIDs[upto])); + + if (liveDocs != null) { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + if (upto == docIDs.length) { + //System.out.println(" return END"); + return docID = NO_MORE_DOCS; + } else { + //System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto); + posUpto = -posJump; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + } + + @Override + public boolean hasPayload() { + return !gotPayload && payloads != null && payloads[upto][posUpto/(hasOffsets ? 3 : 1)] != null; + } + + private final BytesRef payload = new BytesRef(); + + @Override + public BytesRef getPayload() { + final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)]; + payload.bytes = payloadBytes; + payload.length = payloadBytes.length; + payload.offset = 0; + gotPayload = true; + return payload; + } + } +} Index: lucene/core/src/java/org/apache/lucene/search/TermQuery.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/TermQuery.java (revision 1362238) +++ lucene/core/src/java/org/apache/lucene/search/TermQuery.java (working copy) @@ -20,9 +20,10 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.codecs.memory.DirectPostingsFormat.HighFreqDocsEnum; +import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; @@ -85,7 +86,12 @@ } DocsEnum docs = termsEnum.docs(acceptDocs, null, true); if (docs != null) { - return new TermScorer(this, docs, createDocScorer(context)); + // nocommit + if (false && docs instanceof HighFreqDocsEnum && acceptDocs == null) { + return new DirectTermScorer(this, (HighFreqDocsEnum) docs, createDocScorer(context)); + } else { + return new TermScorer(this, docs, createDocScorer(context)); + } } else { // Index does not store freq info docs = termsEnum.docs(acceptDocs, null, false); Index: lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java (revision 1362238) +++ lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java (working copy) @@ -22,10 +22,11 @@ import java.util.Arrays; import java.util.Set; +import org.apache.lucene.codecs.memory.DirectPostingsFormat.HighFreqDocsAndPositionsEnum; +import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermContext; @@ -255,7 +256,7 @@ // Reuse single TermsEnum below: final TermsEnum te = fieldTerms.iterator(null); - + boolean allDirect = true; for (int i = 0; i < terms.size(); i++) { final Term t = terms.get(i); final TermState state = states[i].get(context.ord); @@ -274,6 +275,8 @@ throw new IllegalStateException("field \"" + t.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.text() + ")"); } postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i).intValue(), t); + 
//System.out.println("PE=" + postingsEnum); + allDirect = allDirect && (postingsEnum instanceof HighFreqDocsAndPositionsEnum); } // sort by increasing docFreq order @@ -282,11 +285,17 @@ } if (slop == 0) { // optimize exact case - ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactSimScorer(stats, context)); - if (s.noDocs) { - return null; + // nocommit + if (false && allDirect) { + return new DirectExactPhraseScorer(this, postingsFreqs, similarity.exactSimScorer(stats, context)); } else { - return s; + //System.out.println("not direct phrase"); + ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactSimScorer(stats, context)); + if (s.noDocs) { + return null; + } else { + return s; + } } } else { return Index: lucene/core/src/java/org/apache/lucene/search/DirectExactPhraseScorer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/DirectExactPhraseScorer.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/search/DirectExactPhraseScorer.java (working copy) @@ -0,0 +1,417 @@ +package org.apache.lucene.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.codecs.memory.DirectPostingsFormat.HighFreqDocsAndPositionsEnum; +import org.apache.lucene.index.*; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.util.Bits; + +final class DirectExactPhraseScorer extends Scorer { + private final int endMinus1; + + private final static int CHUNK = 4096; + + private int gen; + private final int[] counts = new int[CHUNK]; + private final int[] gens = new int[CHUNK]; + + private final static class ChunkState { + final int[] docIDs; + final int[][] positions; + final int offset; + final boolean useAdvance; + final Bits liveDocs; + final int posJump; + int upto = -1; + int[] curPositions; + int posUpto; + int pos; + int lastPos; + int docID = -1; + + public ChunkState(HighFreqDocsAndPositionsEnum posEnum, int offset, boolean useAdvance) { + this.docIDs = posEnum.getDocIDs(); + this.positions = posEnum.getPositions(); + this.offset = offset; + this.liveDocs = posEnum.getLiveDocs(); + this.useAdvance = useAdvance; + this.posJump = posEnum.getPosJump(); + } + + public int nextDoc() { + upto++; + if (liveDocs == null) { + if (upto < docIDs.length) { + posUpto = 0; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + } else { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + posUpto = 0; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + upto++; + } + } + + return docID = NO_MORE_DOCS; + } + + public int advance(int target) { + // NOTE: copied from DirectPostingsFormat + + upto++; + + // First "grow" outwards, since most advances are to + // nearby docs: + int inc = 10; + int nextUpto = upto+10; + int low; + int high; + while (true) { + if (nextUpto >= docIDs.length) { + low = nextUpto-inc; + high = docIDs.length-1; + break; + } + + if (target <= docIDs[nextUpto]) { + low = nextUpto-inc; + high = nextUpto; + break; + } + inc *= 2; + nextUpto += inc; + } + + // Now do normal binary search + + while (true) { + + if (low > high) { + // Not exactly found + //System.out.println(" break: no match"); + upto = low; + break; + } + + int mid = (low + high) >>> 1; + int cmp = docIDs[mid] - target; + //System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // Found target + upto = mid; + //System.out.println(" break: match"); + break; + } + } + + //System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? 
NO_MORE_DOCS : docIDs[upto])); + + if (liveDocs != null) { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + if (upto == docIDs.length) { + //System.out.println(" return END"); + return docID = NO_MORE_DOCS; + } else { + //System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto); + posUpto = 0; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + } + } + + private final ChunkState[] chunkStates; + + private int docID = -1; + private int freq; + + private final Similarity.ExactSimScorer docScorer; + + DirectExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, + Similarity.ExactSimScorer docScorer) throws IOException { + super(weight); + this.docScorer = docScorer; + //System.out.println("use direct exact"); + + chunkStates = new ChunkState[postings.length]; + + endMinus1 = postings.length-1; + + for(int i=0;i<postings.length;i++) { + // Coarse optimization: advance(target) is fairly + // costly, so, if the relative freq of the 2nd + // rarest term is not that much (> 1/5th) rarer than + // the first term, then we just use .nextDoc() when + // ANDing. This buys ~15% gain for phrases where + // freq of rarest 2 terms is close: + final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq; + chunkStates[i] = new ChunkState((HighFreqDocsAndPositionsEnum) postings[i].postings, -postings[i].position, useAdvance); + } + } + + @Override + public int nextDoc() throws IOException { + while(true) { + + // first (rarest) term + final int doc = chunkStates[0].nextDoc(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + return docID = doc; + } + + // not-first terms + int i = 1; + while(i < chunkStates.length) { + final ChunkState cs = chunkStates[i]; + int doc2 = cs.docID; + if (cs.useAdvance) { + if (doc2 < doc) { + doc2 = cs.advance(doc); + } + } else { + int iter = 0; + while (doc2 < doc) { + // safety net -- fallback to .advance if we've + // done too many .nextDocs + if (++iter == 50) { + doc2 = cs.advance(doc); + break; + } else { + doc2 = cs.nextDoc(); + } + } + } + if (doc2 > doc) { + break; + } + i++; + } + + if (i == chunkStates.length) { + // this doc has all the terms -- now test whether + // phrase occurs + docID = doc; + + freq = phraseFreq(); + if (freq != 0) { + return docID; + } + } + } + } + + @Override + public int advance(int target) throws IOException { + + // first term + int doc = chunkStates[0].advance(target); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + return docID = DocIdSetIterator.NO_MORE_DOCS; + } + + while(true) { + + // not-first terms + int i = 1; + while(i < chunkStates.length) { + int doc2 = chunkStates[i].docID; + if (doc2 < doc) { + doc2 = chunkStates[i].advance(doc); + } + if (doc2 > doc) { + break; + } + i++; + } + + if (i == chunkStates.length) { + // this doc has all the terms -- now test whether + // phrase occurs + docID = doc; + freq = phraseFreq(); + if (freq != 0) { + return docID; + } + } + + doc = chunkStates[0].nextDoc(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + return docID = doc; + } + } + } + + @Override + public String toString() { + return "DirectExactPhraseScorer(" + weight + ")"; + } + + @Override + public float freq() { + return freq; + } + + @Override + public int docID() { + return docID; + } + + @Override + public float score() throws IOException { + return docScorer.score(docID, freq); + } + + private int phraseFreq() throws IOException { + + freq = 0; + + // init chunks + for(int i=0;i<chunkStates.length;i++) { + final ChunkState cs = chunkStates[i]; + cs.lastPos = -1; + cs.pos = cs.offset + cs.curPositions[cs.posUpto]; + cs.posUpto += cs.posJump; + } + + int chunkStart = 0; + int chunkEnd = CHUNK; + + boolean end = false; + + while(!end) { + + gen++; + + if (gen == 0) { + // wraparound + Arrays.fill(gens, 0); + gen++; + } + + // first term + { + final ChunkState cs = chunkStates[0]; + while(cs.pos < chunkEnd) { + if (cs.pos > cs.lastPos) { + cs.lastPos = cs.pos; + final int posIndex = cs.pos - chunkStart; + counts[posIndex] = 1; + assert gens[posIndex] != gen; + gens[posIndex] = gen; + } + + if (cs.posUpto == cs.curPositions.length) { + end = true; + break; + } + cs.pos = cs.offset + cs.curPositions[cs.posUpto]; + cs.posUpto += cs.posJump; + } + } + + // middle terms + boolean any = true; + for(int t=1;t<endMinus1;t++) { + final ChunkState cs = chunkStates[t]; + any = false; + while(cs.pos < chunkEnd) { + if (cs.pos > cs.lastPos) { + cs.lastPos = cs.pos; + final int posIndex = cs.pos - chunkStart; + if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) { + // viable + counts[posIndex]++; + any = true; + } + } + + if (cs.posUpto == cs.curPositions.length) { + end = true; + break; + } + cs.pos = cs.offset + cs.curPositions[cs.posUpto]; + cs.posUpto += cs.posJump; + } + + if (!any) { + break; + } + } + + if (!any) { + // petered out for this chunk + chunkStart += CHUNK; + chunkEnd += CHUNK; + continue; + } + + // last term + + { + final ChunkState cs = chunkStates[endMinus1]; + while(cs.pos < chunkEnd) { + if (cs.pos > cs.lastPos) { + cs.lastPos = cs.pos; + final int posIndex = cs.pos - chunkStart; + if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) { + freq++; + } + } + + if (cs.posUpto == cs.curPositions.length) { + end = true; + break; + } + + cs.pos = cs.offset + cs.curPositions[cs.posUpto]; + cs.posUpto += cs.posJump; + } + } + + chunkStart += CHUNK; + chunkEnd += CHUNK; + } + + return freq; + } +} Index: lucene/core/src/java/org/apache/lucene/search/DirectTermScorer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/DirectTermScorer.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/search/DirectTermScorer.java (working copy) @@ -0,0 +1,121 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.codecs.memory.DirectPostingsFormat.HighFreqDocsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.search.similarities.Similarity; + +/** Expert: A Scorer for documents matching a + * Term, when DirectPostingsFormat is used. + */ + +final class DirectTermScorer extends Scorer { + private final int[] docIDs; + private final int[] freqs; + private int docID = -1; + private int upto = -1; + private final Similarity.ExactSimScorer docScorer; + + /** + * Construct a DirectTermScorer. + * + * @param weight + * The weight of the Term in the query. + * @param td + * An iterator over the documents matching the Term. + * @param docScorer + * The Similarity.ExactSimScorer implementation + * to be used for score computations.
+ */ + DirectTermScorer(Weight weight, HighFreqDocsEnum td, Similarity.ExactSimScorer docScorer) throws IOException { + super(weight); + //System.out.println("DTS"); + docIDs = td.getDocIDs(); + freqs = td.getFreqs(); + this.docScorer = docScorer; + } + + @Override + public int docID() { + return docID; + } + + @Override + public float freq() throws IOException { + return freqs[upto]; + } + + /** + * Advances to the next document matching the query.
+ * + * @return the document matching the query or NO_MORE_DOCS if there are no more documents. + */ + @Override + public int nextDoc() throws IOException { + upto++; + if (upto >= docIDs.length) { + docID = NO_MORE_DOCS; + } else { + docID = docIDs[upto]; + } + return docID; + } + + @Override + public float score() throws IOException { + assert docID() != NO_MORE_DOCS; + return docScorer.score(docID, freqs[upto]); + } + + /** + * Advances to the first match beyond the current whose document number is + * greater than or equal to a given target.
The implementation uses a binary search over the in-memory docID array. + * + * @param target + * The target document number. + * @return the matching document or NO_MORE_DOCS if none exist. + */ + @Override + public int advance(int target) throws IOException { + upto++; + if (upto >= docIDs.length) { + return docID = NO_MORE_DOCS; + } + final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target); + if (index < 0) { + upto = -index - 1; + } else { + upto = index; + } + if (upto == docIDs.length) { + return docID = NO_MORE_DOCS; + } else { + return docID = docIDs[upto]; + } + } + + /** Returns a string representation of this DirectTermScorer. */ + @Override + public String toString() { return "scorer(" + weight + ")"; } + +} Index: lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java (revision 1362238) +++ lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java (working copy) @@ -32,6 +32,7 @@ import org.apache.lucene.codecs.lucene40.Lucene40Codec; import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds; +import org.apache.lucene.codecs.memory.DirectPostingsFormat; import org.apache.lucene.codecs.memory.MemoryPostingsFormat; import org.apache.lucene.codecs.mockintblock.MockFixedIntBlockPostingsFormat; import org.apache.lucene.codecs.mockintblock.MockVariableIntBlockPostingsFormat; @@ -87,9 +88,11 @@ // block via CL: int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100); int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100); + int lowFreqCutoff = _TestUtil.nextInt(random, 2, 100); add(avoidCodecs, new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock), + new DirectPostingsFormat(minItemsPerBlock, lowFreqCutoff), new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), // add pulsing again with (usually) different parameters new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
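A note on the advance() implementations in HighFreqDocsEnum, HighFreqDocsAndPositionsEnum and ChunkState above: they all share one technique -- probe forward from the current position in exponentially growing steps (10, 20, 40, ...) until the target docID is bracketed, then binary-search only inside that bracket. This is cheap for the common case of advancing to a nearby doc, and degrades gracefully to a log-time search for long jumps. Below is a minimal standalone sketch of the same technique over a plain sorted int[]; the class and method names are illustrative only and not part of the patch.

import java.util.Arrays;

// Hypothetical demo of the "grow outwards, then binary search" advance
// used by the in-memory enums in this patch; not part of the patch itself.
public class GrowingAdvanceDemo {

  /** Returns the smallest index i >= from such that docIDs[i] >= target,
   *  or docIDs.length if no such doc exists. */
  static int advance(int[] docIDs, int from, int target) {
    int inc = 10;
    int next = from + 10;
    int low, high;
    while (true) {
      if (next >= docIDs.length) {   // ran off the end: bracket is [next-inc, length-1]
        low = next - inc;
        high = docIDs.length - 1;
        break;
      }
      if (target <= docIDs[next]) {  // bracketed: target lies in (next-inc, next]
        low = next - inc;
        high = next;
        break;
      }
      inc *= 2;                      // keep doubling the probe step
      next += inc;
    }
    // Ordinary binary search, but only inside the bracket:
    while (low <= high) {
      int mid = (low + high) >>> 1;
      if (docIDs[mid] < target) {
        low = mid + 1;
      } else if (docIDs[mid] > target) {
        high = mid - 1;
      } else {
        return mid;                  // exact hit
      }
    }
    return low;                      // insertion point: first doc >= target
  }

  public static void main(String[] args) {
    int[] docIDs = {1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144};
    System.out.println(advance(docIDs, 0, 50));   // -> 7 (docID 64)
    System.out.println(advance(docIDs, 0, 64));   // -> 7 (exact hit)
    System.out.println(advance(docIDs, 0, 1000)); // -> 12 (exhausted)
  }
}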