Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/AssertingStage.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/AssertingStage.java	(revision 0)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/AssertingStage.java	(working copy)
@@ -0,0 +1,74 @@
+package org.apache.lucene.analysis.stages;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.ArcAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+class AssertingStage extends Stage {
+  final ArcAttribute arcAtt;
+  final CharTermAttribute termAtt;
+  private int lastFrom;
+  private int lastTo;
+
+  private final Set<Integer> allToNodes = new HashSet<Integer>();
+  private final Set<Integer> allFromNodes = new HashSet<Integer>();
+
+  public AssertingStage(Stage prevStage) {
+    super(prevStage);
+    arcAtt = prevStage.get(ArcAttribute.class);
+    termAtt = prevStage.get(CharTermAttribute.class);
+  }
+
+  @Override
+  public void reset(Reader reader) {
+    super.reset(reader);
+    allToNodes.clear();
+    allToNodes.add(0);
+    allFromNodes.clear();
+  }
+
+  @Override
+  public boolean next() throws IOException {
+    if (prevStage.next()) {
+      int from = arcAtt.from();
+      int to = arcAtt.to();
+
+      if (allToNodes.contains(from) && !allFromNodes.contains(from)) {
+        // OK: from is a "frontier" node (only has arriving
+        // tokens and no leaving tokens yet)
+      } else if (nodes.getCanChange(from)) {
+        // OK: this node was explicitly reserved as still
+        // having changes
+      } else {
+        throw new IllegalStateException("node=" + from + " is frozen, but current token (" + termAtt + ") uses it as from node");
+      }
+
+      allFromNodes.add(from);
+      allToNodes.add(to);
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/AssertingStage.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property

Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/TestStages.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/TestStages.java	(revision 0)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/TestStages.java	(working copy)
@@ -0,0 +1,232 @@
+package org.apache.lucene.analysis.stages;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.tokenattributes.ArcAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +public class TestStages extends BaseTokenStreamTestCase { + + /** Like assertAnalyzesTo, but handles a graph: verifies + * the automaton == the union of the expectedStrings. */ + private void assertMatches(Automaton a, String... paths) { + List subs = new ArrayList(); + for(String path : paths) { + String[] tokens = path.split(" "); + State lastState = new State(); + Automaton sub = new Automaton(lastState); + subs.add(sub); + for(int i=0;i= dataLen) { + offset += dataLen; + if (!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils + dataLen = 0; // so next offset += dataLen won't decrement offset + if (length > 0) { + break; + } else { + // set final offset + // nocommit -- get charfilter working: + //final int finalOffset = correctOffset(offset); + final int finalOffset = offset; + offsetAtt.setOffset(finalOffset, finalOffset); + return false; + } + } + dataLen = ioBuffer.getLength(); + bufferIndex = 0; + } + // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone + final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex); + bufferIndex += Character.charCount(c); + + if (isTokenChar(c)) { // if it's a token char + if (length == 0) { // start of token + start = offset + bufferIndex - 1; + } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds + buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer + } + length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized + if (length >= MAX_WORD_LEN) { // buffer overflow! 
make sure to check for >= surrogate pair could break == test
+          break;
+        }
+      } else if (length > 0) {           // at non-Letter w/ chars
+        break;                           // return 'em
+      }
+    }
+
+    termAtt.setLength(length);
+
+    // nocommit -- get charfilter working:
+    //offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
+    offsetAtt.setOffset(start, start+length);
+
+    int node = nodes.newNode();
+    arcAtt.set(lastNode, node);
+    lastNode = node;
+
+    return true;
+  }
+
+  protected abstract boolean isTokenChar(int c);
+
+  protected int normalize(int c) {
+    return c;
+  }
+}

Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/CharTokenizerStage.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property

Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/NodeTracker.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/NodeTracker.java	(revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/NodeTracker.java	(working copy)
@@ -0,0 +1,101 @@
+package org.apache.lucene.analysis.stages;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+// nocommit can we absorb this into Stage ...?  eg first
+// Stage does this and all subsequent stages hold onto it?
+public class NodeTracker {
+  int nodeUpto;
+
+  public int newNode() {
+    return nodeUpto++;
+  }
+
+  public void reset() {
+    nodeUpto = 0;
+    numMayChangeNodes = 0;
+  }
+
+  /** Used by tests */
+  boolean anyNodesCanChange() {
+    return numMayChangeNodes != 0;
+  }
+
+  private int[] mayChangeNodes = new int[0];
+  private int[] mayChangeRC = new int[0];
+  private int numMayChangeNodes;
+
+  /** A TokenFilter calls this to reserve the right to
+   *  change a past node.  For every call to this method,
+   *  that filter must later call wontChange to "free" the
+   *  reservation.  There is no need to reserve either the
+   *  from or the to node of the last token. */
+  public void mayChange(int node) {
+    // nocommit how to assert that node is "live"?  like you
+    // can't suddenly un-freeze an already frozen node...
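+
+    // mayChangeNodes/mayChangeRC are parallel arrays: a reserved
+    // node id and its reservation count (several filters may
+    // reserve the same node at once); scan for an existing entry
+    // before appending a new one: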
+ for(int i=0;i, WeakReference>> attClassImplMap = + WeakIdentityMap.newConcurrentHashMap(false); + + + /** Which Attributes this stage defines */ + private final Map, AttributeImpl> attImpls = new LinkedHashMap, AttributeImpl>(); + private final Map, AttributeImpl> atts = new LinkedHashMap, AttributeImpl>(); + + protected Stage(Stage prevStage) { + this.prevStage = prevStage; + if (prevStage == null) { + this.nodes = new NodeTracker(); + } else { + this.nodes = prevStage.nodes; + } + } + + /** a cache that stores all interfaces for known implementation classes for performance (slow reflection) */ + private static final WeakIdentityMap,LinkedList>>> knownImplClasses = + WeakIdentityMap.newConcurrentHashMap(false); + + static LinkedList>> getAttributeInterfaces(final Class clazz) { + LinkedList>> foundInterfaces = knownImplClasses.get(clazz); + if (foundInterfaces == null) { + // we have the slight chance that another thread may do the same, but who cares? + foundInterfaces = new LinkedList>>(); + // find all interfaces that this attribute instance implements + // and that extend the Attribute interface + Class actClazz = clazz; + do { + for (Class curInterface : actClazz.getInterfaces()) { + if (curInterface != Attribute.class && Attribute.class.isAssignableFrom(curInterface)) { + foundInterfaces.add(new WeakReference>(curInterface.asSubclass(Attribute.class))); + } + } + actClazz = actClazz.getSuperclass(); + } while (actClazz != null); + knownImplClasses.put(clazz, foundInterfaces); + } + return foundInterfaces; + } + + /** Expert: Adds a custom AttributeImpl instance with one or more Attribute interfaces. + *
<p><b>Please note:</b> It is not guaranteed, that att is added to
+   * the AttributeSource, because the provided attributes may already exist.
+   * You should always retrieve the wanted attributes using {@link #getAttribute} after adding
+   * with this method and cast to your class.
+   * The recommended way to use custom implementations is using an {@link AttributeFactory}.
+   * </p>
+ */ + public final void addAttributeImpl(final AttributeImpl att) { + final Class clazz = att.getClass(); + if (attImpls.containsKey(clazz)) return; + + // add all interfaces of this AttributeImpl to the maps + for (WeakReference> curInterfaceRef : getAttributeInterfaces(clazz)) { + final Class curInterface = curInterfaceRef.get(); + assert (curInterface != null) : + "We have a strong reference on the class holding the interfaces, so they should never get evicted"; + // Attribute is a superclass of this interface + if (!atts.containsKey(curInterface)) { + atts.put(curInterface, att); + attImpls.put(clazz, att); + } + } + } + + private static Class getClassForInterface(Class attClass) { + final WeakReference> ref = attClassImplMap.get(attClass); + Class clazz = (ref == null) ? null : ref.get(); + if (clazz == null) { + // we have the slight chance that another thread may do the same, but who cares? + try { + attClassImplMap.put(attClass, + new WeakReference>( + clazz = Class.forName(attClass.getName() + "Impl", true, attClass.getClassLoader()) + .asSubclass(AttributeImpl.class)) + ); + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException("Could not find implementing class for " + attClass.getName()); + } + } + return clazz; + } + + /** + * The caller must pass in a Class<? extends Attribute> value. + * This method first checks if an instance of that class is + * already in this AttributeSource and returns it. Otherwise a + * new instance is created, added to this AttributeSource and returned. + */ + protected final A create(Class attClass) { + AttributeImpl attImpl = atts.get(attClass); + if (attImpl == null) { + if (!(attClass.isInterface() && Attribute.class.isAssignableFrom(attClass))) { + throw new IllegalArgumentException( + "addAttribute() only accepts an interface that extends Attribute, but " + + attClass.getName() + " does not fulfil this contract." 
+ ); + } + + try { + attImpl = getClassForInterface(attClass).newInstance(); + } catch (InstantiationException e) { + throw new IllegalArgumentException("Could not instantiate implementing class for " + attClass.getName()); + } catch (IllegalAccessException e) { + throw new IllegalArgumentException("Could not instantiate implementing class for " + attClass.getName()); + } + + addAttributeImpl(attImpl); + return attClass.cast(attImpl); + } else { + throw new IllegalArgumentException(attClass + " was already added"); + } + } + + public final A get(Class attClass) { + AttributeImpl attImpl = atts.get(attClass); + if (attImpl == null) { + if (prevStage != null) { + return prevStage.get(attClass); + } else { + //throw new IllegalArgumentException("This AttributeSource does not have the attribute '" + attClass.getName() + "'."); + return null; + } + } + + return attClass.cast(attImpl); + } + + public abstract boolean next() throws IOException; + + public void reset(Reader reader) { + if (prevStage != null) { + prevStage.reset(reader); + } else { + nodes.reset(); + } + } + + public boolean anyNodesCanChange() { + System.out.println(" nodes=" + nodes); + return nodes.anyNodesCanChange(); + } +} Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/Stage.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SynonymFilterStage.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SynonymFilterStage.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SynonymFilterStage.java (working copy) @@ -0,0 +1,333 @@ +package org.apache.lucene.analysis.stages; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.tokenattributes.ArcAttribute; +import org.apache.lucene.analysis.tokenattributes.ArcAttributeImpl; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl; +import org.apache.lucene.analysis.util.CharacterUtils; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.FST; + +// nocommit does not do keepOrig + +// nocommit should we ... allow recursing on ourselves? ie +// so sometimes an output could be parsed against an input +// rule? + +/** Synonym filter, that improves on existing one: it can + * handle any graph input, and it can create new positions + * (e.g., dns -> domain name service). */ + +public class SynonymFilterStage extends Stage { + + // We change the term: + private final CharTermAttribute termAttIn; + private final CharTermAttribute termAttOut; + + // We change the to/from: + private final ArcAttribute arcAttIn; + private final ArcAttribute arcAttOut; + + // We set offsets: + private final OffsetAttribute offsetAttIn; + private final OffsetAttribute offsetAttOut; + + // Used only inside addMatch + private final BytesRef scratchBytes = new BytesRef(); + private final CharsRef scratchChars = new CharsRef(); + + private final LinkedList pendingOutputs = new LinkedList(); + + private final List pending = new ArrayList(); + + private final SynonymMap synonyms; + + private final FST.Arc scratchArc; + + private final FST fst; + + private final boolean ignoreCase; + + private final FST.BytesReader fstReader; + + /** Used to decode outputs */ + private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + private static class PartialMatch { + final int fromNode; + final int toNode; + final int startOffset; + final FST.Arc fstNode; + final BytesRef output; + + public PartialMatch(int startOffset, int fromNode, int toNode, FST.Arc fstNode, BytesRef output) { + this.startOffset = startOffset; + this.fromNode = fromNode; + this.toNode = toNode; + this.fstNode = fstNode; + this.output = output; + } + } + + private static class OutputToken { + final String text; + final int startOffset; + final int endOffset; + final int fromNode; + final int toNode; + + public OutputToken(String text, int startOffset, int endOffset, int fromNode, int toNode) { + this.text = text; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.fromNode = fromNode; + this.toNode = toNode; + } + } + + public SynonymFilterStage(Stage prevStage, SynonymMap synonyms, boolean ignoreCase) { + super(prevStage); + termAttIn = prevStage.get(CharTermAttribute.class); + termAttOut = create(CharTermAttribute.class); + arcAttIn = prevStage.get(ArcAttribute.class); + arcAttOut = create(ArcAttribute.class); + offsetAttIn = prevStage.get(OffsetAttribute.class); + offsetAttOut = create(OffsetAttribute.class); + this.synonyms = synonyms; + this.ignoreCase = ignoreCase; + fst = synonyms.fst; + fstReader = fst.getBytesReader(); + if (fst == null) { + 
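+      // nocommit this null check comes after fst was already
+      // dereferenced via fst.getBytesReader() above; it should be
+      // hoisted before that call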
throw new IllegalArgumentException("fst must be non-null"); + } + scratchArc = new FST.Arc(); + } + + // nocommit we need reset! make test that fails if only + // partial consume + + + /** Extends all pending paths, and starts new paths, + * matching the current token. */ + private void matchOne(PartialMatch match) throws IOException { + BytesRef output; + if (match != null) { + scratchArc.copyFrom(match.fstNode); + output = match.output; + } else { + // Start a new match here: + fst.getFirstArc(scratchArc); + output = fst.outputs.getNoOutput(); + } + + char[] buffer = termAttIn.buffer(); + int bufferLen = termAttIn.length(); + int bufUpto = 0; + while(bufUpto < bufferLen) { + final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); + if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { + return; + } + + // Accum the output + output = fst.outputs.add(output, scratchArc.output); + bufUpto += Character.charCount(codePoint); + } + + // Entire token matched; now see if this is a final + // state: + if (scratchArc.isFinal()) { + // A match! + addMatch(match, fst.outputs.add(output, scratchArc.nextFinalOutput)); + } + + // See if the FST wants to continue matching (ie, needs to + // see the next input token): + if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) != null) { + // More matching is possible -- accum the output (if + // any) of the WORD_SEP arc and add a new + // PartialMatch: + int startOffset; + int fromNode; + if (match == null) { + startOffset = offsetAttIn.startOffset(); + fromNode = arcAttIn.from(); + } else { + startOffset = match.startOffset; + fromNode = match.fromNode; + } + + // Reserve the right to use this node as a future fromNode: + nodes.mayChange(fromNode); + //System.out.println(" incr mayChange node=" + fromNode); + //System.out.println(" add pending to node=" + arcAttIn.to()); + pending.add(new PartialMatch(startOffset, + fromNode, + arcAttIn.to(), + new FST.Arc().copyFrom(scratchArc), + fst.outputs.add(output, scratchArc.output))); + } + } + + /** Records a full match; on the next next() we will feed + * output tokens from it. */ + private void addMatch(PartialMatch match, BytesRef bytes) { + //System.out.println(" add full match!"); + + bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); + + final int code = bytesReader.readVInt(); + final boolean keepOrig = (code & 0x1) == 0; + if (!keepOrig) { + throw new IllegalArgumentException("this SynonymFilter requires keepOrig = true"); + } + + final int count = code >>> 1; + + for (int outputIDX=0;outputIDX 0: "output contains empty string: " + scratchChars; + + int toNode; + if (chIDX == scratchChars.length) { + toNode = arcAttIn.to(); + } else { + toNode = nodes.newNode(); + lastNode = toNode; + lastStart = 1+chIDX; + } + + // These offsets make sense for "domain name service -> + // DNS", but for "DNS -> domain name service" it's a + // little weird because each of the 3 tokens will have + // the same offsets ... but I don't see how we can do + // any better? + //System.out.println(" add output=" + new String(scratchChars.chars, lastStart, outputLen)); + pendingOutputs.add(new OutputToken(new String(scratchChars.chars, lastStart, outputLen), + match.startOffset, offsetAttIn.endOffset(), + lastNode, toNode)); + } + } + } + } + + /** Update all matches for the current token. 
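+   *  First extends every pending PartialMatch with this token,
+   *  then tries to start a fresh match beginning at this token
+   *  (matchOne with a null PartialMatch).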
*/ + private void match() throws IOException { + int from = arcAttIn.from(); + + // First extend any existing partial matches: + int end = pending.size(); + for(int i=0;i { + int position; + final List leaving = new ArrayList(); + + public int compareTo(Node other) { + // No tie break ... I think that's OK? + return position - other.position; + } + } + + private static class Arc { + final Node to; + final String term; + final int startOffset, endOffset; + final boolean deleted; + + public Arc(Node to, String term, int startOffset, int endOffset, boolean deleted) { + this.to = to; + this.term = term; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.deleted = deleted; + } + } + + public StageToTokenStream(Stage prevStage) { + this.prevStage = prevStage; + termAttIn = prevStage.get(CharTermAttribute.class); + termAttOut = addAttribute(CharTermAttribute.class); + posIncAttOut = addAttribute(PositionIncrementAttribute.class); + offsetAttIn = prevStage.get(OffsetAttribute.class); + offsetAttOut = addAttribute(OffsetAttribute.class); + arcAttIn = prevStage.get(ArcAttribute.class); + delAtt = prevStage.get(DeletedAttribute.class); + } + + private Node getNode(Map nodes, int node) { + Node n = nodes.get(node); + if (n == null) { + n = new Node(); + nodes.put(node, n); + frontierNodeCount++; + } + return n; + } + + private void saveToken(Map nodes) { + Node from = nodes.get(arcAttIn.from()); + Node to = getNode(nodes, arcAttIn.to()); + to.position = Math.max(to.position, 1+from.position); + if (from.leaving.isEmpty()) { + frontierNodeCount--; + assert frontierNodeCount >= 0; + } + from.leaving.add(new Arc(to, termAttIn.toString(), offsetAttIn.startOffset(), offsetAttIn.endOffset(), delAtt != null && delAtt.deleted())); + } + + private boolean nextSavedToken() { + while(pendingNodes != null) { + // restore state from pending node/arc: + Node node = pendingNodes[nodeUpto]; + if (node.leaving.isEmpty()) { + assert nodeUpto == pendingNodes.length-1; + pendingPosInc = node.position - lastPosition - 1; + assert pendingPosInc >= 0; + System.out.println(" break: posInc=" + pendingPosInc + " lastPos=" + lastPosition + " vs node.pos=" + node.position); + break; + } + + Arc arc = node.leaving.get(arcUpto); + arcUpto++; + if (arcUpto == node.leaving.size()) { + nodeUpto++; + if (nodeUpto == pendingNodes.length) { + pendingPosInc = node.position - lastPosition; + pendingNodes = null; + } else { + arcUpto = 0; + } + } + + if (!arc.deleted) { + termAttOut.setEmpty(); + termAttOut.append(arc.term); + offsetAttOut.setOffset(arc.startOffset, arc.endOffset); + posIncAttOut.setPositionIncrement(node.position - lastPosition); + System.out.println(" set posInc=" + (node.position - lastPosition)); + // TODO: it'd be trivial to also set PosLengthAtt + // ... 
but since indexer is immediately after us, and + // indexer ignores pos len, there's no point today + //posLenAttOut.setPositionLength(arc.to.position - node.position); + pendingPosInc = 0; + lastPosition = node.position; + System.out.println(" set lastPos=" + lastPosition); + System.out.println(" return token=" + termAttOut); + return true; + } else { + System.out.println(" skip deleted token"); + } + } + + return false; + } + + // nocommit this can falsely join two clumps into one, eg + // two back-to-back synonyms + + @Override + public final boolean incrementToken() throws IOException { + System.out.println("STS.inc"); + + // This is pointless (we always set all of the attrs we + // export), but tests disagree: + clearAttributes(); + + if (pendingNodes != null) { + // Still iterating through buffered tokens from last + // clump: + if (nextSavedToken()) { + System.out.println(" buffered: " + termAttOut); + return true; + } + System.out.println(" buffered fall through"); + // We can fall through to here, eg if the last + // buffered token(s) were deleted (holes) + } + + if (prevStage.next()) { + if (prevStage.nodes.anyNodesCanChange()) { + System.out.println(" now buffer: " + termAttIn); + Map nodes = new HashMap(); + nodes.put(arcAttIn.from(), new Node()); + frontierNodeCount = 1; + + // Buffer up this "clump" of overlapping tokens + // until it un-clumps itself: + saveToken(nodes); + while (true) { + boolean result = prevStage.next(); + // So long as there are still nodes that can + // change, there must be more tokens (hmm is this + // really true...): + assert result: "Stage.next ended without freezing all nodes"; + saveToken(nodes); + System.out.println(" buffer again: " + termAttIn + "; " + prevStage.anyNodesCanChange() + " " + frontierNodeCount); + if (!prevStage.anyNodesCanChange() && frontierNodeCount == 1) { + System.out.println(" clump done"); + break; + } + } + + // Sort all nodes by position: + pendingNodes = nodes.values().toArray(new Node[nodes.size()]); + ArrayUtil.timSort(pendingNodes); + for(Node node : pendingNodes) { + System.out.println(" node pos=" + node.position + " " + node.leaving.size() + " leaving"); + for(Arc arc : node.leaving) { + System.out.println(" " + arc.term + " to pos=" + arc.to.position); + } + } + nodeUpto = 0; + arcUpto = 0; + lastPosition = -1; + return nextSavedToken(); + + } else { + System.out.println(" pass through"); + // Fast path (pass through): no buffering necessary: + termAttOut.setEmpty(); + termAttOut.append(termAttIn); + offsetAttOut.setOffset(offsetAttIn.startOffset(), + offsetAttIn.endOffset()); + posIncAttOut.setPositionIncrement(1 + pendingPosInc); + pendingPosInc = 0; + return true; + } + } else { + finalEndOffset = offsetAttIn.endOffset(); + } + return false; + } + + @Override + public void end() { + offsetAttOut.setOffset(finalEndOffset, finalEndOffset); + } + + @Override + public void reset() throws IOException { + pendingNodes = null; + } +} Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/StageToTokenStream.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/AutomatonStage.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/AutomatonStage.java (revision 0) +++ 
lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/AutomatonStage.java	(working copy)
@@ -0,0 +1,125 @@
+package org.apache.lucene.analysis.stages;
+
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.ArcAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+/** Pass-through stage that builds an Automaton from the
+ *  input tokens it sees. */
+
+public class AutomatonStage extends Stage {
+
+  /** We create transition between two adjacent tokens. */
+  public static final int POS_SEP = 256;
+
+  /** We add this arc to represent a hole. */
+  public static final int HOLE = 257;
+
+  private Automaton automaton;
+  private Map<Integer,State> toStates;
+  private Map<Integer,State> fromStates;
+  private final ArcAttribute arcAtt;
+  private final TermToBytesRefAttribute termAtt;
+
+  public AutomatonStage(Stage prevStage) {
+    super(prevStage);
+    arcAtt = prevStage.get(ArcAttribute.class);
+    termAtt = prevStage.get(TermToBytesRefAttribute.class);
+  }
+
+  @Override
+  public void reset(Reader reader) {
+    super.reset(reader);
+    toStates = new HashMap<Integer,State>();
+    fromStates = new HashMap<Integer,State>();
+    State initial = new State();
+    // Node 0 is always the start state:
+    fromStates.put(0, initial);
+    automaton = new Automaton(initial);
+  }
+
+  public Automaton getAutomaton() {
+    return automaton;
+  }
+
+  private State getToState(int number) {
+    State state = toStates.get(number);
+    if (state == null) {
+      state = new State();
+      toStates.put(number, state);
+      State fromState = fromStates.get(number);
+      if (fromState != null) {
+        state.addTransition(new Transition(POS_SEP, fromState));
+      }
+    }
+    return state;
+  }
+
+  private State getFromState(int number) {
+    State state = fromStates.get(number);
+    if (state == null) {
+      state = new State();
+      fromStates.put(number, state);
+      State toState = toStates.get(number);
+      if (toState != null) {
+        toState.addTransition(new Transition(POS_SEP, state));
+      }
+    }
+    return state;
+  }
+
+  @Override
+  public boolean next() throws IOException {
+    if (prevStage.next()) {
+      BytesRef term = termAtt.getBytesRef();
+      termAtt.fillBytesRef();
+      State lastState = getFromState(arcAtt.from());
+      for(int i=0;i
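
A minimal sketch of how these stages are meant to compose, for reference
(not part of the patch; the anonymous CharTokenizerStage subclass, its
no-arg constructor, and the prebuilt synonymMap are assumptions, since
those details are not shown above, and the calling context is assumed to
declare throws IOException):

  // Chain: tokenize, inject synonyms, sanity-check the node
  // invariants, and capture the token graph as an Automaton.
  Stage stage = new CharTokenizerStage() {
      @Override
      protected boolean isTokenChar(int c) {
        // token chars are anything that is not whitespace
        return !Character.isWhitespace(c);
      }
    };
  stage = new SynonymFilterStage(stage, synonymMap, true);
  AutomatonStage automatonStage = new AutomatonStage(new AssertingStage(stage));

  automatonStage.reset(new StringReader("dns is down"));
  while (automatonStage.next()) {
    // each next() emits one arc (token) of the graph; the term text
    // and from/to nodes are exposed via CharTermAttribute and
    // ArcAttribute obtained from the stage with get(...)
  }
  Automaton graph = automatonStage.getAutomaton();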