package org.apache.lucene.analysis.stages;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
import org.apache.lucene.analysis.stages.attributes.TermAttribute;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

/**
 * Pass-through stage that builds an {@link Automaton} from the input tokens
 * it sees: each token becomes a chain of char-labeled transitions, and
 * adjacent tokens are joined by a {@link #POS_SEP} transition.  Call
 * {@link #getAutomaton} after {@link #next} has returned false.
 */
public class AutomatonStage extends Stage {

  /** Transition label inserted between two adjacent tokens. */
  public static final int POS_SEP = 256;

  /** Transition label we add to represent a hole. */
  public static final int HOLE = 257;

  private Automaton.Builder builder;
  private Automaton automaton;

  private final ArcAttribute arcAtt;
  private final TermAttribute termAtt;

  // Map from input-graph node id to the automaton state where tokens
  // leave (fromStates) or arrive (toStates) that node:
  private HashMap<Integer,Integer> fromStates;
  private HashMap<Integer,Integer> toStates;

  // "to" states that already have a POS_SEP arc leaving them; any "to"
  // state not in here when the stream is exhausted becomes an accept state:
  private Set<Integer> hasTransition;

  public AutomatonStage(Stage prevStage) {
    super(prevStage);
    builder = new Automaton.Builder();
    fromStates = new HashMap<>();
    toStates = new HashMap<>();
    hasTransition = new HashSet<>();
    arcAtt = prevStage.get(ArcAttribute.class);
    termAtt = prevStage.get(TermAttribute.class);
  }

  @Override
  public void reset(Reader reader) {
    super.reset(reader);
    automaton = null;
    builder = new Automaton.Builder();
    fromStates = new HashMap<>();
    toStates = new HashMap<>();
    hasTransition = new HashSet<>();
  }

  /** Returns the automaton; only valid after {@link #next} has returned false. */
  public Automaton getAutomaton() {
    return automaton;
  }

  /**
   * Returns (creating if needed) the automaton state where arcs arriving at
   * input node {@code number} end, linking it to the matching "from" state
   * with a POS_SEP transition when that state already exists.
   */
  private Integer getToState(int number) {
    Integer toState = toStates.get(number);
    if (toState == null) {
      toState = builder.createState();
      toStates.put(number, toState);
      Integer fromState = fromStates.get(number);
      if (fromState != null) {
        builder.addTransition(toState, fromState, POS_SEP);
        hasTransition.add(toState);
      }
    }
    return toState;
  }

  /** Mirror of {@link #getToState} for the state where arcs leave node {@code number}. */
  private Integer getFromState(int number) {
    Integer fromState = fromStates.get(number);
    if (fromState == null) {
      fromState = builder.createState();
      fromStates.put(number, fromState);
      Integer toState = toStates.get(number);
      if (toState != null) {
        builder.addTransition(toState, fromState, POS_SEP);
        hasTransition.add(toState);
      }
    }
    return fromState;
  }

  @Override
  public boolean next() throws IOException {
    if (prevStage.next()) {
      String term = termAtt.get();
      if (term.length() == 0) {
        throw new IllegalStateException("cannot handle empty-string term");
      }

      // One transition per char of the term, from the token's from node to
      // its to node, with anonymous intermediate states:
      Integer lastState = getFromState(arcAtt.from());
      for (int i = 0; i < term.length(); i++) {
        Integer toState;
        if (i == term.length() - 1) {
          toState = getToState(arcAtt.to());
        } else {
          toState = builder.createState();
        }

        builder.addTransition(lastState, toState, term.charAt(i));
        lastState = toState;
      }
      return true;
    } else {
      // Assume any to state w/ no transitions is final:
      for (Integer toState : toStates.values()) {
        if (!hasTransition.contains(toState)) {
          builder.setAccept(toState, true);
        }
      }

      automaton = Operations.removeDeadStates(Operations.determinize(builder.finish(), DEFAULT_MAX_DETERMINIZED_STATES));
      return false;
    }
  }
}
package org.apache.lucene.analysis.stages;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
import org.apache.lucene.analysis.stages.attributes.OffsetAttribute;
import org.apache.lucene.analysis.stages.attributes.TermAttribute;
import org.apache.lucene.util.ArrayUtil;

/**
 * Stage version of CharTokenizer: breaks the input Reader into tokens at
 * characters for which {@link #isTokenChar} returns false, normalizing
 * each kept code point via {@link #normalize}.
 */
public abstract class CharTokenizerStage extends Stage {
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 4096;

  private Reader input;
  private final TermAttribute termAtt;
  private final OffsetAttribute offsetAtt;
  private final ArcAttribute arcAtt;

  private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);

  // Where we are in the current chunk we are working on:
  private int bufferIndex;

  // Net char offset so far:
  private int offset;

  // How many chars currently in the "chunk" we are working on:
  private int dataLen;

  // Graph node the next token's arc leaves from:
  private int lastNode;

  private char[] buffer = new char[10];

  public CharTokenizerStage() {
    super(null);
    termAtt = create(TermAttribute.class);
    offsetAtt = create(OffsetAttribute.class);
    arcAtt = create(ArcAttribute.class);
  }

  @Override
  public void reset(Reader input) {
    super.reset(input);
    this.input = input;
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
    ioBuffer.reset();
    lastNode = nodes.newNode();
  }

  @Override
  public boolean next() throws IOException {
    int length = 0;  // chars buffered for the current token (after normalize)
    int start = -1;  // token's start offset in the original input
    int end = -1;    // token's end offset (exclusive) in the original input
    while (true) {
      if (bufferIndex >= dataLen) {
        offset += dataLen;
        CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
        if (ioBuffer.getLength() == 0) {
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
            break;
          } else {
            // set final offset
            final int finalOffset = correctOffset(offset);
            offsetAtt.setOffset(finalOffset, finalOffset);
            return false;
          }
        }
        dataLen = ioBuffer.getLength();
        bufferIndex = 0;
      }
      final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
      final int charCount = Character.charCount(c);
      bufferIndex += charCount;

      if (isTokenChar(c)) { // if it's a token char
        if (length == 0) { // start of token
          assert start == -1;
          start = offset + bufferIndex - charCount;
          end = start;
        } else if (length >= buffer.length - 1) { // check if a supplementary could run out of bounds
          buffer = ArrayUtil.grow(buffer, 2 + length); // make sure a supplementary fits in the buffer
        }
        end += charCount;
        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
        if (length >= MAX_WORD_LEN) { // buffer overflow! make sure to check for >= surrogate pair could break == test
          break;
        }
      } else if (length > 0) { // at non-Letter w/ chars
        break; // return 'em
      }
    }

    termAtt.set(new String(buffer, 0, length));

    // Use the tracked end offset, not start+length: normalize() may change
    // how many chars a code point contributes, so start+length can disagree
    // with the token's true extent in the original input.
    offsetAtt.setOffset(correctOffset(start), correctOffset(end));

    int node = nodes.newNode();
    arcAtt.set(lastNode, node);
    lastNode = node;

    return true;
  }

  /** Returns true if this code point belongs inside a token. */
  protected abstract boolean isTokenChar(int c);

  /** Hook to normalize a kept code point (e.g. lowercase); default is identity. */
  protected int normalize(int c) {
    return c;
  }
}
package org.apache.lucene.analysis.stages;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
import org.apache.lucene.analysis.stages.attributes.DeletedAttribute;
import org.apache.lucene.analysis.stages.attributes.OffsetAttribute;
import org.apache.lucene.analysis.stages.attributes.TermAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.FixedBitSet;

/**
 * Uses a CharFilter to detect when punctuation occurs in the
 * input in between two tokens, and then as a Stage it will
 * re-insert [deleted] tokens when it notices the tokenizer
 * had deleted the punctuation.  E.g. this can be used to
 * prevent synonyms/phrases from matching across punctuation.
 */
public class InsertDeletedPunctuationStage extends Stage {

  private final DeletedAttribute delAttIn;
  private final ArcAttribute arcAttIn;
  private final TermAttribute termAttIn;
  private final OffsetAttribute offsetAttIn;

  private final ArcAttribute arcAttOut;
  private final DeletedAttribute delAttOut;
  private final TermAttribute termAttOut;
  private final OffsetAttribute offsetAttOut;

  // Term text used for each inserted punctuation token:
  private final String punctToken;

  public InsertDeletedPunctuationStage(Stage prevStage, String punctToken) {
    super(prevStage);
    this.punctToken = punctToken;

    delAttIn = prevStage.get(DeletedAttribute.class);
    offsetAttIn = prevStage.get(OffsetAttribute.class);
    arcAttIn = prevStage.get(ArcAttribute.class);
    termAttIn = prevStage.get(TermAttribute.class);

    delAttOut = create(DeletedAttribute.class);
    offsetAttOut = create(OffsetAttribute.class);
    arcAttOut = create(ArcAttribute.class);
    termAttOut = create(TermAttribute.class);
  }

  /** Pass-through CharFilter that records which character offsets held punctuation. */
  private static class FindPunctuationCharFilter extends CharFilter {
    FixedBitSet wasPunct = new FixedBitSet(128);
    private int pos;

    public FindPunctuationCharFilter(Reader input) {
      super(input);
    }

    @Override
    protected int correct(int offset) {
      // We never alter the character stream, only observe it:
      return offset;
    }

    @Override
    public int read(char[] buffer, int offset, int length) throws IOException {
      int count = input.read(buffer, offset, length);
      for (int i = 0; i < count; i++) {
        if (isPunct(buffer[offset + i])) {
          if (wasPunct.length() <= pos) {
            // Grow only when punctuation is actually seen at a new offset;
            // bits past the old length are implicitly false:
            int nextSize = ArrayUtil.oversize(pos + 1, 1);
            FixedBitSet nextBits = new FixedBitSet(nextSize);
            nextBits.or(wasPunct);
            wasPunct = nextBits;
          }
          wasPunct.set(pos);
        }
        pos++;
      }

      return count;
    }

    protected boolean isPunct(char ch) {
      // TODO: use proper Character.isXXX apis:
      return ch == '.' || ch == ',' || ch == ':' || ch == ';';
    }
  }

  private FindPunctuationCharFilter charFilter;

  // True when the token just pulled from prevStage is buffered because we
  // returned an inserted punctuation token ahead of it:
  private boolean lastPunct;
  private int lastEndOffset;

  // How far node ids have been shifted to make room for inserted nodes:
  private int nodeOffset;

  @Override
  public void reset(Reader input) {
    // nocommit this is iffy?  if an earlier stage also
    // wraps, then, we are different offsets
    charFilter = new FindPunctuationCharFilter(input);
    super.reset(charFilter);
    lastEndOffset = 0;
    lastPunct = false;
    nodeOffset = 0;
  }

  @Override
  public boolean next() throws IOException {
    if (lastPunct) {
      // Return previously buffered token:
      copyToken();
      lastPunct = false;
      return true;
    }

    if (prevStage.next()) {
      int startOffset = offsetAttIn.startOffset();
      // wasPunct only grows when punctuation is seen, so for punct-free input
      // startOffset can exceed its length; offsets past the length are
      // implicitly non-punct, so clamp instead of reading out of bounds:
      int limit = Math.min(startOffset, charFilter.wasPunct.length());
      for (int i = lastEndOffset; i < limit; i++) {
        if (charFilter.wasPunct.get(i)) {
          // The gap between the end of the last token,
          // and this token, had punctuation:
          lastPunct = true;
          break;
        }
      }

      if (lastPunct) {
        // We insert a new node and token here:

        // nocommit this (single int nodeOffset) is too simplistic?
        arcAttOut.set(arcAttIn.from() + nodeOffset, arcAttIn.from() + nodeOffset + 1);
        delAttOut.set(true);
        offsetAttOut.setOffset(lastEndOffset, startOffset);
        // nocommit: should we copy over the actual punct chars...?
        termAttOut.set(punctToken);
        nodeOffset++;
      } else {
        copyToken();
      }
      lastEndOffset = offsetAttIn.endOffset();
      return true;
    } else {
      return false;
    }
  }

  /** Copies the current input token's attributes to our outputs, shifting node ids. */
  private void copyToken() {
    if (delAttIn != null) {
      delAttOut.set(delAttIn.deleted());
    } else {
      delAttOut.set(false);
    }
    termAttOut.set(termAttIn.get());
    offsetAttOut.setOffset(offsetAttIn.startOffset(), offsetAttIn.endOffset());
    arcAttOut.set(arcAttIn.from() + nodeOffset, arcAttIn.to() + nodeOffset);
  }
}
package org.apache.lucene.analysis.stages;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.stages.attributes.TermAttribute;
import org.apache.lucene.util.Version;

/** Stage that lowercases each incoming term, one code point at a time. */
public class LowerCaseFilterStage extends Stage {
  private final TermAttribute termAttOut;
  private final TermAttribute termAttIn;

  public LowerCaseFilterStage(Version version, Stage prevStage) {
    super(prevStage);
    termAttIn = prevStage.get(TermAttribute.class);
    termAttOut = create(TermAttribute.class);
  }

  @Override
  public final boolean next() throws IOException {
    if (prevStage.next() == false) {
      return false;
    }

    final String term = termAttIn.get();
    final int length = term.length();
    final char[] lowered = new char[length];

    // nocommit correct?  Assumes lowercasing never changes how many chars a
    // code point occupies, so the result fits in exactly `length` chars:
    int i = 0;
    while (i < length) {
      final int lowerCased = Character.toLowerCase(Character.codePointAt(term, i));
      i += Character.toChars(lowerCased, lowered, i);
    }

    termAttOut.set(new String(lowered));
    return true;
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.ArrayUtil; + +// nocommit better name... NodeFreezer? + +// nocommit can we absorb this into Stage ...? eg first +// Stage does this and all subseqeuent stages hold onto it? +public class NodeTracker { + int nodeUpto; + + public int newNode() { + return nodeUpto++; + } + + public void reset() { + nodeUpto = 0; + numMayChangeNodes = 0; + } + + /** + * Used by tests + */ + boolean anyNodesCanChange() { + return numMayChangeNodes != 0; + } + + private int[] mayChangeNodes = new int[0]; + private int[] mayChangeRC = new int[0]; + private int numMayChangeNodes; + + /** + * A TokenFilter calls this to reserve the right to + * change a past node. For every call to this method, + * that filter must later call wontChange to "free" the + * reservation. There is no need to reserve either the + * from or the to node of the last token. + */ + public void mayChange(int node) { + // nocommit how to assert that node is "live"? like you + // can't suddenly un-freeze an already frozen node... 
+ for (int i = 0; i < numMayChangeNodes; i++) { + if (mayChangeNodes[i] == node) { + mayChangeRC[i]++; + return; + } + } + + if (numMayChangeNodes == mayChangeNodes.length) { + mayChangeNodes = ArrayUtil.grow(mayChangeNodes, 1 + numMayChangeNodes); + mayChangeRC = ArrayUtil.grow(mayChangeRC, 1 + numMayChangeNodes); + } + mayChangeNodes[numMayChangeNodes] = node; + mayChangeRC[numMayChangeNodes] = 1; + numMayChangeNodes++; + } + + public void wontChange(int node) { + for (int i = 0; i < numMayChangeNodes; i++) { + if (mayChangeNodes[i] == node) { + mayChangeRC[i]--; + if (mayChangeRC[i] == 0) { + for (int j = i + 1; j < numMayChangeNodes; j++) { + mayChangeNodes[j - 1] = mayChangeNodes[j]; + mayChangeRC[j - 1] = mayChangeRC[j]; + } + numMayChangeNodes--; + } + return; + } + } + + throw new IllegalStateException("extra call to wontChange(" + node + ") vs mayChange(" + node + ")"); + } + + public int getLastNode() { + return nodeUpto - 1; + } + + public boolean getCanChange(int node) { + for (int i = 0; i < numMayChangeNodes; i++) { + if (node == mayChangeNodes[i]) { + return true; + } + } + + return false; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SplitOnDashFilterStage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SplitOnDashFilterStage.java new file mode 100644 index 0000000..1b8b054 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SplitOnDashFilterStage.java @@ -0,0 +1,108 @@ +package org.apache.lucene.analysis.stages; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
package org.apache.lucene.analysis.stages;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
import org.apache.lucene.analysis.stages.attributes.TermAttribute;

/**
 * Simple example of decompounder-as-filter, just dividing
 * a word at its dashes and keeping the original.
 */
public class SplitOnDashFilterStage extends Stage {

  // We change the term:
  private final TermAttribute termAttIn;
  private final TermAttribute termAttOut;

  // We change the to/from:
  private final ArcAttribute arcAttIn;
  private final ArcAttribute arcAttOut;

  // Non-null while we are still emitting the parts of the last token seen:
  private String[] parts;
  private int nextPart;

  public SplitOnDashFilterStage(Stage prevStage) {
    super(prevStage);
    termAttIn = prevStage.get(TermAttribute.class);
    termAttOut = create(TermAttribute.class);
    arcAttIn = prevStage.get(ArcAttribute.class);
    arcAttOut = create(ArcAttribute.class);
  }

  @Override
  public void reset(Reader reader) {
    super.reset(reader);
    parts = null;
  }

  /** Emits the next buffered part; caller ensures parts != null. */
  private void emitNextPart() {
    termAttOut.set(parts[nextPart]);

    // The first part leaves the original token's from node; later parts
    // chain off the node we created for the previous part:
    int partFrom = nextPart == 0 ? arcAttIn.from() : arcAttOut.to();

    if (nextPart == 1) {
      // Clear our reservation as we output current token:
      nodes.wontChange(arcAttIn.from());
    }

    int partTo;
    if (nextPart == parts.length - 1) {
      // The last part rejoins the original token's to node:
      partTo = arcAttIn.to();
      parts = null;
    } else {
      partTo = nodes.newNode();
      nextPart++;
    }
    arcAttOut.set(partFrom, partTo);
  }

  @Override
  public boolean next() throws IOException {
    if (parts != null) {
      emitNextPart();
      return true;
    }

    if (prevStage.next() == false) {
      return false;
    }

    // First pass the original token through unchanged:
    // nocommit copyTo?
    termAttOut.set(termAttIn.get());
    arcAttOut.set(arcAttIn.from(), arcAttIn.to());

    // NOTE(review): uses toString() where get() is used above — confirm
    // TermAttribute.toString() returns the raw term text:
    parts = termAttIn.toString().split("-");
    if (parts.length == 1) {
      parts = null;
    } else {
      // Reserve right to change this node:
      nodes.mayChange(arcAttIn.from());
      nextPart = 0;
    }

    // nocommit: what to do about offset...
    return true;
  }
}
package org.apache.lucene.analysis.stages;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.util.Attribute;

/**
 * Represents one stage of an analysis pipeline.
 */
public abstract class Stage {
  protected final Stage prevStage;

  // Single NodeTracker instance is shared across all
  // stages:
  protected final NodeTracker nodes;

  /**
   * Which Attributes this stage defines
   */
  private final Map<Class<? extends Attribute>, Attribute> atts = new LinkedHashMap<>();

  protected Stage(Stage prevStage) {
    this.prevStage = prevStage;
    if (prevStage == null) {
      this.nodes = new NodeTracker();
    } else {
      this.nodes = prevStage.nodes;
    }
  }

  /**
   * Declares an attribute produced by this stage and returns its instance;
   * throws IllegalArgumentException if this class was already declared here.
   */
  protected final <A extends Attribute> A create(Class<A> attClass) {
    Attribute att = atts.get(attClass);
    if (att == null) {
      try {
        att = attClass.newInstance();
      } catch (InstantiationException | IllegalAccessException e) {
        // Keep the cause so the real failure isn't lost:
        throw new IllegalArgumentException("Could not instantiate implementing class for " + attClass.getName(), e);
      }

      atts.put(attClass, att);
      return attClass.cast(att);
    } else {
      throw new IllegalArgumentException(attClass + " was already added");
    }
  }

  /**
   * Returns this stage's instance of the attribute, walking back through
   * earlier stages if this one doesn't define it; null if no stage does.
   */
  public final <A extends Attribute> A get(Class<A> attClass) {
    Attribute attImpl = atts.get(attClass);
    if (attImpl == null) {
      if (prevStage != null) {
        return prevStage.get(attClass);
      } else {
        return null;
      }
    }

    return attClass.cast(attImpl);
  }

  public abstract boolean next() throws IOException;

  // Only set for first Stage in a chain:
  private Reader input;

  public void reset(Reader reader) {
    if (prevStage != null) {
      prevStage.reset(reader);
    } else {
      nodes.reset();
      input = reader;
    }
  }

  protected final int correctOffset(int currentOff) {
    // nocommit should we strongly type this (like
    // Tokenizer/TokenFilter today)?
    if (input == null) {
      throw new IllegalStateException("only first Stage can call correctOffset");
    }
    return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff;
  }

  // nocommit should we impl close()? why?

  public boolean anyNodesCanChange() {
    return nodes.anyNodesCanChange();
  }
}
package org.apache.lucene.analysis.stages;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;

/**
 * An analyzer based on Stage analysis components.
 */
public abstract class StageAnalyzer extends Analyzer {

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Stage pipeline = getStages();
    final StageToTokenStream stream = new StageToTokenStream(pipeline);
    return new TokenStreamComponents(null, stream) {
      @Override
      protected void setReader(final Reader reader) {
        // Push the new reader down through the whole stage chain:
        pipeline.reset(reader);
        // nocommit confusing
        stream.resetCalled = false;
        // Guard against a consumer that never closed the previous stream:
        if (stream.closeCalled == false) {
          throw new IllegalStateException("you forgot to call close");
        }
        stream.closeCalled = false;
      }
    };
  }

  /** Subclasses return the final stage of their analysis chain. */
  protected abstract Stage getStages();
}
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.stages.attributes.ArcAttribute; +import org.apache.lucene.analysis.stages.attributes.DeletedAttribute; +import org.apache.lucene.analysis.stages.attributes.OffsetAttribute; +import org.apache.lucene.analysis.stages.attributes.TermAttribute; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.util.ArrayUtil; + +// TODO +// - is there an adversary here? that can cause +// indefinite buffering? + +// nocommit ToTokenizer instead? TokenFilter? + +// nocommit make this more generic? ie not just atts for +// current indexer ... eg use captureState/restoreState to +// pass through any custom atts too + +/** + * This is a compatibility class, to map the new {@link Stage} API to + * the legacy {@link TokenStream} API currently used/required by + * consumers like {@link IndexWriter} and query parsers. It takes + * a {@link Stage} as input and produces a {@link TokenStream} as + * output. This is not general purpose: it currently only sets + * the attributes that the (core, no custom indexing chain) indexer + * requires. 
+ */ + +public class StageToTokenStream extends TokenStream { + + // nocommit: cutover to the approach from SausageGraphFilter + + private final Stage stage; + private final DeletedAttribute delAtt; + private final TermAttribute termAttIn; + private final ArcAttribute arcAttIn; + private final OffsetAttribute offsetAttIn; + + private final org.apache.lucene.analysis.tokenattributes.CharTermAttribute termAttOut; + private final org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute posIncAttOut; + private final org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsetAttOut; + + protected boolean resetCalled; + protected boolean closeCalled = true; + + // Non-null when we are iterating through previously + // buffered tokens: + private Node[] pendingNodes; + private int nodeUpto; + private int arcUpto; + private int lastPosition; + private int pendingPosInc; + + private int finalEndOffset; + + // How many nodes in the current clump have no leaving arcs: + private int frontierNodeCount; + + /** + * Holds a buffered node + */ + private static class Node implements Comparable { + int position; + final List leaving = new ArrayList(); + + public int compareTo(Node other) { + // No tie break ... I think that's OK? 
+ return position - other.position; + } + } + + private static class Arc { + final Node to; + final String term; + final int startOffset, endOffset; + final boolean deleted; + + public Arc(Node to, String term, int startOffset, int endOffset, boolean deleted) { + this.to = to; + this.term = term; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.deleted = deleted; + } + } + + public StageToTokenStream(Stage stage) { + this.stage = stage; + termAttIn = stage.get(TermAttribute.class); + termAttOut = addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); + posIncAttOut = addAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class); + offsetAttIn = stage.get(OffsetAttribute.class); + offsetAttOut = addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class); + arcAttIn = stage.get(ArcAttribute.class); + delAtt = stage.get(DeletedAttribute.class); + } + + private Node getNode(Map nodes, int node) { + Node n = nodes.get(node); + if (n == null) { + n = new Node(); + nodes.put(node, n); + frontierNodeCount++; + } + return n; + } + + private void saveToken(Map nodes) { + Node from = nodes.get(arcAttIn.from()); + Node to = getNode(nodes, arcAttIn.to()); + to.position = Math.max(to.position, 1 + from.position); + if (from.leaving.isEmpty()) { + frontierNodeCount--; + assert frontierNodeCount >= 0; + } + from.leaving.add(new Arc(to, termAttIn.toString(), offsetAttIn.startOffset(), offsetAttIn.endOffset(), delAtt != null && delAtt.deleted())); + } + + private boolean nextSavedToken() { + while (pendingNodes != null) { + // restore state from pending node/arc: + Node node = pendingNodes[nodeUpto]; + if (node.leaving.isEmpty()) { + assert nodeUpto == pendingNodes.length - 1; + pendingPosInc = node.position - lastPosition - 1; + assert pendingPosInc >= 0; + System.out.println(" break: posInc=" + pendingPosInc + " lastPos=" + lastPosition + " vs node.pos=" + node.position); + break; + } 
+ + Arc arc = node.leaving.get(arcUpto); + arcUpto++; + if (arcUpto == node.leaving.size()) { + nodeUpto++; + if (nodeUpto == pendingNodes.length) { + pendingPosInc = node.position - lastPosition; + pendingNodes = null; + } else { + arcUpto = 0; + } + } + + if (!arc.deleted) { + termAttOut.setEmpty(); + termAttOut.append(arc.term); + offsetAttOut.setOffset(arc.startOffset, arc.endOffset); + posIncAttOut.setPositionIncrement(node.position - lastPosition); + System.out.println(" set posInc=" + (node.position - lastPosition)); + // TODO: it'd be trivial to also set PosLengthAtt + // ... but since indexer is immediately after us, and + // indexer ignores pos len, there's no point today + //posLenAttOut.setPositionLength(arc.to.position - node.position); + pendingPosInc = 0; + lastPosition = node.position; + System.out.println(" set lastPos=" + lastPosition); + System.out.println(" return token=" + termAttOut); + return true; + } else { + System.out.println(" skip deleted token"); + } + } + + return false; + } + + // nocommit this can falsely join two clumps into one, eg + // two back-to-back synonyms + + @Override + public final boolean incrementToken() throws IOException { + System.out.println("STS.inc"); + + if (resetCalled == false) { + throw new IllegalStateException("call reset first"); + } + + // This is pointless (we always set all of the attrs we + // export), but tests disagree: + clearAttributes(); + + if (pendingNodes != null) { + // Still iterating through buffered tokens from last + // clump: + if (nextSavedToken()) { + System.out.println(" buffered: " + termAttOut); + return true; + } + System.out.println(" buffered fall through"); + // We can fall through to here, eg if the last + // buffered token(s) were deleted (holes) + } + + if (stage.next()) { + if (stage.nodes.anyNodesCanChange()) { + System.out.println(" now buffer: " + termAttIn); + Map nodes = new HashMap(); + nodes.put(arcAttIn.from(), new Node()); + frontierNodeCount = 1; + + // Buffer up 
this "clump" of overlapping tokens + // until it un-clumps itself: + saveToken(nodes); + while (true) { + boolean result = stage.next(); + // So long as there are still nodes that can + // change, there must be more tokens (hmm is this + // really true...): + assert result : "Stage.next ended without freezing all nodes"; + saveToken(nodes); + System.out.println(" buffer again: " + termAttIn + "; " + stage.anyNodesCanChange() + " " + frontierNodeCount); + if (!stage.anyNodesCanChange() && frontierNodeCount == 1) { + System.out.println(" clump done"); + break; + } + } + + // Sort all nodes by position: + pendingNodes = nodes.values().toArray(new Node[nodes.size()]); + ArrayUtil.timSort(pendingNodes); + for (Node node : pendingNodes) { + System.out.println(" node pos=" + node.position + " " + node.leaving.size() + " leaving"); + for (Arc arc : node.leaving) { + System.out.println(" " + arc.term + " to pos=" + arc.to.position); + } + } + nodeUpto = 0; + arcUpto = 0; + lastPosition = -1; + return nextSavedToken(); + + } else { + System.out.println(" pass through"); + // Fast path (pass through): no buffering necessary: + termAttOut.setEmpty(); + termAttOut.append(termAttIn.get()); + offsetAttOut.setOffset(offsetAttIn.startOffset(), + offsetAttIn.endOffset()); + posIncAttOut.setPositionIncrement(1 + pendingPosInc); + pendingPosInc = 0; + return true; + } + } else { + finalEndOffset = offsetAttIn.endOffset(); + } + return false; + } + + @Override + public void end() throws IOException { + super.end(); + offsetAttOut.setOffset(finalEndOffset, finalEndOffset); + } + + @Override + public void reset() throws IOException { + super.reset(); + pendingNodes = null; + resetCalled = true; + } + + @Override + public void close() throws IOException { + closeCalled = true; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/StopFilterStage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/StopFilterStage.java new file mode 
100644 index 0000000..831fd8b --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/StopFilterStage.java @@ -0,0 +1,57 @@ +package org.apache.lucene.analysis.stages; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.stages.attributes.DeletedAttribute; +import org.apache.lucene.analysis.stages.attributes.TermAttribute; + +/** + * Marks stop words as deleted + */ +public class StopFilterStage extends Stage { + + private final CharArraySet stopWords; + private final TermAttribute termAtt; + private final DeletedAttribute delAttIn; + private final DeletedAttribute delAttOut; + + public StopFilterStage(Stage prevStage, CharArraySet stopWords) { + super(prevStage); + this.stopWords = stopWords; + termAtt = prevStage.get(TermAttribute.class); + delAttIn = prevStage.get(DeletedAttribute.class); + delAttOut = create(DeletedAttribute.class); + } + + @Override + public boolean next() throws IOException { + if (prevStage.next()) { + if ((delAttIn != null && delAttIn.deleted()) || stopWords.contains(termAtt.get())) { + delAttOut.set(true); + } else { + delAttOut.set(false); + } + return true; + } else { + return false; + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SynonymFilterStage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SynonymFilterStage.java new file mode 100644 index 0000000..53e7089 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/SynonymFilterStage.java @@ -0,0 +1,344 @@ +package org.apache.lucene.analysis.stages; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Objects; + +import org.apache.lucene.analysis.stages.attributes.ArcAttribute; +import org.apache.lucene.analysis.stages.attributes.OffsetAttribute; +import org.apache.lucene.analysis.stages.attributes.TermAttribute; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.FST; + +// nocommit does not do keepOrig + +// nocommit should we ... allow recursing on ourselves? ie +// so sometimes an output could be parsed against an input +// rule? + +/** + * Synonym filter, that improves on existing one: it can + * handle any graph input, and it can create new positions + * (e.g., dns -> domain name service). 
+ */ + +public class SynonymFilterStage extends Stage { + + // We change the term: + private final TermAttribute termAttIn; + private final TermAttribute termAttOut; + + // We change the to/from: + private final ArcAttribute arcAttIn; + private final ArcAttribute arcAttOut; + + // We set offsets: + private final OffsetAttribute offsetAttIn; + private final OffsetAttribute offsetAttOut; + + // Used only inside addMatch + private final BytesRef scratchBytes = new BytesRef(); + + private final LinkedList pendingOutputs = new LinkedList(); + + private final List pending = new ArrayList(); + + private final SynonymMap synonyms; + + private final FST.Arc scratchArc; + + private final FST fst; + + private final boolean ignoreCase; + + private final FST.BytesReader fstReader; + + /** + * Used to decode outputs + */ + private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + private static class PartialMatch { + final int fromNode; + final int toNode; + final int startOffset; + final FST.Arc fstNode; + final BytesRef output; + + public PartialMatch(int startOffset, int fromNode, int toNode, FST.Arc fstNode, BytesRef output) { + this.startOffset = startOffset; + this.fromNode = fromNode; + this.toNode = toNode; + this.fstNode = fstNode; + this.output = output; + } + } + + private static class OutputToken { + final String text; + final int startOffset; + final int endOffset; + final int fromNode; + final int toNode; + + public OutputToken(String text, int startOffset, int endOffset, int fromNode, int toNode) { + this.text = text; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.fromNode = fromNode; + this.toNode = toNode; + } + } + + public SynonymFilterStage(Stage prevStage, SynonymMap synonyms, boolean ignoreCase) { + super(prevStage); + termAttIn = prevStage.get(TermAttribute.class); + termAttOut = create(TermAttribute.class); + arcAttIn = prevStage.get(ArcAttribute.class); + arcAttOut = create(ArcAttribute.class); + offsetAttIn = 
 prevStage.get(OffsetAttribute.class); + offsetAttOut = create(OffsetAttribute.class); + this.synonyms = synonyms; + this.ignoreCase = ignoreCase; + fst = synonyms.fst; + if (fst == null) { + throw new IllegalArgumentException("fst must be non-null"); + } + fstReader = fst.getBytesReader(); + scratchArc = new FST.Arc(); + } + + // nocommit we need reset! make test that fails if only + // partial consume + @Override + public void reset(Reader reader) { + super.reset(reader); + pending.clear(); + pendingOutputs.clear(); + } + + /** + * Extends all pending paths, and starts new paths, + * matching the current token. + */ + private void matchOne(PartialMatch match) throws IOException { + BytesRef output; + if (match != null) { + scratchArc.copyFrom(match.fstNode); + output = match.output; + } else { + // Start a new match here: + fst.getFirstArc(scratchArc); + output = fst.outputs.getNoOutput(); + } + + int bufferLen = termAttIn.get().length(); + int bufUpto = 0; + while (bufUpto < bufferLen) { + final int codePoint = Character.codePointAt(termAttIn.get(), bufUpto); + if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { + return; + } + + // Accum the output + output = fst.outputs.add(output, scratchArc.output); + bufUpto += Character.charCount(codePoint); + } + + // Entire token matched; now see if this is a final + // state: + if (scratchArc.isFinal()) { + // A match! 
+ addMatch(match, fst.outputs.add(output, scratchArc.nextFinalOutput)); + } + + // See if the FST wants to continue matching (ie, needs to + // see the next input token): + if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) != null) { + // More matching is possible -- accum the output (if + // any) of the WORD_SEP arc and add a new + // PartialMatch: + int startOffset; + int fromNode; + if (match == null) { + startOffset = offsetAttIn.startOffset(); + fromNode = arcAttIn.from(); + } else { + startOffset = match.startOffset; + fromNode = match.fromNode; + } + + // Reserve the right to use this node as a future fromNode: + nodes.mayChange(fromNode); + //System.out.println(" incr mayChange node=" + fromNode); + //System.out.println(" add pending to node=" + arcAttIn.to()); + pending.add(new PartialMatch(startOffset, + fromNode, + arcAttIn.to(), + new FST.Arc().copyFrom(scratchArc), + fst.outputs.add(output, scratchArc.output))); + } + } + + /** + * Records a full match; on the next next() we will feed + * output tokens from it. 
 + */ + private void addMatch(PartialMatch match, BytesRef bytes) { + //System.out.println(" add full match!"); + + bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); + + final int code = bytesReader.readVInt(); + final boolean keepOrig = (code & 0x1) == 0; + if (!keepOrig) { + throw new IllegalArgumentException("this SynonymFilter requires keepOrig = true"); + } + + final int count = code >>> 1; + for (int outputIDX = 0; outputIDX < count; outputIDX++) { + synonyms.words.get(bytesReader.readVInt(), scratchBytes); + char[] chars = ArrayUtil.grow(new char[0], scratchBytes.length); + int len = UnicodeUtil.UTF8toUTF16(scratchBytes, chars); + + int lastStart = 0; + int lastNode = match.fromNode; + + for (int chIDX = 0; chIDX <= len; chIDX++) { + if (chIDX == len || chars[chIDX] == SynonymMap.WORD_SEPARATOR) { + final int outputLen = chIDX - lastStart; + + // Caller is not allowed to have empty string in + // the output: + assert outputLen > 0 : "output contains empty string: " + Objects.toString(chars); + + int toNode; + if (chIDX == len) { + toNode = arcAttIn.to(); + } else { + toNode = nodes.newNode(); + } + + // These offsets make sense for "domain name service -> + // DNS", but for "DNS -> domain name service" it's a + // little weird because each of the 3 tokens will have + // the same offsets ... but I don't see how we can do + // any better? + //System.out.println(" add output=" + new String(scratchChars.chars, lastStart, outputLen)); + pendingOutputs.add(new OutputToken(new String(chars, lastStart, outputLen), + match.startOffset, offsetAttIn.endOffset(), + lastNode, toNode)); + lastNode = toNode; + lastStart = 1 + chIDX; + } + } + } + } + + /** + * Update all matches for the current token. 
+ */ + private void match() throws IOException { + int from = arcAttIn.from(); + + // First extend any existing partial matches: + int end = pending.size(); + for (int i = 0; i < end; i++) { + PartialMatch match = pending.get(i); + //System.out.println(" try to extend match ending @ node=" + match.toNode + " vs from=" + from); + if (match.toNode == from) { + matchOne(match); + } + } + + // Then start any new matches: + matchOne(null); + + // Prune any dead partial matches: + int upto = 0; + for (int i = 0; i < pending.size(); i++) { + PartialMatch match = pending.get(i); + int to = match.toNode; + if (to != arcAttIn.from() && to != arcAttIn.to() && !nodes.getCanChange(to)) { + // Prune this path + //System.out.println(" prune path @ node=" + match.toNode); + //System.out.println(" decr node=" + match.fromNode); + nodes.wontChange(match.fromNode); + } else { + if (upto < i) { + pending.set(upto, pending.get(i)); + } + upto++; + } + } + + pending.subList(upto, pending.size()).clear(); + } + + private boolean insertOutputToken() { + OutputToken token = pendingOutputs.pollFirst(); + if (token != null) { + //System.out.println(" insert output token! text=" + token.text + " offset=" + token.startOffset + "/" + token.endOffset); + // We still have outputs pending from previous match: + termAttOut.set(token.text); + offsetAttOut.setOffset(token.startOffset, token.endOffset); + arcAttOut.set(token.fromNode, token.toNode); + return true; + } else { + return false; + } + } + + public boolean next() throws IOException { + //System.out.println("\nSYN.next"); + if (insertOutputToken()) { + return true; + } + //System.out.println(" input.next()"); + if (prevStage.next()) { + // Extend matches for this new token: + //System.out.println(" got token=" + termAttIn + " from=" + arcAttIn.from() + " to=" + arcAttIn.to()); + match(); + + // nocommit copy? 
+ termAttOut.set(termAttIn.get()); + + offsetAttOut.setOffset(offsetAttIn.startOffset(), + offsetAttIn.endOffset()); + arcAttOut.set(arcAttIn.from(), + arcAttIn.to()); + return true; + } else { + // Prune any remaining partial matches: + for (int i = 0; i < pending.size(); i++) { + nodes.wontChange(pending.get(i).fromNode); + } + pending.clear(); + return false; + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/WhitespaceTokenizerStage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/WhitespaceTokenizerStage.java new file mode 100644 index 0000000..9005be9 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/WhitespaceTokenizerStage.java @@ -0,0 +1,25 @@ +package org.apache.lucene.analysis.stages; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class WhitespaceTokenizerStage extends CharTokenizerStage { + @Override + protected boolean isTokenChar(int c) { + return !Character.isWhitespace(c); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/ArcAttribute.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/ArcAttribute.java new file mode 100644 index 0000000..cb0ddd0 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/ArcAttribute.java @@ -0,0 +1,72 @@ +package org.apache.lucene.analysis.stages.attributes; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.Attribute; + +public class ArcAttribute implements Attribute, Cloneable { + private int from; + private int to; + + public ArcAttribute() { + } + + public int from() { + return from; + } + + public int to() { + return to; + } + + public void set(int from, int to) { + this.from = from; + this.to = to; + } + + public void clear() { + from = 0; + to = 0; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof ArcAttribute) { + ArcAttribute o = (ArcAttribute) other; + return o.from == from && o.to == to; + } + + return false; + } + + @Override + public int hashCode() { + int code = from; + code = code * 31 + to; + return code; + } + + public void copyTo(Attribute target) { + ArcAttribute t = (ArcAttribute) target; + t.set(from, to); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/DeletedAttribute.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/DeletedAttribute.java new file mode 100644 index 0000000..db13570 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/DeletedAttribute.java @@ -0,0 +1,63 @@ +package org.apache.lucene.analysis.stages.attributes; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; + +public class DeletedAttribute implements Attribute, Cloneable { + private boolean deleted; + + public DeletedAttribute() { + } + + public boolean deleted() { + return deleted; + } + + public void set(boolean deleted) { + this.deleted = deleted; + } + + public void clear() { + deleted = false; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof DeletedAttribute) { + DeletedAttribute o = (DeletedAttribute) other; + return o.deleted == deleted; + } + + return false; + } + + @Override + public int hashCode() { + return deleted ? 31 : 57; + } + + public void copyTo(Attribute target) { + DeletedAttribute t = (DeletedAttribute) target; + t.set(deleted); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/OffsetAttribute.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/OffsetAttribute.java new file mode 100644 index 0000000..6b138e9 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/OffsetAttribute.java @@ -0,0 +1,89 @@ +package org.apache.lucene.analysis.stages.attributes; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; + +public class OffsetAttribute implements Attribute, Cloneable { + private int startOffset; + private int endOffset; + + /** + * Initialize this attribute with startOffset and endOffset of 0. + */ + public OffsetAttribute() { + } + + public int startOffset() { + return startOffset; + } + + public void setOffset(int startOffset, int endOffset) { + + // TODO: we could assert that this is set-once, ie, + // current values are -1? Very few token filters should + // change offsets once set by the tokenizer... and + // tokenizer should call clearAtts before re-using + // OffsetAtt + + if (startOffset < 0 || endOffset < startOffset) { + throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + + "startOffset=" + startOffset + ",endOffset=" + endOffset); + } + + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + public int endOffset() { + return endOffset; + } + + public void clear() { + // TODO: we could use -1 as default here? Then we can + // assert in setOffset... 
+ startOffset = 0; + endOffset = 0; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof OffsetAttribute) { + OffsetAttribute o = (OffsetAttribute) other; + return o.startOffset == startOffset && o.endOffset == endOffset; + } + + return false; + } + + @Override + public int hashCode() { + int code = startOffset; + code = code * 31 + endOffset; + return code; + } + + public void copyTo(Attribute target) { + OffsetAttribute t = (OffsetAttribute) target; + t.setOffset(startOffset, endOffset); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/TermAttribute.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/TermAttribute.java new file mode 100644 index 0000000..5431a6d --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/attributes/TermAttribute.java @@ -0,0 +1,38 @@ +package org.apache.lucene.analysis.stages.attributes; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; + +// TODO: CharSequence again? 
+public class TermAttribute implements Attribute { + private String term; + + public void set(String term) { + this.term = term; + } + + public String get() { + return term; + } + + @Override + public String toString() { + return term; + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/AssertingStage.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/AssertingStage.java new file mode 100644 index 0000000..cf3f6ed --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/AssertingStage.java @@ -0,0 +1,74 @@ +package org.apache.lucene.analysis.stages; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.stages.attributes.ArcAttribute; +import org.apache.lucene.analysis.stages.attributes.TermAttribute; + +class AssertingStage extends Stage { + final ArcAttribute arcAtt; + final TermAttribute termAtt; + private int lastFrom; + private int lastTo; + + private final Set allToNodes = new HashSet(); + private final Set allFromNodes = new HashSet(); + + public AssertingStage(Stage prevStage) { + super(prevStage); + arcAtt = prevStage.get(ArcAttribute.class); + termAtt = prevStage.get(TermAttribute.class); + } + + @Override + public void reset(Reader reader) { + super.reset(reader); + allToNodes.clear(); + allToNodes.add(0); + allFromNodes.clear(); + } + + @Override + public boolean next() throws IOException { + if (prevStage.next()) { + int from = arcAtt.from(); + int to = arcAtt.to(); + + if (allToNodes.contains(from) && !allFromNodes.contains(from)) { + // OK: from is a "frontier" node (only has arriving + // tokens and no leaving tokens yet) + } else if (nodes.getCanChange(from)) { + // OK: this node was explicitly reserved as still + // having changes + } else { + throw new IllegalStateException("node=" + from + " is frozen, but current token (" + termAtt + ") uses it as from node"); + } + + allFromNodes.add(from); + allToNodes.add(to); + return true; + } else { + return false; + } + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/TestStages.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/TestStages.java new file mode 100644 index 0000000..2e8e35b --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/stages/TestStages.java @@ -0,0 +1,313 @@ +package org.apache.lucene.analysis.stages; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

/** Tests for the experimental Stage-based analysis chain. */
public class TestStages extends BaseTokenStreamTestCase {

  /**
   * Like assertAnalyzesTo, but handles a graph: verifies
   * the automaton == the union of the expectedStrings.
   */
  private void assertMatches(Automaton a, String... paths) {
    List<Automaton> subs = new ArrayList<>();
    for (String path : paths) {
      String[] tokens = path.split(" ");

      // Build one linear automaton per path: the UTF-8 bytes of each
      // token, with a POS_SEP arc between adjacent tokens.
      Automaton.Builder sub = new Automaton.Builder();
      int lastState = sub.createState();

      for (int i = 0; i < tokens.length; i++) {
        String token = tokens[i];
        BytesRef br = new BytesRef(token);
        for (int j = 0; j < br.length; j++) {
          int state = sub.createState();
          sub.addTransition(lastState, state, br.bytes[br.offset + j]);
          lastState = state;
        }
        if (i < tokens.length - 1) {
          int state = sub.createState();
          sub.addTransition(lastState, state, AutomatonStage.POS_SEP);
          lastState = state;
        }
      }

      sub.setAccept(lastState, true);
      subs.add(sub.finish());
    }

    Automaton expected = Operations.removeDeadStates(Operations.determinize(Operations.union(subs), DEFAULT_MAX_DETERMINIZED_STATES));
    if (!Operations.sameLanguage(expected, a)) {
      // Dump minimized dot representations to help debug the mismatch:
      System.out.println("expected:\n" + MinimizationOperations.minimize(expected, DEFAULT_MAX_DETERMINIZED_STATES).toDot());
      System.out.println("actual:\n" + MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES).toDot());
      throw new AssertionError("languages differ");
    }
  }

  /**
   * Runs the text through the analyzer and verifies the
   * resulting automaton == union of the expectedStrings.
   */
  private void assertMatches(String text, Stage end, String... expectedStrings) throws IOException {
    AutomatonStage a = new AutomatonStage(new AssertingStage(end));
    // NOTE(review): termAtt is only used by the commented-out debug
    // println below.
    CharTermAttribute termAtt = a.get(CharTermAttribute.class);
    ArcAttribute arcAtt = a.get(ArcAttribute.class);
    // Run twice to verify the stage chain is reusable after reset:
    for (int i = 0; i < 2; i++) {
      a.reset(new StringReader(text));
      while (a.next()) {
        //System.out.println("token=" + termAtt + " from=" + arcAtt.from() + " to=" + arcAtt.to());
      }
      assertMatches(a.getAutomaton(), expectedStrings);
      assertFalse(a.anyNodesCanChange());
    }
  }

  public void testSimple() throws Exception {
    assertMatches("This is a test",
        new LowerCaseFilterStage(Version.LATEST, new WhitespaceTokenizerStage()),
        "this is a test");
  }

  public void testSplitOnDash() throws Exception {
    assertMatches("The drill-down-test works",
        new SplitOnDashFilterStage(new LowerCaseFilterStage(Version.LATEST, new WhitespaceTokenizerStage())),
        "the drill-down-test works",
        "the drill down test works");
  }

  /** Registers input -> output as a synonym rule (keeping the original). */
  private void add(SynonymMap.Builder b, String input, String output) {
    if (VERBOSE) {
      System.out.println("  add input=" + input + " output=" + output);
    }
    CharsRefBuilder inputCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

    CharsRefBuilder outputCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

    b.add(inputCharsRef.get(), outputCharsRef.get(), true);
  }

  public void testSyn() throws Exception {
    SynonymMap.Builder b = new SynonymMap.Builder(true);
    add(b, "a b c", "x");
    assertMatches("a b c foo",
        new SynonymFilterStage(new WhitespaceTokenizerStage(), b.build(), true),
        "a b c foo", "x foo");
  }

  public void testSyn2() throws Exception {
    SynonymMap.Builder b = new SynonymMap.Builder(true);
    add(b, "a b c", "x");
    assertMatches("a b c",
        new SynonymFilterStage(new WhitespaceTokenizerStage(), b.build(), true),
        "a b c", "x");
  }

  public void testSynAfterDecompound() throws Exception {
    SynonymMap.Builder b = new SynonymMap.Builder(true);
    add(b, "a b c", "x");
    // Decompounder splits a-b into a and b, and then
    // SynFilter runs after that and sees "a b c" match:
    assertMatches("a-b c foo",
        new SynonymFilterStage(new SplitOnDashFilterStage(new WhitespaceTokenizerStage()), b.build(), true),
        "a b c foo", "a-b c foo", "x foo");
  }

  // No buffering needed:
  public void testBasicStageAnalyzer() throws Exception {
    Analyzer a = new StageAnalyzer() {
      @Override
      protected Stage getStages() {
        return new LowerCaseFilterStage(Version.LATEST, new WhitespaceTokenizerStage());
      }
    };

    assertAnalyzesTo(a, "This is a test",
        new String[]{"this", "is", "a", "test"},
        new int[]{0, 5, 8, 10},
        new int[]{4, 7, 9, 14},
        null,
        new int[]{1, 1, 1, 1});
  }

  // Buffering needed:
  public void testSplitOnDashStageAnalyzer() throws Exception {
    Analyzer a = new StageAnalyzer() {
      @Override
      protected Stage getStages() {
        return new SplitOnDashFilterStage(new LowerCaseFilterStage(Version.LATEST, new WhitespaceTokenizerStage()));
      }
    };

    assertAnalyzesTo(a, "The drill-down-test works",
        new String[]{"the", "drill-down-test", "drill", "down", "test", "works"},
        new int[]{0, 4, 4, 4, 4, 20},
        new int[]{3, 19, 19, 19, 19, 25},
        null,
        new int[]{1, 1, 0, 1, 1, 1});
  }

  // Buffering needed:
  public void testSynStageAnalyzer() throws Exception {
    SynonymMap.Builder b = new SynonymMap.Builder(true);
    add(b, "a b c", "x");
    final SynonymMap map = b.build();

    Analyzer a = new StageAnalyzer() {
      @Override
      protected Stage getStages() {
        return new SynonymFilterStage(new WhitespaceTokenizerStage(), map, true);
      }
    };

    assertAnalyzesTo(a, "a b c foo",
        new String[]{"a", "x", "b", "c", "foo"},
        new int[]{0, 0, 2, 4, 6},
        new int[]{1, 5, 3, 5, 9},
        null,
        new int[]{1, 0, 1, 1, 1});
  }

  // Buffering needed:
  public void testSynAfterDecompoundStageAnalyzer() throws Exception {
    SynonymMap.Builder b = new SynonymMap.Builder(true);
    add(b, "a b c", "x");
    final SynonymMap map = b.build();
    Analyzer a = new StageAnalyzer() {
      @Override
      protected Stage getStages() {
        return new SynonymFilterStage(new SplitOnDashFilterStage(new WhitespaceTokenizerStage()), map, true);
      }
    };

    // Decompounder splits a-b into a and b, and then
    // SynFilter runs after that and sees "a b c" match:
    assertAnalyzesTo(a, "a-b c foo",
        new String[]{"a-b", "a", "x", "b", "c", "foo"},
        new int[]{0, 0, 0, 0, 4, 6},
        new int[]{3, 3, 5, 3, 5, 9},
        null,
        new int[]{1, 0, 0, 1, 1, 1});
  }

  public void testStopFilterStageAnalyzer() throws Exception {
    final CharArraySet stopWords = new CharArraySet(1, false);
    stopWords.add("the");

    Analyzer a = new StageAnalyzer() {
      @Override
      protected Stage getStages() {
        return new StopFilterStage(new SplitOnDashFilterStage(new WhitespaceTokenizerStage()), stopWords);
      }
    };

    // Decompounder splits the-dog into the and dog; the stop
    // filter then removes "the" but the original compound
    // token "the-dog" survives:
    assertAnalyzesTo(a, "the-dog barks",
        new String[]{"the-dog", "dog", "barks"},
        new int[]{0, 0, 8},
        new int[]{7, 7, 13},
        null,
        new int[]{1, 1, 1});
  }

  /** CharFilter that shifts every corrected offset by +1. */
  public static class SillyCharFilter extends CharFilter {
    public SillyCharFilter(Reader input) {
      super(input);
    }

    @Override
    public int read(char[] buffer, int offset, int length) throws IOException {
      return input.read(buffer, offset, length);
    }

    @Override
    protected int correct(int currentOff) {
      return currentOff + 1;
    }
  }

  public void testCharFilter() throws Exception {
    Analyzer a = new StageAnalyzer() {
      @Override
      protected Stage getStages() {
        return new LowerCaseFilterStage(Version.LATEST, new WhitespaceTokenizerStage());
      }

      @Override
      protected Reader initReader(String fieldName, Reader input) {
        return new SillyCharFilter(input);
      }
    };

    // Same as testBasicStageAnalyzer, but all offsets
    // (incl. finalOffset) have been "corrected" by +1:
    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("This is a test")),
        new String[]{"this", "is", "a", "test"},
        new int[]{1, 6, 9, 11},
        new int[]{5, 8, 10, 15},
        null,
        new int[]{1, 1, 1, 1},
        null,
        15);
  }

  /** Tokenizer that breaks on whitespace and on ',' (commas are dropped). */
  static class WhitespaceOrPunctTokenizerStage extends CharTokenizerStage {
    @Override
    protected boolean isTokenChar(int c) {
      return !Character.isWhitespace(c) && c != ',';
    }
  }

  public void testInsertDeletedPunctuation() throws Exception {
    SynonymMap.Builder b = new SynonymMap.Builder(true);
    add(b, "a b c", "x");

    Stage s = new SynonymFilterStage(new InsertDeletedPunctuationStage(new LowerCaseFilterStage(Version.LATEST, new WhitespaceOrPunctTokenizerStage()), "p"),
        b.build(), true);

    // comma prevents syn match, even though tokenizer
    // skipped it:
    assertMatches("a, b c", s, "a p b c");

    // no comma allows syn match:
    assertMatches("a b c", s, "a b c", "x");
  }
}