();
+ synonyms.put(
+ new String[]
+ { "new", "york" },
+ new String[][] {
+ { "big", "apple" },
+ { "new", "york", "city" } });
+ synonyms.put(
+ new String[] {
+ "national", "basketball", "association" },
+ new String[][] {
+ { "nba" } });
+
+ synonymTree = new SynonymTree(synonyms.entrySet().iterator());
+ }
+
+ private IndexReader ir; // Reader taken from the searcher in buildIndex().
+ private IndexSearcher is; // Searcher created by buildIndex(); closed in tearDown().
+
+ /*
+  * Close the searcher opened by buildIndex() and drop the references to it and
+  * to the reader obtained from it. A failure to close is deliberately ignored:
+  * this is best-effort cleanup of test resources.
+  */
+ public void tearDown() {
+   if (is != null) {
+     try {
+       is.close();
+     } catch (IOException ignored) {
+       // Best-effort cleanup; there is nothing useful to do about it here.
+     }
+     is = null;
+   }
+
+   // The reader was obtained from the searcher, so do not keep it around
+   // once the searcher is gone.
+   ir = null;
+ }
+
+ /**
+ * Test phrase queries against a single document ("pre new york post") that
+ * was indexed through the synonym filter. Every query is expected to hit
+ * that one document.
+ */
+ public void testPhraseQueries() throws IOException {
+ buildIndex("pre new york post");
+
+ // Display terms indexed at all positions.
+ // TermPositionVector tfv = (TermPositionVector) ir.getTermFreqVector(0, "content");
+ // System.out.println(toString(tfv));
+
+ // Check phrase queries consisting of a synonym phrase only.
+ assertEquals(1, search("\"big apple\"").totalHits);
+ assertEquals(1, search("\"new york city\"").totalHits);
+
+ // Check phrase queries overlapping synonyms on the left and right.
+ assertEquals(1, search("\"pre big apple\"").totalHits);
+ assertEquals(1, search("\"pre new york city\"").totalHits);
+ assertEquals(1, search("\"big apple post\"").totalHits);
+
+ // TODO: This will not work for synonyms whose length differs from that of the
+ // original token sequence. I doubt the method implemented in this filter can be
+ // bent to support such a use case.
+ // assertEquals(1, search("\"new york city post\"").totalHits);
+ }
+
+ /**
+ * Test synonym positions (using the highlighter): highlighted regions are
+ * marked with '>' and '<' around the original text.
+ */
+ public void testHighlightPositions() throws IOException {
+ final String text = "pre new york post";
+ buildIndex(text);
+
+ assertEquals("pre >new york< post", highlight("\"big apple\"", text));
+
+ // TODO: This example highlights differently than expected. In the "new york city"
+ // phrase, all tokens point to the same offset range (full synonym phrase). The
+ // highlighter returns a duplicated ">new york<>new york<" for some reason.
+ // assertEquals("pre >new york< post", highlight("\"new york city\"", text));
+
+ // TODO: This example passes, but is awkward. For every word in the
+ // synonym, we match the entire original synonym sequence. This has the odd
+ // side effect of highlighting the entire phrase when only a part of the
+ // synonym is matched.
+ assertEquals("pre >new york< post", highlight("big", text));
+ }
+
+ /*
+ *
+ */
+ private String highlight(String query, String text) throws IOException {
+ Highlighter highlighter = new Highlighter(
+ new UnderlineFormatter(),
+ new QueryScorer(createQuery(query)));
+ highlighter.setTextFragmenter(new NullFragmenter());
+
+ try {
+ return highlighter.getBestFragment(createAnalyzer(), "content", text);
+ } catch (InvalidTokenOffsetsException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /*
+ *
+ */
+ private TopDocs search(String query) throws IOException {
+ TopDocs td = is.search(createQuery(query), 10);
+ return td;
+ }
+
+ /*
+ *
+ */
+ private Query createQuery(String query) throws IOException {
+ try {
+ QueryParser qp = new QueryParser("content", createAnalyzer());
+ return qp.parse(query);
+ } catch (ParseException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /*
+ *
+ */
+ private String toString(TermPositionVector tfv) {
+ int maxPosition = -1;
+ for (int index = 0; index < tfv.size(); index++)
+ for (int position : tfv.getTermPositions(index))
+ maxPosition = Math.max(maxPosition, position);
+
+ int [] positionCount = new int [maxPosition + 1];
+ for (int index = 0; index < tfv.size(); index++)
+ for (int position : tfv.getTermPositions(index))
+ positionCount[position]++;
+
+ String [][] tokens = new String [maxPosition + 1][];
+ for (int i = 0; i <= maxPosition; i++) {
+ tokens[i] = new String[positionCount[i]];
+ }
+
+ String [] terms = tfv.getTerms();
+ for (int index = 0; index < tfv.size(); index++)
+ {
+ for (int position : tfv.getTermPositions(index))
+ tokens[position][--positionCount[position]] = terms[index];
+ }
+
+ StringBuilder builder = new StringBuilder();
+ for (int index = 0; index <= maxPosition; index++) {
+ String [] t = tokens[index];
+ Arrays.sort(t);
+ builder.append(":" + index + "=");
+ builder.append(Arrays.toString(t));
+ }
+
+ return builder.toString();
+ }
+
+ /*
+  * Build a fresh in-memory index with one document per phrase (stored in the
+  * "content" field, analyzed with the synonym analyzer, with term vector
+  * positions and offsets) and open a searcher and reader over it.
+  *
+  * A searcher left over from a previous call is closed first, and the index
+  * writer is closed even if adding a document fails.
+  */
+ private void buildIndex(String... phrases) throws IOException {
+   // Do not leak the searcher of a previously built index.
+   if (is != null) {
+     is.close();
+     is = null;
+     ir = null;
+   }
+
+   RAMDirectory directory = new RAMDirectory();
+   Analyzer analyzer = createAnalyzer();
+   IndexWriter writer = new IndexWriter(directory, analyzer,
+       MaxFieldLength.UNLIMITED);
+   try {
+     for (String phrase : phrases) {
+       Document doc = new Document();
+       doc.add(new Field("content", phrase, Store.YES, Index.ANALYZED,
+           TermVector.WITH_POSITIONS_OFFSETS));
+       writer.addDocument(doc);
+     }
+     writer.commit();
+   } finally {
+     // Release the writer regardless of how indexing went.
+     writer.close();
+   }
+
+   this.is = new IndexSearcher(directory);
+   this.ir = is.getIndexReader();
+ }
+
+ /*
+ *
+ */
+ private Analyzer createAnalyzer() {
+ return new Analyzer() {
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new SynonymFilter(new WhitespaceTokenizer(reader), synonymTree);
+ }
+ };
+ }
+}
Index: contrib/synonyms/src/test/org/apache/lucene/index/synonyms/UnderlineFormatter.java
===================================================================
--- contrib/synonyms/src/test/org/apache/lucene/index/synonyms/UnderlineFormatter.java (wersja 0)
+++ contrib/synonyms/src/test/org/apache/lucene/index/synonyms/UnderlineFormatter.java (wersja 0)
@@ -0,0 +1,17 @@
+package org.apache.lucene.index.synonyms;
+
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.highlight.TokenGroup;
+
+/**
+ * Test-only {@link Formatter} that marks highlighted text by wrapping it in
+ * '>' and '<'. Text belonging to a token group whose total score is zero or
+ * less is returned unchanged.
+ */
+final class UnderlineFormatter implements Formatter {
+  public String highlightTerm(String originalText, TokenGroup tokenGroup) {
+    final boolean unscored = tokenGroup.getTotalScore() <= 0;
+    return unscored ? originalText : ">" + originalText + "<";
+  }
+}
Index: contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymFilter.java
===================================================================
--- contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymFilter.java (wersja 0)
+++ contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymFilter.java (wersja 0)
@@ -0,0 +1,223 @@
+package org.apache.lucene.index.synonyms;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.index.synonyms.SynonymTree.Node;
+
+/**
+ * A {@link TokenFilter} that matches the input sequence of {@link Token}s
+ * against a set of predefined synonyms (sequences of tokens) and emits a
+ * synonym phrase if a match is found.
+ *
+ * This class still uses {@link TokenFilter} API for backwards compatibility.
+ */
+@SuppressWarnings("deprecation")
+public final class SynonymFilter extends TokenFilter {
+ /**
+ * Synonym tokens have their type set to this value.
+ *
+ * @see Token#type()
+ */
+ public static final String SYNONYM_TYPE = "";
+
+ /**
+ * Cached synonym tree.
+ */
+ private final SynonymTree synonyms;
+
+ /**
+ * Current list of matching token sequences in the tree. We need this list so
+ * that we can wait for the longest match to appear.
+ */
+ private final List nodes = new ArrayList();
+
+ /*
+ *
+ */
+ private static class TokenEntry {
+ Token token;
+
+ TokenEntry previous;
+ TokenEntry next;
+ }
+
+ private final List tokens = new ArrayList();
+ private final List allTokens = new ArrayList();
+
+ /** */
+ private TokenEntry head;
+
+ /*
+ *
+ */
+ public SynonymFilter(TokenStream input, SynonymTree synonyms) {
+ super(input);
+ this.synonyms = synonyms;
+ }
+
+ /*
+ *
+ */
+ @Override
+ public Token next(Token reusableToken) throws IOException {
+ if (head != null) {
+ head = head.next;
+
+ if (head == null) return null;
+ return head.token;
+ }
+
+ head = new TokenEntry();
+ allTokens.add(head);
+
+ /*
+ * Cache all the tokens from the input. This is done to avoid many special
+ * cases and quirks (lookahead is required anyway to process certain
+ * synonyms).
+ */
+ Token t = reusableToken;
+ while ((t = input.next(t)) != null) {
+ addToken((Token) t.clone());
+ }
+
+ /*
+ * Process all tokens from the input, looking for synonym matches and adding
+ * synonyms to the linked list of tokens to be returned.
+ */
+ final int max = tokens.size();
+ for (int j = 0; j < max; j++) {
+ final TokenEntry te = tokens.get(j);
+ t = te.token;
+ final String tokenImage = new String(t.termBuffer(), 0, t.termLength());
+
+ /*
+ * Add the root node on the list so that we can start from single word
+ * synonyms of the current token.
+ */
+ nodes.add(synonyms.getRootNode());
+
+ /*
+ * Process the list of current matches and try to extend each one with the
+ * current token's text. If successful, the match remains on the list,
+ * waiting for the next token. Otherwise the set of synonyms is found for
+ * the longest matching subsequence.
+ */
+ for (int i = 0; i < nodes.size();) {
+ Node current = nodes.get(i);
+ final Node child = current.getChild(tokenImage);
+ if (child != null) {
+ /*
+ * There is a longer sequence match, descend along this edge and
+ * continue, leaving the node on the list.
+ */
+ nodes.set(i, child);
+
+ // Leave the node on the matching list and go on to the next node.
+ i++;
+ continue;
+ }
+
+ /*
+ * Remove the current node from the list. Find the first node on the
+ * path to the root that contains synonyms (longest match). Place them
+ * on the queue.
+ */
+ nodes.remove(i);
+ while (current != null && !current.hasSynonyms()) {
+ current = current.getParent();
+ }
+
+ /*
+ * For non-root nodes, there should always be a synonym set.
+ */
+ if (current != null) {
+ int synonymTokenCount = getSynonymTokenCount(current);
+
+ int startOffset = tokens.get(j - synonymTokenCount).token.startOffset();
+ int endOffset = tokens.get(j - 1).token.endOffset();;
+
+ for (String[] synonymTokens : current.getSynonyms()) {
+ int startIndex = j - synonymTokenCount;
+
+ for (String synonymToken : synonymTokens) {
+ // Add a synonym token at this position.
+ final Token tt = new Token();
+ tt.setPositionIncrement(0);
+ tt.setStartOffset(startOffset);
+ tt.setEndOffset(endOffset);
+ tt.setTermBuffer(synonymToken);
+ tt.setType(SYNONYM_TYPE);
+
+ addSynonymToken(startIndex++, tt);
+ }
+ }
+ }
+ }
+ }
+
+ return next(null);
+ }
+
+ /** */
+ private void addSynonymToken(int position, Token t) {
+ if (position >= tokens.size()) {
+ t.setPositionIncrement(1);
+ addToken(t);
+ } else {
+ TokenEntry te = new TokenEntry();
+ te.token = t;
+
+ TokenEntry parent = tokens.get(position);
+ te.previous = parent;
+ te.next = parent.next;
+ if (te.next != null) te.next.previous = te;
+ parent.next = te;
+
+ allTokens.add(te);
+ }
+ }
+
+ /** */
+ private void addToken(Token t) {
+ TokenEntry te = new TokenEntry();
+ te.token = t;
+
+ te.previous = (tokens.size() == 0 ? head : tokens.get(tokens.size() - 1));
+ while (te.previous.next != null) te.previous = te.previous.next;
+ te.previous.next = te;
+ te.next = null;
+
+ allTokens.add(te);
+ tokens.add(te);
+ }
+
+ /**
+ * Get the count of synonym tokens. This could be cached in the {@link Node}.
+ */
+ private int getSynonymTokenCount(Node current) {
+ int count = 0;
+ while ((current = current.getParent()) != null)
+ count++;
+ return count;
+ }
+}
Index: contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymTree.java
===================================================================
--- contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymTree.java (wersja 0)
+++ contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymTree.java (wersja 0)
@@ -0,0 +1,158 @@
+package org.apache.lucene.index.synonyms;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+
+/**
+ * In-memory, tree-like representation of synonyms. It makes sense to create a
+ * single {@link SynonymTree} and reuse it further on for all
+ * {@link SynonymFilter} that use it (instances of this class are immutable and
+ * thread-safe).
+ */
+public final class SynonymTree {
+  /**
+   * Representation of internal tree state. Each edge is labeled with a source
+   * token image; a node reached by a full source sequence carries its synonyms.
+   */
+  static class Node {
+    /**
+     * Parent of this node or null if root.
+     */
+    private final Node parent;
+
+    /**
+     * A set of synonyms at this node or null if there are none.
+     */
+    private String[][] synonyms;
+
+    /*
+     * Children of this node, keyed by token image. We could make a single
+     * hashmap here, indexed by (Node, String) pair to save some space.
+     */
+    private final Map<String, Node> children = new HashMap<String, Node>();
+
+    Node(Node parent) {
+      this.parent = parent;
+    }
+
+    /** @return true if a set of synonyms has been assigned to this node. */
+    public boolean hasSynonyms() {
+      return synonyms != null;
+    }
+
+    /** @return the synonyms assigned to this node or null if none. */
+    public String[][] getSynonyms() {
+      return synonyms;
+    }
+
+    /** @return the parent node or null for the root. */
+    public Node getParent() {
+      return parent;
+    }
+
+    /** @return the child reached by the given token image or null if none. */
+    public Node getChild(String tokenImage) {
+      return children.get(tokenImage);
+    }
+
+    /** Return the child for the given term, creating it if necessary. */
+    Node getOrCreateChild(String term) {
+      Node child = children.get(term);
+      if (child == null) {
+        child = new Node(this);
+        children.put(term, child);
+      }
+      return child;
+    }
+
+    void setSynonyms(String[][] value) {
+      this.synonyms = value;
+    }
+  }
+
+  /**
+   * Root node of the tree or null if there are no synonyms at all.
+   */
+  private final Node root;
+
+  /**
+   * Create a {@link SynonymTree}.
+   *
+   * The constructor requires an iterator over a set of unique keys and
+   * associated values. Each key is a non-empty sequence of source token
+   * images. Each value is a sequence of sequences of token images of
+   * synonyms. Example:
+   *
+   * <pre>
+   * final Map&lt;String[], String[][]&gt; synonyms = new HashMap&lt;String[], String[][]&gt;();
+   * synonyms.put(
+   *   new String [] {"new", "york"},
+   *   new String [][] {
+   *     {"big", "apple"},
+   *     {"new", "york", "city"}});
+   * synonyms.put(
+   *   new String [] {"national", "basketball", "association"},
+   *   new String [][] {
+   *     {"nba"}});
+   *
+   * SynonymTree t = new SynonymTree(synonyms.entrySet().iterator());
+   * </pre>
+   *
+   * <p>Note that there is no bidirectional relation here - in the
+   * above example <code>nba</code> will not be expanded.
+   *
+   * @throws IllegalArgumentException if a key is empty or if the same
+   *         sequence of source tokens appears more than once.
+   */
+  public SynonymTree(Iterator<Map.Entry<String[], String[][]>> synonyms) {
+    this.root = createTree(synonyms);
+  }
+
+  /**
+   * @return Return true if the set of synonyms is empty.
+   */
+  public boolean isEmpty() {
+    return root == null;
+  }
+
+  /**
+   * Return the root node for the set of synonyms.
+   *
+   * @throws IllegalStateException if the set of synonyms is empty; check
+   *         {@link #isEmpty()} first.
+   */
+  Node getRootNode() {
+    if (root == null)
+      throw new IllegalStateException();
+    return root;
+  }
+
+  /**
+   * Build internal lookup structures.
+   *
+   * @return the root of the tree or null if the iterator has no elements.
+   */
+  private Node createTree(Iterator<Map.Entry<String[], String[][]>> synonyms) {
+    if (!synonyms.hasNext()) {
+      return null;
+    }
+
+    final Node root = new Node(null);
+    while (synonyms.hasNext()) {
+      final Map.Entry<String[], String[][]> e = synonyms.next();
+      final String[] pathTerms = e.getKey();
+
+      // An empty key would attach synonyms to the root, i.e. to a match of
+      // zero tokens, which has no position or offsets in a token stream.
+      if (pathTerms.length == 0) {
+        throw new IllegalArgumentException(
+            "Synonym key must contain at least one token.");
+      }
+
+      // Descend along the key's terms, creating missing nodes on the way.
+      Node c = root;
+      for (String term : pathTerms) {
+        c = c.getOrCreateChild(term);
+      }
+
+      if (c.getSynonyms() != null) {
+        throw new IllegalArgumentException(
+            "Synonym node already has a set of synonyms: "
+                + Arrays.toString(pathTerms));
+      }
+
+      c.setSynonyms(e.getValue());
+    }
+    return root;
+  }
+}
Index: contrib/synonyms/src/java/org/apache/lucene/index/synonyms/package.html
===================================================================
--- contrib/synonyms/src/java/org/apache/lucene/index/synonyms/package.html (wersja 0)
+++ contrib/synonyms/src/java/org/apache/lucene/index/synonyms/package.html (wersja 0)
@@ -0,0 +1,17 @@
+
+TokenFilter adding single and multi-word synonyms to the token stream.
Index: contrib/synonyms/build.xml
===================================================================
--- contrib/synonyms/build.xml (wersja 0)
+++ contrib/synonyms/build.xml (wersja 0)
@@ -0,0 +1,27 @@
+
+
+
+
+
+
+
+ TokenFilter adding single and multi-word synonyms to the token stream.
+
+
+
+