Index: contrib/synonyms/pom.xml.template =================================================================== --- contrib/synonyms/pom.xml.template (wersja 0) +++ contrib/synonyms/pom.xml.template (wersja 0) @@ -0,0 +1,38 @@ + + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-synonyms + Lucene Synonyms + @version@ + + TokenFilter adding single and multi-word synonyms to the token stream. + + jar + Index: contrib/synonyms/src/test/org/apache/lucene/index/synonyms/SynonymFilterTest.java =================================================================== --- contrib/synonyms/src/test/org/apache/lucene/index/synonyms/SynonymFilterTest.java (wersja 0) +++ contrib/synonyms/src/test/org/apache/lucene/index/synonyms/SynonymFilterTest.java (wersja 0) @@ -0,0 +1,232 @@ +package org.apache.lucene.index.synonyms; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.util.*; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.*; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.*; +import org.apache.lucene.index.*; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.*; +import org.apache.lucene.search.highlight.*; +import org.apache.lucene.store.RAMDirectory; + +/** + * Test {@link SynonymFilter}. + */ +@SuppressWarnings("unused") +public class SynonymFilterTest extends TestCase { + /* + * Static set of synonyms. + */ + private final static SynonymTree synonymTree; + static { + final Map synonyms = new HashMap(); + synonyms.put( + new String[] + { "new", "york" }, + new String[][] { + { "big", "apple" }, + { "new", "york", "city" } }); + synonyms.put( + new String[] { + "national", "basketball", "association" }, + new String[][] { + { "nba" } }); + + synonymTree = new SynonymTree(synonyms.entrySet().iterator()); + } + + private IndexReader ir; + private IndexSearcher is; + + /* + * Close any opened stuff. + */ + public void tearDown() { + if (is != null) { + try { + is.close(); + } catch (IOException e) { + // Ignore. + } + is = null; + } + } + + /** + * Test phrase queries. + */ + public void testPhraseQueries() throws IOException { + buildIndex("pre new york post"); + + // Display terms indexed at all positions. + // TermPositionVector tfv = (TermPositionVector) ir.getTermFreqVector(0, "content"); + // System.out.println(toString(tfv)); + + // Check phrase queries for synonyms only. + assertEquals(1, search("\"big apple\"").totalHits); + assertEquals(1, search("\"new york city\"").totalHits); + + // Check phrase queries overlapping synonyms on the left and right. 
+ assertEquals(1, search("\"pre big apple\"").totalHits); + assertEquals(1, search("\"pre new york city\"").totalHits); + assertEquals(1, search("\"big apple post\"").totalHits); + + // TODO: This will not work for synonyms of different length than the original + // token sequence. I doubt the method implemented in this filter can be + // bent to supporting such a use case. + // assertEquals(1, search("\"new york city post\"").totalHits); + } + + /** + * Test synonym positions (using the highlighter). + */ + public void testHighlightPositions() throws IOException { + final String text = "pre new york post"; + buildIndex(text); + + assertEquals("pre >new york< post", highlight("\"big apple\"", text)); + + // TODO: This example highlights differently than expected. In the "new york city" + // phrase, all tokens point to the same offset range (full synonym phrase). The + // highlighter returns duplicated ">new york<>new york<" for some reason. + // assertEquals("pre >new york< post", highlight("\"new york city\"", text)); + + // TODO: This example passes, but is awkward. For every word in the + // synonym, we match the entire original synonym sequence. This causes weird + // side-effect of highlighting the entire phrase when part of the synonym + // is matched. 
+ assertEquals("pre >new york< post", highlight("big", text)); + } + + /* + * + */ + private String highlight(String query, String text) throws IOException { + Highlighter highlighter = new Highlighter( + new UnderlineFormatter(), + new QueryScorer(createQuery(query))); + highlighter.setTextFragmenter(new NullFragmenter()); + + try { + return highlighter.getBestFragment(createAnalyzer(), "content", text); + } catch (InvalidTokenOffsetsException e) { + throw new RuntimeException(e); + } + } + + /* + * + */ + private TopDocs search(String query) throws IOException { + TopDocs td = is.search(createQuery(query), 10); + return td; + } + + /* + * + */ + private Query createQuery(String query) throws IOException { + try { + QueryParser qp = new QueryParser("content", createAnalyzer()); + return qp.parse(query); + } catch (ParseException e) { + throw new RuntimeException(e); + } + } + + /* + * + */ + private String toString(TermPositionVector tfv) { + int maxPosition = -1; + for (int index = 0; index < tfv.size(); index++) + for (int position : tfv.getTermPositions(index)) + maxPosition = Math.max(maxPosition, position); + + int [] positionCount = new int [maxPosition + 1]; + for (int index = 0; index < tfv.size(); index++) + for (int position : tfv.getTermPositions(index)) + positionCount[position]++; + + String [][] tokens = new String [maxPosition + 1][]; + for (int i = 0; i <= maxPosition; i++) { + tokens[i] = new String[positionCount[i]]; + } + + String [] terms = tfv.getTerms(); + for (int index = 0; index < tfv.size(); index++) + { + for (int position : tfv.getTermPositions(index)) + tokens[position][--positionCount[position]] = terms[index]; + } + + StringBuilder builder = new StringBuilder(); + for (int index = 0; index <= maxPosition; index++) { + String [] t = tokens[index]; + Arrays.sort(t); + builder.append(":" + index + "="); + builder.append(Arrays.toString(t)); + } + + return builder.toString(); + } + + /* + * + */ + private void buildIndex(String... 
phrases) throws IOException { + RAMDirectory directory = new RAMDirectory(); + Analyzer analyzer = createAnalyzer(); + IndexWriter writer = new IndexWriter(directory, analyzer, + MaxFieldLength.UNLIMITED); + + for (String phrase : phrases) { + Document doc = new Document(); + doc.add(new Field("content", phrase, Store.YES, Index.ANALYZED, + TermVector.WITH_POSITIONS_OFFSETS)); + writer.addDocument(doc); + } + writer.commit(); + writer.close(); + + this.is = new IndexSearcher(directory); + this.ir = is.getIndexReader(); + } + + /* + * + */ + private Analyzer createAnalyzer() { + return new Analyzer() { + public TokenStream tokenStream(String fieldName, Reader reader) { + return new SynonymFilter(new WhitespaceTokenizer(reader), synonymTree); + } + }; + } +} Index: contrib/synonyms/src/test/org/apache/lucene/index/synonyms/UnderlineFormatter.java =================================================================== --- contrib/synonyms/src/test/org/apache/lucene/index/synonyms/UnderlineFormatter.java (wersja 0) +++ contrib/synonyms/src/test/org/apache/lucene/index/synonyms/UnderlineFormatter.java (wersja 0) @@ -0,0 +1,17 @@ +package org.apache.lucene.index.synonyms; + +import org.apache.lucene.search.highlight.Formatter; +import org.apache.lucene.search.highlight.TokenGroup; + +/** + * + */ +final class UnderlineFormatter implements Formatter { + public String highlightTerm(String originalText, TokenGroup tokenGroup) { + if (tokenGroup.getTotalScore() <= 0) { + return originalText; + } + + return ">" + originalText + "<"; + } +} Index: contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymFilter.java =================================================================== --- contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymFilter.java (wersja 0) +++ contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymFilter.java (wersja 0) @@ -0,0 +1,223 @@ +package org.apache.lucene.index.synonyms; + +/** + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.*; +import org.apache.lucene.index.synonyms.SynonymTree.Node; + +/** + * A {@link TokenFilter} that matches the input sequence of {@link Token}s + * against a set of predefined synonyms (sequences of tokens) and emits a + * synonym phrase if a match is found. + *

+ * This class still uses {@link TokenFilter} API for backwards compatibility. + */ +@SuppressWarnings("deprecation") +public final class SynonymFilter extends TokenFilter { + /** + * Synonym tokens have their type set to this value. + * + * @see Token#type() + */ + public static final String SYNONYM_TYPE = ""; + + /** + * Cached synonym tree. + */ + private final SynonymTree synonyms; + + /** + * Current list of matching token sequences in the tree. We need this list so + * that we can wait for the longest match to appear. + */ + private final List nodes = new ArrayList(); + + /* + * + */ + private static class TokenEntry { + Token token; + + TokenEntry previous; + TokenEntry next; + } + + private final List tokens = new ArrayList(); + private final List allTokens = new ArrayList(); + + /** */ + private TokenEntry head; + + /* + * + */ + public SynonymFilter(TokenStream input, SynonymTree synonyms) { + super(input); + this.synonyms = synonyms; + } + + /* + * + */ + @Override + public Token next(Token reusableToken) throws IOException { + if (head != null) { + head = head.next; + + if (head == null) return null; + return head.token; + } + + head = new TokenEntry(); + allTokens.add(head); + + /* + * Cache all the tokens from the input. This is done to avoid many special + * cases and quirks (lookahead is required anyway to process certain + * synonyms). + */ + Token t = reusableToken; + while ((t = input.next(t)) != null) { + addToken((Token) t.clone()); + } + + /* + * Process all tokens from the input, looking for synonym matches and adding + * synonyms to the linked list of tokens to be returned. + */ + final int max = tokens.size(); + for (int j = 0; j < max; j++) { + final TokenEntry te = tokens.get(j); + t = te.token; + final String tokenImage = new String(t.termBuffer(), 0, t.termLength()); + + /* + * Add the root node on the list so that we can start from single word + * synonyms of the current token. 
+ */ + nodes.add(synonyms.getRootNode()); + + /* + * Process the list of current matches and try to extend each one with the + * current token's text. If successful, the match remains on the list, + * waiting for the next token. Otherwise the set of synonyms is found for + * the longest matching subsequence. + */ + for (int i = 0; i < nodes.size();) { + Node current = nodes.get(i); + final Node child = current.getChild(tokenImage); + if (child != null) { + /* + * There is a longer sequence match, descend along this edge and + * continue, leaving the node on the list. + */ + nodes.set(i, child); + + // Leave the node on the matching list and go on to the next node. + i++; + continue; + } + + /* + * Remove the current node from the list. Find the first node on the + * path to the root that contains synonyms (longest match). Place them + * on the queue. + */ + nodes.remove(i); + while (current != null && !current.hasSynonyms()) { + current = current.getParent(); + } + + /* + * For non-root nodes, there should always be a synonym set. + */ + if (current != null) { + int synonymTokenCount = getSynonymTokenCount(current); + + int startOffset = tokens.get(j - synonymTokenCount).token.startOffset(); + int endOffset = tokens.get(j - 1).token.endOffset();; + + for (String[] synonymTokens : current.getSynonyms()) { + int startIndex = j - synonymTokenCount; + + for (String synonymToken : synonymTokens) { + // Add a synonym token at this position. 
+ final Token tt = new Token(); + tt.setPositionIncrement(0); + tt.setStartOffset(startOffset); + tt.setEndOffset(endOffset); + tt.setTermBuffer(synonymToken); + tt.setType(SYNONYM_TYPE); + + addSynonymToken(startIndex++, tt); + } + } + } + } + } + + return next(null); + } + + /** */ + private void addSynonymToken(int position, Token t) { + if (position >= tokens.size()) { + t.setPositionIncrement(1); + addToken(t); + } else { + TokenEntry te = new TokenEntry(); + te.token = t; + + TokenEntry parent = tokens.get(position); + te.previous = parent; + te.next = parent.next; + if (te.next != null) te.next.previous = te; + parent.next = te; + + allTokens.add(te); + } + } + + /** */ + private void addToken(Token t) { + TokenEntry te = new TokenEntry(); + te.token = t; + + te.previous = (tokens.size() == 0 ? head : tokens.get(tokens.size() - 1)); + while (te.previous.next != null) te.previous = te.previous.next; + te.previous.next = te; + te.next = null; + + allTokens.add(te); + tokens.add(te); + } + + /** + * Get the count of synonym tokens. This could be cached in the {@link Node}. + */ + private int getSynonymTokenCount(Node current) { + int count = 0; + while ((current = current.getParent()) != null) + count++; + return count; + } +} Index: contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymTree.java =================================================================== --- contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymTree.java (wersja 0) +++ contrib/synonyms/src/java/org/apache/lucene/index/synonyms/SynonymTree.java (wersja 0) @@ -0,0 +1,158 @@ +package org.apache.lucene.index.synonyms; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.*; + +/** + * In-memory, tree-like representation of synonyms. It makes sense to create a + * single {@link SynonymTree} and reuse it further on for all + * {@link SynonymFilter} that use it (instances of this class are immutable and + * thread-safe). + */ +public final class SynonymTree { + /** + * Representation of internal tree state. + */ + static class Node { + /** + * Parent of this node or null if root. + */ + private final Node parent; + + /** + * A set of synonyms at this node. + */ + private String[][] synonyms; + + /* + * We could make a single hashmap here, indexed by (Node, String) pair to + * save some space. + */ + private final HashMap children = new HashMap(); + + Node(Node parent) { + this.parent = parent; + } + + public boolean hasSynonyms() { + return synonyms != null; + } + + public String[][] getSynonyms() { + return synonyms; + } + + public Node getParent() { + return parent; + } + + public Node getChild(String tokenImage) { + return children.get(tokenImage); + } + + Node getOrCreateChild(String term) { + Node child = children.get(term); + if (child == null) { + child = new Node(this); + children.put(term, child); + } + return child; + } + + void setSynonyms(String[][] value) { + this.synonyms = value; + } + } + + /** + * Root node of the tree. + */ + private final Node root; + + /** + * Create a {@link SynonymTree}. 
+ * + * The constructor requires an iterator over a set of unique keys and + * associated values. Each key is a sequence of source token images. Each + * value is a sequence of sequences of token images of synonyms. Example: + * <pre>

+   * final Map&lt;String[], String[][]&gt; synonyms = new HashMap&lt;String[], String[][]&gt;();
+   * synonyms.put(
+   *   new String [] {"new", "york"}, 
+   *   new String [][] {
+   *               {"big", "apple"}, 
+   *               {"new", "york", "city"}});
+   * synonyms.put(
+   *   new String [] {"national", "basketball", "association"},
+   *   new String [][] {
+   *               {"nba"}});
+   * 
+   * SynonymTree t = new SynonymTree(synonyms.entrySet().iterator());
+   * </pre>
Note that there is no bidirectional relation here - in the + * above example nba will not be expanded. + */ + public SynonymTree(Iterator> synonyms) { + this.root = createTree(synonyms); + } + + /** + * @return Return true if the set of synonyms is empty. + */ + public boolean isEmpty() { + return root == null; + } + + /** + * Return the root node for the set of synonyms. + */ + Node getRootNode() { + if (root == null) + throw new IllegalStateException(); + return root; + } + + /** + * Build internal lookup structures. + */ + private Node createTree(Iterator> synonyms) { + if (!synonyms.hasNext()) { + return null; + } + + final Node root = new Node(null); + while (synonyms.hasNext()) { + final Map.Entry e = synonyms.next(); + final String[] pathTerms = e.getKey(); + + Node c = root; + for (String term : pathTerms) { + c = c.getOrCreateChild(term); + } + + if (c.getSynonyms() != null) { + throw new IllegalArgumentException( + "Synonym node already has a set of synonyms: " + + Arrays.toString(pathTerms)); + } + + c.setSynonyms(e.getValue()); + } + return root; + } +} Index: contrib/synonyms/src/java/org/apache/lucene/index/synonyms/package.html =================================================================== --- contrib/synonyms/src/java/org/apache/lucene/index/synonyms/package.html (wersja 0) +++ contrib/synonyms/src/java/org/apache/lucene/index/synonyms/package.html (wersja 0) @@ -0,0 +1,17 @@ + +TokenFilter adding single and multi-word synonyms to the token stream. Index: contrib/synonyms/build.xml =================================================================== --- contrib/synonyms/build.xml (wersja 0) +++ contrib/synonyms/build.xml (wersja 0) @@ -0,0 +1,27 @@ + + + + + + + + TokenFilter adding single and multi-word synonyms to the token stream. + + + +