Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.search.suggest.analyzing;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.IOException;\nimport java.io.Reader;\nimport java.util.ArrayList;\nimport java.util.Arrays;\nimport java.util.Collections;\nimport java.util.Comparator;\nimport java.util.HashSet;\nimport java.util.List;\nimport java.util.Set;\nimport java.util.TreeSet;\n\nimport org.apache.lucene.analysis.Analyzer;\nimport org.apache.lucene.analysis.CannedTokenStream;\nimport org.apache.lucene.analysis.MockAnalyzer;\nimport org.apache.lucene.analysis.MockTokenFilter;\nimport org.apache.lucene.analysis.MockTokenizer;\nimport org.apache.lucene.analysis.Token;\nimport org.apache.lucene.analysis.TokenFilter;\nimport org.apache.lucene.analysis.TokenStream;\nimport org.apache.lucene.analysis.TokenStreamToAutomaton;\nimport org.apache.lucene.analysis.Tokenizer;\nimport org.apache.lucene.analysis.tokenattributes.CharTermAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;\nimport org.apache.lucene.search.suggest.Lookup.LookupResult;\nimport org.apache.lucene.search.suggest.TermFreq;\nimport org.apache.lucene.search.suggest.TermFreqArrayIterator;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.IntsRef;\nimport org.apache.lucene.util.LuceneTestCase;\nimport org.apache.lucene.util._TestUtil;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.fst.Util;\n\npublic class FuzzySuggesterTest extends LuceneTestCase {\n \n public void testRandomEdits() throws IOException {\n List<TermFreq> keys = new ArrayList<TermFreq>();\n int numTerms = atLeast(100);\n for (int i = 0; i < numTerms; i++) {\n keys.add(new TermFreq(\"boo\" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));\n }\n keys.add(new TermFreq(\"foo bar boo far\", 12));\n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(keys));\n int numIters = atLeast(10);\n for (int i = 0; i < numIters; i++) {\n String addRandomEdit = addRandomEdit(\"foo bar boo\", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX);\n List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);\n assertEquals(addRandomEdit, 1, results.size());\n assertEquals(\"foo bar boo far\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F); \n }\n }\n \n /** this is basically the WFST 
test ported to KeywordAnalyzer. so it acts the same */\n public void testKeyword() throws Exception {\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"foo\", 50),\n new TermFreq(\"bar\", 10),\n new TermFreq(\"barbar\", 12),\n new TermFreq(\"barbara\", 6)\n };\n \n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(keys));\n \n List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(\"bariar\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbr\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbara\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbara\", results.get(0).key.toString());\n assertEquals(6, results.get(0).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbar\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"barbara\", results.get(1).key.toString());\n assertEquals(6, results.get(1).value, 0.01F);\n \n results = suggester.lookup(_TestUtil.stringToCharSequence(\"barbaa\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"barbara\", results.get(1).key.toString());\n assertEquals(6, results.get(1).value, 0.01F);\n \n // top N of 2, but only foo is available\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"f\", random()), false, 2);\n assertEquals(1, results.size());\n assertEquals(\"foo\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n \n // top N of 1 for 'bar': we return this even though\n // barbar is higher because exactFirst is enabled:\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"bar\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"bar\", results.get(0).key.toString());\n assertEquals(10, results.get(0).value, 0.01F);\n \n // top N of 2 for 'b'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"b\", random()), false, 2);\n assertEquals(2, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n \n // top N of 3 for 'ba'\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ba\", random()), false, 3);\n assertEquals(3, results.size());\n assertEquals(\"barbar\", results.get(0).key.toString());\n assertEquals(12, results.get(0).value, 0.01F);\n assertEquals(\"bar\", results.get(1).key.toString());\n assertEquals(10, results.get(1).value, 0.01F);\n assertEquals(\"barbara\", results.get(2).key.toString());\n assertEquals(6, results.get(2).value, 0.01F);\n }\n \n /**\n * basic \"standardanalyzer\" test with stopword removal\n */\n public void testStandard() throws Exception {\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"the ghost of christmas past\", 50),\n };\n \n Analyzer standard = new 
MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);\n FuzzySuggester suggester = new FuzzySuggester(standard);\n suggester.setPreservePositionIncrements(false);\n suggester.build(new TermFreqArrayIterator(keys));\n \n List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(\"the ghost of chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n\n // omit the 'the' since it's a stopword; it's suggested anyway\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ghost of chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n\n // omit the 'the' and 'of' since they are stopwords; it's suggested anyway\n results = suggester.lookup(_TestUtil.stringToCharSequence(\"ghost chris\", random()), false, 1);\n assertEquals(1, results.size());\n assertEquals(\"the ghost of christmas past\", results.get(0).key.toString());\n assertEquals(50, results.get(0).value, 0.01F);\n }\n\n public void testNoSeps() throws Exception {\n TermFreq[] keys = new TermFreq[] {\n new TermFreq(\"ab cd\", 0),\n new TermFreq(\"abcd\", 1),\n };\n\n int options = 0;\n\n Analyzer a = new MockAnalyzer(random());\n FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);\n suggester.build(new TermFreqArrayIterator(keys));\n // TODO: would be nice if \"ab \" would allow the test to\n // pass, and more generally if the analyzer can know\n // that the user's current query has ended at a word, \n // but, analyzers don't produce SEP tokens!\n List<LookupResult> r = suggester.lookup(_TestUtil.stringToCharSequence(\"ab c\", random()), false, 2);\n assertEquals(2, r.size());\n\n // With no PRESERVE_SEPS specified, \"ab c\" should also\n // complete to \"abcd\", which has higher weight so should\n // appear first:\n assertEquals(\"abcd\", r.get(0).key.toString());\n }\n\n public void testGraphDups() throws Exception {\n\n final Analyzer analyzer = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedTokenStream(new Token[] {\n token(\"wifi\",1,1),\n token(\"hotspot\",0,2),\n token(\"network\",1,1),\n token(\"is\",1,1),\n token(\"slow\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"wi\",1,1),\n token(\"hotspot\",0,3),\n token(\"fi\",1,1),\n token(\"network\",1,1),\n token(\"is\",1,1),\n token(\"fast\",1,1)\n\n }),\n new CannedTokenStream(new Token[] {\n token(\"wifi\",1,1),\n token(\"hotspot\",0,2),\n token(\"network\",1,1)\n }),\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"wifi network is slow\", 50),\n new TermFreq(\"wi fi network is fast\", 10),\n };\n FuzzySuggester suggester = new FuzzySuggester(analyzer);\n suggester.build(new TermFreqArrayIterator(keys));\n \n List<LookupResult> results = suggester.lookup(\"wifi network\", false, 10);\n if 
(VERBOSE) {\n System.out.println(\"Results: \" + results);\n }\n assertEquals(2, results.size());\n assertEquals(\"wifi network is slow\", results.get(0).key);\n assertEquals(50, results.get(0).value);\n assertEquals(\"wi fi network is fast\", results.get(1).key);\n assertEquals(10, results.get(1).value);\n }\n\n public void testEmpty() throws Exception {\n FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));\n suggester.build(new TermFreqArrayIterator(new TermFreq[0]));\n\n List<LookupResult> result = suggester.lookup(\"a\", false, 20);\n assertTrue(result.isEmpty());\n }\n\n public void testInputPathRequired() throws Exception {\n\n // SynonymMap.Builder b = new SynonymMap.Builder(false);\n // b.add(new CharsRef(\"ab\"), new CharsRef(\"ba\"), true);\n // final SynonymMap map = b.build();\n\n // The Analyzer below mimics the functionality of the SynonymAnalyzer\n // using the above map, so that the suggest module does not need a dependency on the \n // synonym module \n\n final Analyzer analyzer = new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n int tokenStreamCounter = 0;\n final TokenStream[] tokenStreams = new TokenStream[] {\n new CannedTokenStream(new Token[] {\n token(\"ab\",1,1),\n token(\"ba\",0,1),\n token(\"xc\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"ba\",1,1), \n token(\"xd\",1,1)\n }),\n new CannedTokenStream(new Token[] {\n token(\"ab\",1,1),\n token(\"ba\",0,1),\n token(\"x\",1,1)\n })\n };\n\n @Override\n public TokenStream getTokenStream() {\n TokenStream result = tokenStreams[tokenStreamCounter];\n tokenStreamCounter++;\n return result;\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n\n TermFreq keys[] = new TermFreq[] {\n new TermFreq(\"ab xc\", 50),\n new TermFreq(\"ba xd\", 50),\n };\n FuzzySuggester suggester = new FuzzySuggester(analyzer);\n suggester.build(new TermFreqArrayIterator(keys));\n List<LookupResult> results = suggester.lookup(\"ab x\", false, 1);\n assertTrue(results.size() == 1);\n }\n\n private static Token token(String term, int posInc, int posLength) {\n final Token t = new Token(term, 0, 0);\n t.setPositionIncrement(posInc);\n t.setPositionLength(posLength);\n return t;\n }\n\n /*\n private void printTokens(final Analyzer analyzer, String input) throws IOException {\n System.out.println(\"Tokens for \" + input);\n TokenStream ts = analyzer.tokenStream(\"\", new StringReader(input));\n ts.reset();\n final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);\n final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);\n final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);\n \n while(ts.incrementToken()) {\n termBytesAtt.fillBytesRef();\n System.out.println(String.format(\"%s,%s,%s\", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength())); \n }\n ts.end();\n ts.close();\n } \n */ \n\n private final Analyzer getUnusualAnalyzer() {\n return new Analyzer() {\n @Override\n protected TokenStreamComponents createComponents(String fieldName, Reader reader) {\n Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);\n \n return new TokenStreamComponents(tokenizer) {\n\n int 
count;\n\n @Override\n public TokenStream getTokenStream() {\n // 4th time we are called, return tokens a b,\n // else just a:\n if (count++ != 3) {\n return new CannedTokenStream(new Token[] {\n token(\"a\", 1, 1),\n });\n } else {\n // After that \"a b\":\n return new CannedTokenStream(new Token[] {\n token(\"a\", 1, 1),\n token(\"b\", 1, 1),\n });\n }\n }\n \n @Override\n protected void setReader(final Reader reader) throws IOException {\n }\n };\n }\n };\n }\n\n public void testExactFirst() throws Exception {\n\n Analyzer a = getUnusualAnalyzer();\n FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"x y\", 1),\n new TermFreq(\"x y z\", 3),\n new TermFreq(\"x\", 2),\n new TermFreq(\"z z z\", 20),\n }));\n\n //System.out.println(\"ALL: \" + suggester.lookup(\"x y\", false, 6));\n\n for(int topN=1;topN<6;topN++) {\n List<LookupResult> results = suggester.lookup(\"x y\", false, topN);\n //System.out.println(\"topN=\" + topN + \" \" + results);\n\n assertEquals(Math.min(topN, 4), results.size());\n\n assertEquals(\"x y\", results.get(0).key);\n assertEquals(1, results.get(0).value);\n\n if (topN > 1) {\n assertEquals(\"z z z\", results.get(1).key);\n assertEquals(20, results.get(1).value);\n\n if (topN > 2) {\n assertEquals(\"x y z\", results.get(2).key);\n assertEquals(3, results.get(2).value);\n\n if (topN > 3) {\n assertEquals(\"x\", results.get(3).key);\n assertEquals(2, results.get(3).value);\n }\n }\n }\n }\n }\n\n public void testNonExactFirst() throws Exception {\n\n Analyzer a = getUnusualAnalyzer();\n FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);\n\n suggester.build(new TermFreqArrayIterator(new TermFreq[] {\n new TermFreq(\"x y\", 1),\n new TermFreq(\"x y z\", 3),\n new TermFreq(\"x\", 2),\n new TermFreq(\"z z z\", 20),\n }));\n\n for(int topN=1;topN<6;topN++) {\n List<LookupResult> results = suggester.lookup(\"p\", false, topN);\n\n assertEquals(Math.min(topN, 4), results.size());\n\n assertEquals(\"z z z\", results.get(0).key);\n assertEquals(20, results.get(0).value);\n\n if (topN > 1) {\n assertEquals(\"x y z\", results.get(1).key);\n assertEquals(3, results.get(1).value);\n\n if (topN > 2) {\n assertEquals(\"x\", results.get(2).key);\n assertEquals(2, results.get(2).value);\n \n if (topN > 3) {\n assertEquals(\"x y\", results.get(3).key);\n assertEquals(1, results.get(3).value);\n }\n }\n }\n }\n }\n \n // Holds surface form separately:\n private static class TermFreq2 implements Comparable<TermFreq2> {\n public final String surfaceForm;\n public final String analyzedForm;\n public final long weight;\n\n public TermFreq2(String surfaceForm, String analyzedForm, long weight) {\n this.surfaceForm = surfaceForm;\n this.analyzedForm = analyzedForm;\n this.weight = weight;\n }\n\n @Override\n public int compareTo(TermFreq2 other) {\n int cmp = analyzedForm.compareTo(other.analyzedForm);\n if (cmp != 0) {\n return cmp;\n } else if (weight > other.weight) {\n return -1;\n } else if (weight < other.weight) {\n return 1;\n } else {\n assert false;\n return 0;\n }\n }\n }\n\n static boolean isStopChar(char ch, int numStopChars) {\n //System.out.println(\"IS? 
\" + ch + \": \" + (ch - 'a') + \": \" + ((ch - 'a') < numStopChars));\n return (ch - 'a') < numStopChars;\n }\n\n // Like StopFilter:\n private static class TokenEater extends TokenFilter {\n private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);\n private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);\n private final int numStopChars;\n private final boolean preserveHoles;\n private boolean first;\n\n public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) {\n super(in);\n this.preserveHoles = preserveHoles;\n this.numStopChars = numStopChars;\n }\n\n @Override\n public void reset() throws IOException {\n super.reset();\n first = true;\n }\n\n @Override\n public final boolean incrementToken() throws IOException {\n int skippedPositions = 0;\n while (input.incrementToken()) {\n if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) {\n int posInc = posIncrAtt.getPositionIncrement() + skippedPositions;\n if (first) {\n if (posInc == 0) {\n // first token having posinc=0 is illegal.\n posInc = 1;\n }\n first = false;\n }\n posIncrAtt.setPositionIncrement(posInc);\n //System.out.println(\"RETURN term=\" + termAtt + \" numStopChars=\" + numStopChars);\n return true;\n }\n if (preserveHoles) {\n skippedPositions += posIncrAtt.getPositionIncrement();\n }\n }\n\n return false;\n }\n }\n\n private static class MockTokenEatingAnalyzer extends Analyzer {\n private int numStopChars;\n private boolean preserveHoles;\n\n public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {\n this.preserveHoles = preserveHoles;\n this.numStopChars = numStopChars;\n }\n\n @Override\n public TokenStreamComponents createComponents(String fieldName, Reader reader) {\n MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);\n tokenizer.setEnableChecks(true);\n TokenStream next;\n if (numStopChars != 0) {\n next = new TokenEater(preserveHoles, tokenizer, numStopChars);\n } else {\n next = tokenizer;\n }\n return new TokenStreamComponents(tokenizer, next);\n }\n }\n\n public void testRandom() throws Exception {\n\n int numQueries = atLeast(100);\n \n final List slowCompletor = new ArrayList();\n final TreeSet allPrefixes = new TreeSet();\n final Set seen = new HashSet();\n \n TermFreq[] keys = new TermFreq[numQueries];\n\n boolean preserveSep = random().nextBoolean();\n\n final int numStopChars = random().nextInt(10);\n final boolean preserveHoles = random().nextBoolean();\n\n if (VERBOSE) {\n System.out.println(\"TEST: \" + numQueries + \" words; preserveSep=\" + preserveSep + \" numStopChars=\" + numStopChars + \" preserveHoles=\" + preserveHoles);\n }\n \n for (int i = 0; i < numQueries; i++) {\n int numTokens = _TestUtil.nextInt(random(), 1, 4);\n String key;\n String analyzedKey;\n while(true) {\n key = \"\";\n analyzedKey = \"\";\n boolean lastRemoved = false;\n for(int token=0;token < numTokens;token++) {\n String s;\n while (true) {\n // TODO: would be nice to fix this slowCompletor/comparator to\n // use full range, but we might lose some coverage too...\n s = _TestUtil.randomSimpleString(random());\n if (s.length() > 0) {\n if (token > 0) {\n key += \" \";\n }\n if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != ' ') {\n analyzedKey += \" \";\n }\n key += s;\n if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {\n if (preserveSep && preserveHoles) {\n analyzedKey 
+= '\\u0000';\n }\n lastRemoved = true;\n } else {\n analyzedKey += s;\n lastRemoved = false;\n }\n break;\n }\n }\n }\n\n analyzedKey = analyzedKey.replaceAll(\"(^| )\\u0000$\", \"\");\n\n if (preserveSep && lastRemoved) {\n analyzedKey += \" \";\n }\n\n // Don't add same surface form more than once:\n if (!seen.contains(key)) {\n seen.add(key);\n break;\n }\n }\n\n for (int j = 1; j < key.length(); j++) {\n allPrefixes.add(key.substring(0, j));\n }\n // we can probably do Integer.MAX_VALUE here, but why worry.\n int weight = random().nextInt(1<<24);\n keys[i] = new TermFreq(key, weight);\n\n slowCompletor.add(new TermFreq2(key, analyzedKey, weight));\n }\n\n if (VERBOSE) {\n // Don't just sort original list, to avoid VERBOSE\n // altering the test:\n List<TermFreq2> sorted = new ArrayList<TermFreq2>(slowCompletor);\n Collections.sort(sorted);\n for(TermFreq2 ent : sorted) {\n System.out.println(\" surface='\" + ent.surfaceForm + \"' analyzed='\" + ent.analyzedForm + \"' weight=\" + ent.weight);\n }\n }\n\n Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);\n FuzzySuggester suggester = new FuzzySuggester(a, a,\n preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);\n suggester.build(new TermFreqArrayIterator(keys));\n\n for (String prefix : allPrefixes) {\n\n if (VERBOSE) {\n System.out.println(\"\\nTEST: prefix=\" + prefix);\n }\n\n final int topN = _TestUtil.nextInt(random(), 1, 10);\n List<LookupResult> r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random()), false, topN);\n\n // 2. go thru whole set to find suggestions:\n List<LookupResult> matches = new ArrayList<LookupResult>();\n\n // \"Analyze\" the key:\n String[] tokens = prefix.split(\" \");\n StringBuilder builder = new StringBuilder();\n boolean lastRemoved = false;\n for(int i=0;i<tokens.length;i++) {\n String token = tokens[i];\n if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(\" \")) {\n builder.append(' ');\n }\n\n if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {\n if (preserveSep && preserveHoles) {\n builder.append(\"\\u0000\");\n }\n lastRemoved = true;\n } else {\n builder.append(token);\n lastRemoved = false;\n }\n }\n\n String analyzedKey = builder.toString();\n\n // Remove trailing sep/holes (TokenStream.end() does\n // not tell us any trailing holes, yet ... there is an\n // issue open for this):\n while (true) {\n String s = analyzedKey.replaceAll(\"(^| )\\u0000$\", \"\");\n s = s.replaceAll(\"\\\\s+$\", \"\");\n if (s.equals(analyzedKey)) {\n break;\n }\n analyzedKey = s;\n }\n\n if (analyzedKey.length() == 0) {\n // Currently suggester can't suggest from the empty\n // string! You get no results, not all results...\n continue;\n }\n\n if (preserveSep && (prefix.endsWith(\" \") || lastRemoved)) {\n analyzedKey += \" \";\n }\n\n if (VERBOSE) {\n System.out.println(\" analyzed: \" + analyzedKey);\n }\n TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();\n\n // NOTE: not great that we ask the suggester to give\n // us the \"answer key\" (ie maybe we have a bug in\n // suggester.toLevA ...) ... but testRandom2() fixes\n // this:\n Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));\n assertTrue(automaton.isDeterministic());\n // TODO: could be faster... 
but it's slowCompletor for a reason\n BytesRef spare = new BytesRef();\n for (TermFreq2 e : slowCompletor) {\n spare.copyChars(e.analyzedForm);\n Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);\n for (IntsRef intsRef : finiteStrings) {\n State p = automaton.getInitialState();\n BytesRef ref = Util.toBytesRef(intsRef, spare);\n boolean added = false;\n for (int i = ref.offset; i < ref.length; i++) {\n State q = p.step(ref.bytes[i] & 0xff);\n if (q == null) {\n break;\n } else if (q.isAccept()) {\n matches.add(new LookupResult(e.surfaceForm, e.weight));\n added = true;\n break;\n }\n p = q;\n }\n if (!added && p.isAccept()) {\n matches.add(new LookupResult(e.surfaceForm, e.weight));\n } \n }\n }\n\n assertTrue(numStopChars > 0 || matches.size() > 0);\n\n if (matches.size() > 1) {\n Collections.sort(matches, new Comparator<LookupResult>() {\n @Override\n public int compare(LookupResult left, LookupResult right) {\n int cmp = Float.compare(right.value, left.value);\n if (cmp == 0) {\n return left.compareTo(right);\n } else {\n return cmp;\n }\n }\n });\n }\n\n if (matches.size() > topN) {\n matches = matches.subList(0, topN);\n }\n\n if (VERBOSE) {\n System.out.println(\" expected:\");\n for(LookupResult lr : matches) {\n System.out.println(\" key=\" + lr.key + \" weight=\" + lr.value);\n }\n\n System.out.println(\" actual:\");\n for(LookupResult lr : r) {\n System.out.println(\" key=\" + lr.key + \" weight=\" + lr.value);\n }\n }\n \n assertEquals(prefix + \" \" + topN, matches.size(), r.size());\n for(int hit=0;hit<r.size();hit++) {\n assertEquals(matches.get(hit).key.toString(), r.get(hit).key.toString());\n assertEquals(matches.get(hit).value, r.get(hit).value);\n }\n }\n }\n\n public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {\n Analyzer a = new MockAnalyzer(random());\n FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3);\n\n List<TermFreq> keys = Arrays.asList(new TermFreq[] {\n new TermFreq(\"a\", 40),\n new TermFreq(\"a \", 50),\n new TermFreq(\" a\", 60),\n });\n\n Collections.shuffle(keys, random());\n suggester.build(new TermFreqArrayIterator(keys));\n\n List<LookupResult> results = suggester.lookup(\"a\", false, 5);\n assertEquals(2, results.size());\n assertEquals(\" a\", results.get(0).key);\n assertEquals(60, results.get(0).value);\n assertEquals(\"a \", results.get(1).key);\n assertEquals(50, results.get(1).value);\n }\n\n public void testEditSeps() throws Exception {\n Analyzer a = new MockAnalyzer(random());\n FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3);\n\n List<TermFreq> keys = Arrays.asList(new TermFreq[] {\n new TermFreq(\"foo bar\", 40),\n new TermFreq(\"foo bar baz\", 50),\n new TermFreq(\"barbaz\", 60),\n new TermFreq(\"barbazfoo\", 10),\n });\n\n Collections.shuffle(keys, random());\n suggester.build(new TermFreqArrayIterator(keys));\n\n assertEquals(\"[foo bar baz/50, foo bar/40]\", suggester.lookup(\"foobar\", false, 5).toString());\n assertEquals(\"[foo bar baz/50]\", suggester.lookup(\"foobarbaz\", false, 5).toString());\n assertEquals(\"[barbaz/60, barbazfoo/10]\", suggester.lookup(\"bar baz\", false, 5).toString());\n assertEquals(\"[barbazfoo/10]\", suggester.lookup(\"bar baz foo\", false, 5).toString());\n }\n \n @SuppressWarnings(\"fallthrough\")\n private static String addRandomEdit(String string, int prefixLength) {\n char[] input = string.toCharArray();\n StringBuilder builder = new StringBuilder();\n for (int i = 0; i < input.length; i++) {\n if (i >= prefixLength && random().nextBoolean() && i < input.length-1) {\n switch(random().nextInt(4)) {\n case 3:\n if (i < input.length-1) {\n // Transpose input[i] and input[1+i]:\n builder.append(input[i+1]);\n builder.append(input[i]);\n for(int j=i+2;j<input.length;j++) {\n builder.append(input[j]);\n }\n return builder.toString();\n }\n // NOTE: fall through to delete:\n case 2:\n // Delete input[i]:\n for(int j=i+1;j<input.length;j++) {\n builder.append(input[j]);\n }\n return builder.toString();\n case 1:\n // Insert input[i+1] twice:\n if (i < input.length-1) {\n builder.append(input[i+1]);\n builder.append(input[i++]);\n i++;\n }\n for(int j=i;j<input.length;j++) {\n builder.append(input[j]);\n }\n return builder.toString();\n case 0:\n // Insert random char:\n builder.append((char) random().nextInt(256));\n for(int j=i;j<input.length;j++) {\n builder.append(input[j]);\n }\n return builder.toString();\n }\n }\n builder.append(input[i]);\n }\n return builder.toString();\n }\n\n public void testRandom2() throws Throwable {\n final int NUM = atLeast(200);\n final List<TermFreq> answers = new ArrayList<TermFreq>();\n final Set<String> seen = new HashSet<String>();\n for(int i=0;i<NUM;i++) {\n final String s = _TestUtil.randomSimpleString(random());\n if (s.length() != 0 && !seen.contains(s)) {\n seen.add(s);\n answers.add(new TermFreq(s, random().nextInt(1000)));\n }\n }\n\n Collections.sort(answers, new Comparator<TermFreq>() {\n @Override\n public int compare(TermFreq a, TermFreq b) 
{\n return a.term.compareTo(b.term);\n }\n });\n if (VERBOSE) {\n System.out.println(\"\\nTEST: targets\");\n for(TermFreq tf : answers) {\n System.out.println(\" \" + tf.term.utf8ToString() + \" freq=\" + tf.v);\n }\n }\n\n Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);\n int maxEdits = random().nextBoolean() ? 1 : 2;\n int prefixLen = random().nextInt(4);\n boolean transpositions = random().nextBoolean();\n // TODO: test graph analyzers\n // TODO: test exactFirst / preserveSep permutations\n FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, prefixLen);\n\n if (VERBOSE) {\n System.out.println(\"TEST: maxEdits=\" + maxEdits + \" prefixLen=\" + prefixLen + \" transpositions=\" + transpositions + \" num=\" + NUM);\n }\n\n Collections.shuffle(answers, random());\n suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()])));\n\n final int ITERS = atLeast(100);\n for(int iter=0;iter<ITERS;iter++) {\n // pick a random non-empty query fragment:\n String frag = _TestUtil.randomSimpleString(random());\n while (frag.length() == 0) {\n frag = _TestUtil.randomSimpleString(random());\n }\n if (VERBOSE) {\n System.out.println(\"\\nTEST: iter frag=\" + frag);\n }\n\n final List<LookupResult> expected = slowFuzzyMatch(prefixLen, maxEdits, transpositions, answers, frag);\n if (VERBOSE) {\n System.out.println(\" expected: \" + expected.size());\n for(LookupResult c : expected) {\n System.out.println(\" \" + c);\n }\n }\n final List<LookupResult> actual = suggest.lookup(frag, false, NUM);\n if (VERBOSE) {\n System.out.println(\" actual: \" + actual.size());\n for(LookupResult c : actual) {\n System.out.println(\" \" + c);\n }\n }\n\n Collections.sort(actual, new CompareByCostThenAlpha());\n\n final int limit = Math.min(expected.size(), actual.size());\n for(int ans=0;ans<limit;ans++) {\n assertEquals(\"frag=\" + frag, expected.get(ans).key.toString(), actual.get(ans).key.toString());\n assertEquals(expected.get(ans).value, actual.get(ans).value);\n }\n }\n }\n\n private List<LookupResult> slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List<TermFreq> answers, String frag) {\n final List<LookupResult> results = new ArrayList<LookupResult>();\n final int fragLen = frag.length();\n for(TermFreq tf : answers) {\n //System.out.println(\" check s=\" + tf.term.utf8ToString());\n boolean prefixMatches = true;\n for(int i=0;i<prefixLen;i++) {\n if (i == fragLen) {\n // prefix still matches:\n break;\n }\n if (i == tf.term.length || tf.term.bytes[i] != (byte) frag.charAt(i)) {\n prefixMatches = false;\n break;\n }\n }\n\n if (prefixMatches) {\n final int len = tf.term.length;\n if (len >= fragLen-maxEdits) {\n // OK it's possible:\n //System.out.println(\" possible\");\n int d;\n final String s = tf.term.utf8ToString();\n if (fragLen == prefixLen) {\n d = 0;\n } else if (false && len < fragLen) {\n d = getDistance(frag, s, allowTransposition);\n } else {\n //System.out.println(\" try loop\");\n d = maxEdits + 1;\n //for(int ed=-maxEdits;ed<=maxEdits;ed++) {\n for(int ed=-maxEdits;ed<=maxEdits;ed++) {\n if (s.length() < fragLen - ed) {\n continue;\n }\n String check = s.substring(0, fragLen-ed);\n d = getDistance(frag, check, allowTransposition);\n //System.out.println(\" sub check s=\" + check + \" d=\" + d);\n if (d <= maxEdits) {\n break;\n }\n }\n }\n if (d <= maxEdits) {\n results.add(new LookupResult(tf.term.utf8ToString(), tf.v));\n }\n }\n }\n\n Collections.sort(results, new CompareByCostThenAlpha());\n }\n\n return results;\n }\n\n private static class CharSequenceComparator implements Comparator<CharSequence> {\n\n @Override\n public int compare(CharSequence o1, CharSequence o2) {\n final int l1 = o1.length();\n final int l2 = o2.length();\n \n final int aStop = Math.min(l1, l2);\n for (int i = 0; i < aStop; i++) {\n int diff = o1.charAt(i) - o2.charAt(i);\n if (diff != 0) {\n return diff;\n }\n }\n // One is a prefix of the other, or, they are equal:\n return l1 - l2;\n }\n }\n\n private static final Comparator<CharSequence> CHARSEQUENCE_COMPARATOR = new CharSequenceComparator();\n\n public class CompareByCostThenAlpha implements Comparator<LookupResult> {\n @Override\n public int compare(LookupResult a, LookupResult b) {\n if (a.value > b.value) {\n return -1;\n } else if (a.value < b.value) {\n return 1;\n } else {\n final int c = 
CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);\n assert c != 0: \"term=\" + a.key;\n return c;\n }\n }\n }\n\n // NOTE: copied from\n // modules/suggest/src/java/org/apache/lucene/search/spell/LuceneLevenshteinDistance.java\n // and tweaked to return the edit distance not the float\n // lucene measure\n\n /* Finds unicode (code point) Levenshtein (edit) distance\n * between two strings, including transpositions. */\n public int getDistance(String target, String other, boolean allowTransposition) {\n IntsRef targetPoints;\n IntsRef otherPoints;\n int n;\n int d[][]; // cost array\n \n // NOTE: if we cared, we could use 3*m space instead of m*n space, similar to \n // what LevenshteinDistance does, except cycling thru a ring of three \n // horizontal cost arrays... but this comparator is never actually used by \n // DirectSpellChecker, it's only used for merging results from multiple shards \n // in \"distributed spellcheck\", and it's inefficient in other ways too...\n\n // cheaper to do this up front once\n targetPoints = toIntsRef(target);\n otherPoints = toIntsRef(other);\n n = targetPoints.length;\n final int m = otherPoints.length;\n d = new int[n+1][m+1];\n \n if (n == 0 || m == 0) {\n if (n == m) {\n return 0;\n }\n else {\n return Math.max(n, m);\n }\n } \n\n // indexes into strings s and t\n int i; // iterates through s\n int j; // iterates through t\n\n int t_j; // jth character of t\n\n int cost; // cost\n\n for (i = 0; i<=n; i++) {\n d[i][0] = i;\n }\n \n for (j = 0; j<=m; j++) {\n d[0][j] = j;\n }\n\n for (j = 1; j<=m; j++) {\n t_j = otherPoints.ints[j-1];\n\n for (i=1; i<=n; i++) {\n cost = targetPoints.ints[i-1]==t_j ? 0 : 1;\n // minimum of cell to the left+1, to the top+1, diagonally left and up +cost\n d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);\n // transposition\n if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {\n d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);\n }\n }\n }\n \n return d[n][m];\n }\n \n private static IntsRef toIntsRef(String s) {\n IntsRef ref = new IntsRef(s.length()); // worst case\n int utf16Len = s.length();\n for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {\n cp = ref.ints[ref.length++] = Character.codePointAt(s, i);\n }\n return ref;\n }\n}\n =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision f81056da25f3671b9807c4a51d6b985389fe916e) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision ) @@ -17,17 +17,6 @@ * limitations under the License. 
*/ -import java.io.IOException; -import java.io.Reader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.MockAnalyzer; @@ -51,8 +40,19 @@ import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.fst.Util; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + public class FuzzySuggesterTest extends LuceneTestCase { - + public void testRandomEdits() throws IOException { List<TermFreq> keys = new ArrayList<TermFreq>(); int numTerms = atLeast(100); @@ -68,10 +68,29 @@ List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); assertEquals(addRandomEdit, 1, results.size()); assertEquals("foo bar boo far", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + } + } + + public void testNonLatinRandomEdits() throws IOException { + List<TermFreq> keys = new ArrayList<TermFreq>(); + int numTerms = atLeast(100); + for (int i = 0; i < numTerms; i++) { + keys.add(new TermFreq("буу" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100))); + } + keys.add(new TermFreq("фуу бар буу фар", 12)); + FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); + suggester.build(new TermFreqArrayIterator(keys)); + int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + String addRandomEdit = addRandomEdit("фуу бар буу", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX); + List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); + assertEquals(addRandomEdit, 1, results.size()); + assertEquals("фуу бар буу фар", results.get(0).key.toString()); - assertEquals(12, results.get(0).value, 0.01F); + assertEquals(12, results.get(0).value, 0.01F); } } - + /** this is basically the WFST 
so it acts the same */ public void testKeyword() throws Exception { TermFreq keys[] = new TermFreq[] { @@ -80,52 +99,52 @@ new TermFreq("barbar", 12), new TermFreq("barbara", 6) }; - + FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); suggester.build(new TermFreqArrayIterator(keys)); - + List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("bariar", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); - + results = suggester.lookup(_TestUtil.stringToCharSequence("barbr", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); - + results = suggester.lookup(_TestUtil.stringToCharSequence("barbara", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbara", results.get(0).key.toString()); assertEquals(6, results.get(0).value, 0.01F); - + results = suggester.lookup(_TestUtil.stringToCharSequence("barbar", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); assertEquals("barbara", results.get(1).key.toString()); assertEquals(6, results.get(1).value, 0.01F); - + results = suggester.lookup(_TestUtil.stringToCharSequence("barbaa", random()), false, 2); assertEquals(2, results.size()); assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); assertEquals("barbara", results.get(1).key.toString()); assertEquals(6, results.get(1).value, 0.01F); - + // top N of 2, but only foo is available results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2); assertEquals(1, results.size()); assertEquals("foo", results.get(0).key.toString()); assertEquals(50, results.get(0).value, 0.01F); - + // top N of 1 for 'bar': we return this even though // barbar is higher because exactFirst is enabled: results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1); assertEquals(1, results.size()); assertEquals("bar", results.get(0).key.toString()); assertEquals(10, results.get(0).value, 0.01F); - + // top N Of 2 for 'b' results = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2); assertEquals(2, results.size()); @@ -133,7 +152,7 @@ assertEquals(12, results.get(0).value, 0.01F); assertEquals("bar", results.get(1).key.toString()); assertEquals(10, results.get(1).value, 0.01F); - + // top N of 3 for 'ba' results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3); assertEquals(3, results.size()); @@ -144,7 +163,7 @@ assertEquals("barbara", results.get(2).key.toString()); assertEquals(6, results.get(2).value, 0.01F); } - + /** * basic "standardanalyzer" test with stopword removal */ @@ -152,12 +171,12 @@ TermFreq keys[] = new TermFreq[] { new TermFreq("the ghost of christmas past", 50), }; - + Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); FuzzySuggester suggester = new FuzzySuggester(standard); suggester.setPreservePositionIncrements(false); suggester.build(new TermFreqArrayIterator(keys)); - + List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1); assertEquals(1, results.size()); assertEquals("the ghost of christmas past", results.get(0).key.toString()); @@ -206,7 +225,7 @@ 
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); - + return new TokenStreamComponents(tokenizer) { int tokenStreamCounter = 0; final TokenStream[] tokenStreams = new TokenStream[] { @@ -239,7 +258,7 @@ tokenStreamCounter++; return result; } - + @Override protected void setReader(final Reader reader) throws IOException { } @@ -253,7 +272,7 @@ }; FuzzySuggester suggester = new FuzzySuggester(analyzer); suggester.build(new TermFreqArrayIterator(keys)); - + List<LookupResult> results = suggester.lookup("wifi network", false, 10); if (VERBOSE) { System.out.println("Results: " + results); @@ -287,7 +306,7 @@ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); - + return new TokenStreamComponents(tokenizer) { int tokenStreamCounter = 0; final TokenStream[] tokenStreams = new TokenStream[] { @@ -297,7 +316,7 @@ token("xc",1,1) }), new CannedTokenStream(new Token[] { - token("ba",1,1), + token("ba",1,1), token("xd",1,1) }), new CannedTokenStream(new Token[] { @@ -313,7 +332,7 @@ tokenStreamCounter++; return result; } - + @Override protected void setReader(final Reader reader) throws IOException { } @@ -354,14 +373,14 @@ ts.end(); ts.close(); } - */ + */ private final Analyzer getUnusualAnalyzer() { return new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); - + return new TokenStreamComponents(tokenizer) { int count; @@ -382,7 +401,7 @@ }); } } - + @Override protected void setReader(final Reader reader) throws IOException { } @@ -457,7 +476,7 @@ if (topN > 2) { assertEquals("x", results.get(2).key); assertEquals(2, results.get(2).value); - + if (topN > 3) { assertEquals("x y", results.get(3).key); assertEquals(1, results.get(3).value); @@ -466,7 +485,7 @@ } } } - + // Holds surface form separately: private static class TermFreq2 implements Comparable<TermFreq2> { public final String surfaceForm; @@ -572,11 +591,11 @@ public void testRandom() throws Exception { int numQueries = atLeast(100); - + final List<TermFreq2> slowCompletor = new ArrayList<TermFreq2>(); final TreeSet<String> allPrefixes = new TreeSet<String>(); final Set<String> seen = new HashSet<String>(); - + TermFreq[] keys = new TermFreq[numQueries]; boolean preserveSep = random().nextBoolean(); @@ -587,7 +606,7 @@ if (VERBOSE) { System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles); } - + for (int i = 0; i < numQueries; i++) { int numTokens = _TestUtil.nextInt(random(), 1, 4); String key; @@ -752,7 +771,7 @@ } if (!added && p.isAccept()) { matches.add(new LookupResult(e.surfaceForm, e.weight)); - } + } } } @@ -787,7 +806,7 @@ System.out.println(" key=" + lr.key + " weight=" + lr.value); } } - + assertEquals(prefix + " " + topN, matches.size(), r.size()); for(int hit=0;hit<r.size();hit++) { Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP <+>package org.apache.lucene.analysis;\n\n/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements. 
See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the \"License\"); you may not use this file except in compliance with\n * the License. You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport java.io.IOException;\n\nimport org.apache.lucene.analysis.tokenattributes.OffsetAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;\nimport org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;\nimport org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;\nimport org.apache.lucene.util.BytesRef;\nimport org.apache.lucene.util.RollingBuffer;\nimport org.apache.lucene.util.automaton.Automaton;\nimport org.apache.lucene.util.automaton.State;\nimport org.apache.lucene.util.automaton.Transition;\n\n// TODO: maybe also toFST? then we can translate atts into FST outputs/weights\n\n/** Consumes a TokenStream and creates an {@link Automaton}\n * where the transition labels are UTF8 bytes from the {@link\n * TermToBytesRefAttribute}. Between tokens we insert\n * POS_SEP and for holes we insert HOLE.\n *\n * @lucene.experimental */\npublic class TokenStreamToAutomaton {\n\n private boolean preservePositionIncrements;\n\n /** Sole constructor. */\n public TokenStreamToAutomaton() {\n this.preservePositionIncrements = true;\n }\n\n /** Whether to generate holes in the automaton for missing positions, true by default. */\n public void setPreservePositionIncrements(boolean enablePositionIncrements) {\n this.preservePositionIncrements = enablePositionIncrements;\n }\n\n private static class Position implements RollingBuffer.Resettable {\n // Any tokens that ended at our position arrive to this state:\n State arriving;\n\n // Any tokens that start at our position leave from this state:\n State leaving;\n\n @Override\n public void reset() {\n arriving = null;\n leaving = null;\n }\n }\n\n private static class Positions extends RollingBuffer<Position> {\n @Override\n protected Position newInstance() {\n return new Position();\n }\n }\n\n /** Subclass & implement this if you need to change the\n * token (such as escaping certain bytes) before it's\n * turned into a graph. */ \n protected BytesRef changeToken(BytesRef in) {\n return in;\n }\n\n /** We create a transition between two adjacent tokens. */\n public static final int POS_SEP = 256;\n\n /** We add this arc to represent a hole. */\n public static final int HOLE = 257;\n\n /** Pulls the graph (including {@link\n * PositionLengthAttribute}) from the provided {@link\n * TokenStream}, and creates the corresponding\n * automaton where arcs are bytes from each term. 
*/\n public Automaton toAutomaton(TokenStream in) throws IOException {\n final Automaton a = new Automaton();\n boolean deterministic = true;\n\n final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);\n final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);\n final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);\n final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);\n\n final BytesRef term = termBytesAtt.getBytesRef();\n\n in.reset();\n\n // Only temporarily holds states ahead of our current\n // position:\n\n final RollingBuffer<Position> positions = new Positions();\n\n int pos = -1;\n Position posData = null;\n int maxOffset = 0;\n while (in.incrementToken()) {\n int posInc = posIncAtt.getPositionIncrement();\n if (!preservePositionIncrements && posInc > 1) {\n posInc = 1;\n }\n assert pos > -1 || posInc > 0;\n\n if (posInc > 0) {\n\n // New node:\n pos += posInc;\n\n posData = positions.get(pos);\n assert posData.leaving == null;\n\n if (posData.arriving == null) {\n // No token ever arrived to this position\n if (pos == 0) {\n // OK: this is the first token\n posData.leaving = a.getInitialState();\n } else {\n // This means there's a hole (eg, StopFilter\n // does this):\n posData.leaving = new State();\n addHoles(a.getInitialState(), positions, pos);\n }\n } else {\n posData.leaving = new State();\n posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));\n if (posInc > 1) {\n // A token spanned over a hole; add holes\n // \"under\" it:\n addHoles(a.getInitialState(), positions, pos);\n }\n }\n positions.freeBefore(pos);\n } else {\n // note: this isn't necessarily true. it's just that we aren't surely det.\n // we could optimize this further (e.g. buffer and sort synonyms at a position)\n // but that's probably overkill. 
this is cheap and dirty\n deterministic = false;\n }\n\n final int endPos = pos + posLengthAtt.getPositionLength();\n\n termBytesAtt.fillBytesRef();\n final BytesRef term2 = changeToken(term);\n final Position endPosData = positions.get(endPos);\n if (endPosData.arriving == null) {\n endPosData.arriving = new State();\n }\n\n State state = posData.leaving;\n for(int byteIDX=0;byteIDX<term2.length;byteIDX++) {\n final State nextState = byteIDX == term2.length-1 ? endPosData.arriving : new State();\n state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));\n state = nextState;\n }\n\n maxOffset = Math.max(maxOffset, offsetAtt.endOffset());\n }\n\n in.end();\n State endState = null;\n if (offsetAtt.endOffset() > maxOffset) {\n endState = new State();\n endState.setAccept(true);\n }\n\n pos++;\n while (pos <= positions.getMaxPos()) {\n posData = positions.get(pos);\n if (posData.arriving != null) {\n if (endState != null) {\n posData.arriving.addTransition(new Transition(POS_SEP, endState));\n } else {\n posData.arriving.setAccept(true);\n }\n }\n pos++;\n }\n\n //toDot(a);\n a.setDeterministic(deterministic);\n return a;\n }\n\n // for debugging!\n /*\n private static void toDot(Automaton a) throws IOException {\n final String s = a.toDot();\n Writer w = new OutputStreamWriter(new FileOutputStream(\"/tmp/out.dot\"));\n w.write(s);\n w.close();\n System.out.println(\"TEST: saved to /tmp/out.dot\");\n }\n */\n\n private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {\n Position posData = positions.get(pos);\n Position prevPosData = positions.get(pos-1);\n\n while(posData.arriving == null || prevPosData.leaving == null) {\n if (posData.arriving == null) {\n posData.arriving = new State();\n posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));\n }\n if (prevPosData.leaving == null) {\n if (pos == 1) {\n prevPosData.leaving = startState;\n } else {\n prevPosData.leaving = new State();\n }\n if (prevPosData.arriving != null) {\n prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));\n }\n }\n prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));\n pos--;\n if (pos <= 0) {\n break;\n }\n posData = prevPosData;\n prevPosData = positions.get(pos-1);\n }\n }\n}\n =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision f81056da25f3671b9807c4a51d6b985389fe916e) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision ) @@ -17,8 +17,6 @@ * limitations under the License. */ -import java.io.IOException; - import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; @@ -29,24 +27,32 @@ import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; +import java.io.IOException; + // TODO: maybe also toFST? then we can translate atts into FST outputs/weights -/** Consumes a TokenStream and creates an {@link Automaton} +/** + * Consumes a TokenStream and creates an {@link Automaton} - * where the transition labels are UTF8 bytes from the {@link + * where the transition labels are UTF8 bytes from the {@link - * TermToBytesRefAttribute}. Between tokens we insert + * TermToBytesRefAttribute}. Between tokens we insert - * POS_SEP and for holes we insert HOLE. + * POS_SEP and for holes we insert HOLE. * - * @lucene.experimental */ + * @lucene.experimental + */ public class TokenStreamToAutomaton { private boolean preservePositionIncrements; - /** Sole constructor. */ + /** + * Sole constructor. 
+ */ public TokenStreamToAutomaton() { this.preservePositionIncrements = true; } - /** Whether to generate holes in the automaton for missing positions, true by default. */ + /** + * Whether to generate holes in the automaton for missing positions, true by default. + */ public void setPreservePositionIncrements(boolean enablePositionIncrements) { this.preservePositionIncrements = enablePositionIncrements; } @@ -72,23 +78,31 @@ } } - /** Subclass & implement this if you need to change the + /** + * Subclass & implement this if you need to change the - * token (such as escaping certain bytes) before it's + * token (such as escaping certain bytes) before it's - * turned into a graph. */ + * turned into a graph. + */ protected BytesRef changeToken(BytesRef in) { return in; } - /** We create a transition between two adjacent tokens. */ + /** + * We create a transition between two adjacent tokens. + */ public static final int POS_SEP = 256; - /** We add this arc to represent a hole. */ + /** + * We add this arc to represent a hole. + */ public static final int HOLE = 257; - /** Pulls the graph (including {@link + /** + * Pulls the graph (including {@link - * PositionLengthAttribute}) from the provided {@link + * PositionLengthAttribute}) from the provided {@link - * TokenStream}, and creates the corresponding + * TokenStream}, and creates the corresponding - * automaton where arcs are bytes from each term. */ + * automaton where arcs are bytes from each term. + */ public Automaton toAutomaton(TokenStream in) throws IOException { final Automaton a = new Automaton(); boolean deterministic = true; @@ -156,16 +170,16 @@ final int endPos = pos + posLengthAtt.getPositionLength(); termBytesAtt.fillBytesRef(); - final BytesRef term2 = changeToken(term); + final char[] term2 = changeToken(term).utf8ToString().toCharArray(); final Position endPosData = positions.get(endPos); if (endPosData.arriving == null) { endPosData.arriving = new State(); } State state = posData.leaving; - for(int byteIDX=0;byteIDX<term2.length;byteIDX++) { - final State nextState = byteIDX == term2.length-1 ? endPosData.arriving : new State(); - state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState)); + for(int charIDX=0;charIDX<term2.length;charIDX++) { + final State nextState = charIDX == term2.length-1 ? endPosData.arriving : new State(); + state.addTransition(new Transition(term2[charIDX], nextState)); state = nextState; } @@ -215,9 +229,9 @@ private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) { Position posData = positions.get(pos); - Position prevPosData = positions.get(pos-1); + Position prevPosData = positions.get(pos - 1); - while(posData.arriving == null || prevPosData.leaving == null) { + while (posData.arriving == null || prevPosData.leaving == null) { if (posData.arriving == null) { posData.arriving = new State(); posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving)); @@ -233,7 +247,7 @@ break; } posData = prevPosData; - prevPosData = positions.get(pos-1); + prevPosData = positions.get(pos - 1); } } }
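
Reviewer note (not part of the patch): the TokenStreamToAutomaton hunk above swaps the automaton's transition labels from UTF-8 bytes to UTF-16 chars (changeToken(term).utf8ToString().toCharArray()), which is what the new testNonLatinRandomEdits exercises. The motivation is that Levenshtein edits measured over UTF-8 bytes penalize multi-byte scripts: one substituted Cyrillic character is two byte edits, so a byte-level fuzzy automaton built with maxEdits=1 never reaches the target. The following is a minimal standalone sketch of that discrepancy, assuming nothing beyond the JDK (the class and method names here are made up for illustration and are not Lucene APIs):

import java.io.UnsupportedEncodingException;

public class ByteVsCodePointEdits {

  // Textbook Levenshtein distance over arbitrary int sequences.
  static int levenshtein(int[] a, int[] b) {
    int[][] d = new int[a.length + 1][b.length + 1];
    for (int i = 0; i <= a.length; i++) {
      d[i][0] = i;
    }
    for (int j = 0; j <= b.length; j++) {
      d[0][j] = j;
    }
    for (int i = 1; i <= a.length; i++) {
      for (int j = 1; j <= b.length; j++) {
        int cost = a[i-1] == b[j-1] ? 0 : 1;
        d[i][j] = Math.min(Math.min(d[i-1][j] + 1, d[i][j-1] + 1), d[i-1][j-1] + cost);
      }
    }
    return d[a.length][b.length];
  }

  // The string as UTF-8 byte values: what the pre-patch automaton labels arcs with.
  static int[] utf8Units(String s) throws UnsupportedEncodingException {
    byte[] bytes = s.getBytes("UTF-8");
    int[] units = new int[bytes.length];
    for (int i = 0; i < bytes.length; i++) {
      units[i] = bytes[i] & 0xff;
    }
    return units;
  }

  // The string as code points: same loop shape as toIntsRef in the test above.
  static int[] codePoints(String s) {
    int[] points = new int[s.length()];
    int upto = 0;
    for (int i = 0; i < s.length(); ) {
      int cp = Character.codePointAt(s, i);
      points[upto++] = cp;
      i += Character.charCount(cp);
    }
    int[] trimmed = new int[upto];
    System.arraycopy(points, 0, trimmed, 0, upto);
    return trimmed;
  }

  public static void main(String[] args) throws Exception {
    // One substituted character, as in testNonLatinRandomEdits:
    String indexed = "фуу";  // ф = 0xD1 0x84 in UTF-8
    String typed = "буу";    // б = 0xD0 0xB1 in UTF-8
    // Prints 1: a single-character typo is one edit over code points...
    System.out.println("code point edits: " + levenshtein(codePoints(typed), codePoints(indexed)));
    // ...but prints 2 over UTF-8 bytes, so a byte-labeled Levenshtein
    // automaton with maxEdits=1 would never match the indexed key:
    System.out.println("utf-8 byte edits: " + levenshtein(utf8Units(typed), utf8Units(indexed)));
  }
}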