Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
<+>/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.Test;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;

/**
 * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
 * TODO: should explicitly test things like protWords and not rely on
 * the factory tests in Solr.
 */
public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {

  /***
  public void testPerformance() throws IOException {
    String s = "now is the time-for all good men to come to-the aid of their country.";
    Token tok = new Token();
    long start = System.currentTimeMillis();
    int ret=0;
    for (int i=0; i<1000000; i++) {
      StringReader r = new StringReader(s);
      TokenStream ts = new WhitespaceTokenizer(r);
      ts = new WordDelimiterFilter(ts, 1,1,1,1,0);

      while (ts.next(tok) != null) ret++;
    }

    System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
  }
  ***/

  @Test
  public void testOffsets() throws IOException {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    // test that subwords and catenated subwords have
    // the correct offsets.
    WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);

    assertTokenStreamContents(wdf,
        new String[] { "foo", "bar", "foobar" },
        new int[] { 5, 9, 5 },
        new int[] { 8, 12, 12 },
        null, null, null, null, false);

    wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);

    assertTokenStreamContents(wdf,
        new String[] { "foo", "bar", "foobar" },
        new int[] { 5, 5, 5 },
        new int[] { 6, 6, 6 },
        null, null, null, null, false);
  }

  @Test
  public void testOffsetChange() throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);

    assertTokenStreamContents(wdf,
        new String[] { "übelkeit" },
        new int[] { 7 },
        new int[] { 15 });
  }

  @Test
  public void testOffsetChange2() throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);

    assertTokenStreamContents(wdf,
        new String[] { "übelkeit" },
        new int[] { 8 },
        new int[] { 17 });
  }

  @Test
  public void testOffsetChange3() throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);

    assertTokenStreamContents(wdf,
        new String[] { "übelkeit" },
        new int[] { 8 },
        new int[] { 16 });
  }

  @Test
  public void testOffsetChange4() throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);

    assertTokenStreamContents(wdf,
        new String[] { "foo", "bar", "foobar" },
        new int[] { 8, 12, 8 },
        new int[] { 11, 15, 15 },
        null, null, null, null, false);
  }

  public void doSplit(final String input, String... output) throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
        new StringReader(input), MockTokenizer.KEYWORD, false), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);

    assertTokenStreamContents(wdf, output);
  }

  @Test
  public void testSplits() throws Exception {
    doSplit("basic-split","basic","split");
    doSplit("camelCase","camel","Case");

    // non-space marking symbol shouldn't cause split
    // this is an example in Thai
    doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
    // possessive followed by delimiter
    doSplit("test's'", "test");

    // some russian upper and lowercase
    doSplit("Роберт", "Роберт");
    // now cause a split (russian camelCase)
    doSplit("РобЕрт", "Роб", "Ерт");

    // a composed titlecase character, don't split
    doSplit("aDžungla", "aDžungla");

    // a modifier letter, don't split
    doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");

    // enclosing mark, don't split
    doSplit("test⃝", "test⃝");

    // combining spacing mark (the virama), don't split
    doSplit("हिन्दी", "हिन्दी");

    // don't split non-ascii digits
    doSplit("١٢٣٤", "١٢٣٤");

    // don't split supplementaries into unpaired surrogates
    doSplit("𠀀𠀀", "𠀀𠀀");
  }

  public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
    flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
    WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
        new StringReader(input), MockTokenizer.KEYWORD, false), flags, null);

    assertTokenStreamContents(wdf, output);
  }

  /*
   * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
   */
  @Test
  public void testPossessives() throws Exception {
    doSplitPossessive(1, "ra's", "ra");
    doSplitPossessive(0, "ra's", "ra", "s");
  }

  /*
   * Set a large position increment gap of 10 if the token is "largegap" or "/"
   */
  private final class LargePosIncTokenFilter extends TokenFilter {
    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    protected LargePosIncTokenFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        if (termAtt.toString().equals("largegap") || termAtt.toString().equals("/"))
          posIncAtt.setPositionIncrement(10);
        return true;
      } else {
        return false;
      }
    }
  }

  @Test
  public void testPositionIncrements() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet(Arrays.asList("NUTCH")), false);

    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
            tokenizer,
            flags, protWords));
      }
    };

    /* in this case, works as expected. */
    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
        new int[] { 0, 9 },
        new int[] { 6, 13 },
        null,
        new int[] { 1, 1 },
        null,
        false);

    /* only in this case, posInc of 2 ?! */
    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
        new int[] { 0, 9, 12, 9 },
        new int[] { 6, 12, 13, 13 },
        null,
        new int[] { 1, 1, 1, 0 },
        null,
        false);

    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
        new int[] { 0, 9, 15 },
        new int[] { 6, 14, 19 },
        null,
        new int[] { 1, 1, 1 },
        null,
        false);

    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
            new LargePosIncTokenFilter(tokenizer),
            flags, protWords));
      }
    };

    /* increment of "largegap" is preserved */
    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
        new int[] { 0, 7, 16 },
        new int[] { 6, 15, 20 },
        null,
        new int[] { 1, 10, 1 },
        null,
        false);

    /* the "/" had a position increment of 10, where did it go?!?!! */
    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
        new int[] { 0, 9 },
        new int[] { 6, 13 },
        null,
        new int[] { 1, 11 },
        null,
        false);

    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
        new int[] { 0, 9, 12, 9 },
        new int[] { 6, 12, 13, 13 },
        null,
        new int[] { 1, 11, 1, 0 },
        null,
        false);

    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
        new int[] { 0, 9, 15 },
        new int[] { 6, 14, 19 },
        null,
        new int[] { 1, 11, 1 },
        null,
        false);

    Analyzer a3 = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
            tokenizer, StandardAnalyzer.STOP_WORDS_SET);
        filter.setEnablePositionIncrements(true);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
      }
    };

    assertAnalyzesTo(a3, "lucene.solr",
        new String[] { "lucene", "solr", "lucenesolr" },
        new int[] { 0, 7, 0 },
        new int[] { 6, 11, 11 },
        null,
        new int[] { 1, 1, 0 },
        null,
        false);

    /* the stopword should add a gap here */
    assertAnalyzesTo(a3, "the lucene.solr",
        new String[] { "lucene", "solr", "lucenesolr" },
        new int[] { 4, 11, 4 },
        new int[] { 10, 15, 15 },
        null,
        new int[] { 2, 1, 0 },
        null,
        false);
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    int numIterations = atLeast(5);
    for (int i = 0; i < numIterations; i++) {
      final int flags = random().nextInt(512);
      final CharArraySet protectedWords;
      if (random().nextBoolean()) {
        protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet(Arrays.asList("a", "b", "cd")), false);
      } else {
        protectedWords = null;
      }

      Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
          return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
        }
      };
      checkRandomData(random(), a, 200, 20, false, false);
    }
  }

  public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
      final int flags = i;
      final CharArraySet protectedWords;
      if (random.nextBoolean()) {
        protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet(Arrays.asList("a", "b", "cd")), false);
      } else {
        protectedWords = null;
      }

      Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new KeywordTokenizer(reader);
          return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
        }
      };
      // depending upon options, this thing may or may not preserve the empty term
      checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
  }
}
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java	(revision 8c99c92eb8da890a771d5f69b17bf5909bdc4f63)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java	(revision )
@@ -18,14 +18,15 @@
 package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.CharsRef;
 import org.junit.Test;
 
 import java.io.IOException;
@@ -376,5 +377,48 @@
       // depending upon options, this thing may or may not preserve the empty term
       checkAnalysisConsistency(random, a, random.nextBoolean(), "");
     }
   }
+
+  public void testAbbreviationSynonyms() throws IOException {
+    final int flags = GENERATE_WORD_PARTS;
+    final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet(Arrays.asList("NUTCH")), false);
+
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        SynonymMap.Builder b = new SynonymMap.Builder(false);
+        b.add(new CharsRef("fuzzy.suggester"), new CharsRef("smart.autocomplete"), true);
+        b.add(new CharsRef("fuzzy"), new CharsRef("spelling"), true);
+        SynonymMap synMap = null;
+        try {
+          synMap = b.build();
+        } catch (IOException e) {
+          fail(e.getMessage());
+        }
+        SynonymFilter filter = new SynonymFilter(tokenizer, synMap, true);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
+      }
+    };
+
+    // just check if the synonym filter works
+    assertAnalyzesTo(a, "fuzzy suggester",
+        new String[] { "fuzzy", "spelling", "suggester" },
+        new int[] { 0, 0, 6 },
+        new int[] { 5, 5, 15 },
+        null,
+        new int[] { 1, 0, 1 },
+        null,
+        false);
+
+    assertAnalyzesTo(a, "fuzzy.suggester",
+        new String[] { "fuzzy", "smart", "suggester", "autocomplete" },
+        new int[] { 0, 0, 6, 6 },
+        new int[] { 5, 5, 15, 15 },
+        null,
+        new int[] { 1, 0, 1, 0 },
+        null,
+        false);
+  }
 }
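
A minimal, standalone sketch of the token chain that testAbbreviationSynonyms() exercises (a SynonymFilter feeding a WordDelimiterFilter), for poking at the behavior outside the test framework. Illustrative only: the SynonymWdfDemo class name, the WhitespaceTokenizer, and the Version.LUCENE_44 constant are assumptions for a Lucene 4.x checkout, not part of the patch; the synonym entries and the GENERATE_WORD_PARTS flag mirror the test above.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

public class SynonymWdfDemo {
  public static void main(String[] args) throws Exception {
    // Same synonym entries as the new test: map "fuzzy.suggester" to
    // "smart.autocomplete" and "fuzzy" to "spelling", keeping the originals.
    SynonymMap.Builder b = new SynonymMap.Builder(false);
    b.add(new CharsRef("fuzzy.suggester"), new CharsRef("smart.autocomplete"), true);
    b.add(new CharsRef("fuzzy"), new CharsRef("spelling"), true);
    SynonymMap synMap = b.build();

    // The whitespace tokenizer keeps "fuzzy.suggester" as a single token, so the
    // dotted synonym key can match before WordDelimiterFilter splits on the '.'.
    // (Version constant is an assumption; use whatever your checkout provides.)
    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("fuzzy.suggester"));
    TokenStream ts = new SynonymFilter(tokenizer, synMap, true);
    ts = new WordDelimiterFilter(ts, WordDelimiterFilter.GENERATE_WORD_PARTS, null);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // A position increment of 0 marks a token stacked on the previous position.
      System.out.println(termAtt + " (posInc=" + posIncAtt.getPositionIncrement() + ")");
    }
    ts.end();
    ts.close();
  }
}

Per the assertions in the new test, this should print fuzzy and smart stacked at the first position (posInc 1, then 0), followed by suggester and autocomplete stacked at the second.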