Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (revision 1471290) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (working copy) @@ -470,6 +470,12 @@ return Pattern.compile("a"); } }); + + put(Pattern[].class, new ArgProducer() { + @Override public Object create(Random random) { + return new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")}; + } + }); put(PayloadEncoder.class, new ArgProducer() { @Override public Object create(Random random) { return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers? Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java (revision 0) @@ -0,0 +1,626 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.Reader; +import java.io.StringReader; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +public class TestPatternCaptureGroupTokenFilter extends BaseTokenStreamTestCase { + + public void testNoPattern() throws Exception { + testPatterns( + "foobarbaz", + new String[] {}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testNoMatch() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"xx"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"xx"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"xx"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] 
{3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"xx"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testNoCapture() throws Exception { + testPatterns( + "foobarbaz", + new String[] {".."}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {".."}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {".."}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {".."}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testEmptyCapture() throws Exception { + testPatterns( + "foobarbaz", + new String[] {".(y*)"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {".(y*)"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {".(y*)"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {".(y*)"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testCaptureAll() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"(.+)"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"(.+)"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar 
baz", + new String[] {"(.+)"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"(.+)"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testCaptureStart() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"^(.)"}, + new String[] {"f"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"^(.)"}, + new String[] {"foobarbaz","f"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.)"}, + new String[] {"f","b","b"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.)"}, + new String[] {"foo","f","bar","b","baz","b"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + true + ); + } + + public void testCaptureMiddle() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"^.(.)."}, + new String[] {"o"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"^.(.)."}, + new String[] {"foobarbaz","o"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"^.(.)."}, + new String[] {"o","a","a"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"^.(.)."}, + new String[] {"foo","o","bar","a","baz","a"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + true + ); + } + + public void testCaptureEnd() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"(.)$"}, + new String[] {"z"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + 
"foobarbaz", + new String[] {"(.)$"}, + new String[] {"foobarbaz","z"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"(.)$"}, + new String[] {"o","r","z"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"(.)$"}, + new String[] {"foo","o","bar","r","baz","z"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + true + ); + } + + public void testCaptureStartMiddle() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"^(.)(.)"}, + new String[] {"f","o"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"^(.)(.)"}, + new String[] {"foobarbaz","f","o"}, + new int[] {0,0,0}, + new int[] {9,9,9}, + new int[] {1,0,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.)(.)"}, + new String[] {"f","o","b","a","b","a"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.)(.)"}, + new String[] {"foo","f","o","bar","b","a","baz","b","a"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + true + ); + } + + public void testCaptureStartEnd() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"^(.).+(.)$"}, + new String[] {"f","z"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"^(.).+(.)$"}, + new String[] {"foobarbaz","f","z"}, + new int[] {0,0,0}, + new int[] {9,9,9}, + new int[] {1,0,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.).+(.)$"}, + new String[] {"f","o","b","r","b","z"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + false + ); + + testPatterns( + "foo bar baz", + new String[] 
{"^(.).+(.)$"}, + new String[] {"foo","f","o","bar","b","r","baz","b","z"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + true + ); + } + + public void testCaptureMiddleEnd() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"(.)(.)$"}, + new String[] {"a","z"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"(.)(.)$"}, + new String[] {"foobarbaz","a","z"}, + new int[] {0,0,0}, + new int[] {9,9,9}, + new int[] {1,0,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"(.)(.)$"}, + new String[] {"o","o","a","r","a","z"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"(.)(.)$"}, + new String[] {"foo","o","o","bar","a","r","baz","a","z"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + true + ); + } + + public void testMultiCaptureOverlap() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"(.(.(.)))"}, + new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, + new int[] {0,0,0,0,0,0,0,0,0}, + new int[] {9,9,9,9,9,9,9,9,9}, + new int[] {1,0,0,0,0,0,0,0,0}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"(.(.(.)))"}, + new String[] {"foobarbaz","foo","oo","o","bar","ar","r","baz","az","z"}, + new int[] {0,0,0,0,0,0,0,0,0,0}, + new int[] {9,9,9,9,9,9,9,9,9,9}, + new int[] {1,0,0,0,0,0,0,0,0,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"(.(.(.)))"}, + new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"(.(.(.)))"}, + new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] 
{3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + true + ); + } + + public void testMultiPattern() throws Exception { + testPatterns( + "aaabbbaaa", + new String[] {"(aaa)","(bbb)","(ccc)"}, + new String[] {"aaa","bbb","aaa"}, + new int[] {0,0,0}, + new int[] {9,9,9}, + new int[] {1,0,0}, + false + ); + testPatterns( + "aaabbbaaa", + new String[] {"(aaa)","(bbb)","(ccc)"}, + new String[] {"aaabbbaaa","aaa","bbb","aaa"}, + new int[] {0,0,0,0}, + new int[] {9,9,9,9}, + new int[] {1,0,0,0}, + true + ); + + testPatterns( + "aaa bbb aaa", + new String[] {"(aaa)","(bbb)","(ccc)"}, + new String[] {"aaa","bbb","aaa"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "aaa bbb aaa", + new String[] {"(aaa)","(bbb)","(ccc)"}, + new String[] {"aaa","bbb","aaa"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + + public void testCamelCase() throws Exception { + testPatterns( + "letsPartyLIKEits1999_dude", + new String[] { + "([A-Z]{2,})", + "(? + * For example, a pattern like: + *
+ * + *
+ * "(https?://([a-zA-Z\-_0-9.]+))"
+ *
+ * when matched against the string "http://www.foo.com/index" would return the + * tokens "http://www.foo.com" and "www.foo.com". + *
+ * + *+ * If none of the patterns match, or if preserveOriginal is true, the original + * token will be preserved. + *
+ *
+ * Each pattern is matched as often as it can be, so the pattern
+ * "(...)", when matched against "abcdefghi" would
+ * produce ["abc","def","ghi"]
+ *
+ * A camelCaseFilter could be written as: + *
+ *
+ *
+ * "([A-Z]{2,})",
+ *
+ * "(?<![A-Z])([A-Z][a-z]+)",
+ * "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)",
+ * "([0-9]+)"
+ *
+ * plus if {@link #preserveOriginal} is true, it would also return
+ * "camelCaseFilter"
+ *
+ * <fieldType name="text_ptncapturegroup" class="solr.TextField" positionIncrementGap="100"> + * <analyzer> + * <tokenizer class="solr.KeywordTokenizerFactory"/> + * <filter class="solr.PatternCaptureGroupFilterFactory" pattern="([^a-z])" preserve_original="true"/> + * </analyzer> + * </fieldType>+ * + * @see PatternCaptureGroupTokenFilter + */ +public class PatternCaptureGroupFilterFactory extends TokenFilterFactory { + private Pattern pattern; + private boolean preserveOriginal = true; + + public PatternCaptureGroupFilterFactory(Map