Index: solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (revision 940789) +++ solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (working copy) @@ -17,120 +17,25 @@ package org.apache.solr.analysis; -import java.io.IOException; import java.io.StringReader; -import java.util.ArrayList; import java.util.HashMap; -import java.util.List; import java.util.Map; -import org.apache.lucene.analysis.CharReader; -import org.apache.lucene.analysis.CharStream; -import org.apache.lucene.analysis.charfilter.MappingCharFilter; -import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +/** Simple Tests to ensure this factory is working */ public class TestPatternTokenizerFactory extends BaseTokenTestCase { - public void testSplitting() throws Exception - { - String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'" - String[][] tests = { - // group pattern input output - { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" }, - { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" }, - { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" }, - { "-1", ":", "boo:and:foo", "boo and foo" }, - { "-1", "o", "boo:and:foo", "b :and:f" }, - { "0", ":", "boo:and:foo", ": :" }, - { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" }, - { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" } - }; - - - Map args = new HashMap(); - for( String[] test : tests ) { - args.put( PatternTokenizerFactory.GROUP, test[0] ); - args.put( PatternTokenizerFactory.PATTERN, test[1] ); + public void testFactory() throws Exception { + final String INPUT = "Günther Günther is here"; - PatternTokenizerFactory tokenizer = new PatternTokenizerFactory(); - tokenizer.init( args ); - - TokenStream stream = tokenizer.create( new 
StringReader( test[2] ) ); - String out = tsToString( stream ); - // System.out.println( test[2] + " ==> " + out ); - - assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out ); - - // Make sure it is the same as if we called 'split' - // test disabled, as we remove empty tokens - /*if( "-1".equals( test[0] ) ) { - String[] split = test[2].split( test[1] ); - stream = tokenizer.create( new StringReader( test[2] ) ); - int i=0; - for( Token t = stream.next(); null != t; t = stream.next() ) - { - assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) ); - } - }*/ - } - } - - public void testOffsetCorrection() throws Exception { - final String INPUT = "Günther Günther is here"; - - // create MappingCharFilter - MappingCharFilterFactory cfFactory = new MappingCharFilterFactory(); - List mappingRules = new ArrayList(); - mappingRules.add( "\"ü\" => \"ü\"" ); - NormalizeCharMap normMap = new NormalizeCharMap(); - cfFactory.parseRules( mappingRules, normMap ); - CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); - // create PatternTokenizer Map args = new HashMap(); args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" ); PatternTokenizerFactory tokFactory = new PatternTokenizerFactory(); tokFactory.init( args ); - TokenStream stream = tokFactory.create( charStream ); + TokenStream stream = tokFactory.create( new StringReader(INPUT) ); assertTokenStreamContents(stream, - new String[] { "Günther", "Günther", "is", "here" }, - new int[] { 0, 13, 26, 29 }, - new int[] { 12, 25, 28, 33 }); - - charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); - args.put( PatternTokenizerFactory.PATTERN, "Günther" ); - args.put( PatternTokenizerFactory.GROUP, "0" ); - tokFactory = new PatternTokenizerFactory(); - tokFactory.init( args ); - stream = tokFactory.create( charStream ); - assertTokenStreamContents(stream, - new String[] { 
"Günther", "Günther" }, - new int[] { 0, 13 }, - new int[] { 12, 25 }); + new String[] { "Günther", "Günther", "is", "here" }); } - - /** - * TODO: rewrite tests not to use string comparison. - * @deprecated only tests TermAttribute! - */ - private static String tsToString(TokenStream in) throws IOException { - StringBuilder out = new StringBuilder(); - CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); - // extra safety to enforce, that the state is not preserved and also - // assign bogus values - in.clearAttributes(); - termAtt.setEmpty().append("bogusTerm"); - while (in.incrementToken()) { - if (out.length() > 0) - out.append(' '); - out.append(termAtt.toString()); - in.clearAttributes(); - termAtt.setEmpty().append("bogusTerm"); - } - - in.close(); - return out.toString(); - } } Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (revision 940789) +++ solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (working copy) @@ -21,7 +21,6 @@ import java.io.StringReader; import java.util.HashMap; import java.util.Map; -import java.util.regex.Pattern; import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharStream; @@ -29,11 +28,9 @@ import org.apache.lucene.analysis.WhitespaceTokenizer; /** - * - * @version $Id$ - * + * Simple tests to ensure this factory is working */ -public class TestPatternReplaceCharFilter extends BaseTokenTestCase { +public class TestPatternReplaceCharFilterFactory extends BaseTokenTestCase { // 1111 // 01234567890123 @@ -86,99 +83,4 @@ new int[] { 0 }, new int[] { 8 }); } - - // 11111 - // 012345678901234 - // aa bb cc dd - // aa##bb###cc dd - public void test1block1matchLonger() throws IOException { - final String BLOCK = "aa bb cc dd"; - CharStream cs = new PatternReplaceCharFilter( 
pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", - CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); - assertTokenStreamContents(ts, - new String[] { "aa##bb###cc", "dd" }, - new int[] { 0, 9 }, - new int[] { 8, 11 }); - } - - // 01234567 - // a a - // aa aa - public void test1block2matchLonger() throws IOException { - final String BLOCK = " a a"; - CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", - CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); - assertTokenStreamContents(ts, - new String[] { "aa", "aa" }, - new int[] { 1, 4 }, - new int[] { 2, 5 }); - } - - // 11111 - // 012345678901234 - // aa bb cc dd - // aa#bb dd - public void test1block1matchShorter() throws IOException { - final String BLOCK = "aa bb cc dd"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", - CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); - assertTokenStreamContents(ts, - new String[] { "aa#bb", "dd" }, - new int[] { 0, 12 }, - new int[] { 11, 14 }); - } - - // 111111111122222222223333 - // 0123456789012345678901234567890123 - // aa bb cc --- aa bb aa bb cc - // aa bb cc --- aa bb aa bb cc - public void test1blockMultiMatches() throws IOException { - final String BLOCK = " aa bb cc --- aa bb aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", - CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); - assertTokenStreamContents(ts, - new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" }, - new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 }, - new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 }); - } - - // 11111111112222222222333333333 - // 012345678901234567890123456789012345678 - // aa bb cc --- aa bb aa. 
bb aa bb cc - // aa##bb cc --- aa##bb aa. bb aa##bb cc - public void test2blocksMultiMatches() throws IOException { - final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".", - CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); - assertTokenStreamContents(ts, - new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" }, - new int[] { 2, 8, 11, 15, 21, 25, 28, 36 }, - new int[] { 7, 10, 14, 20, 24, 27, 35, 38 }); - } - - // 11111111112222222222333333333 - // 012345678901234567890123456789012345678 - // a bb - ccc . --- bb a . ccc ccc bb - // aa b - c . --- b aa . c c b - public void testChain() throws IOException { - final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb"; - CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".", - CharReader.get( new StringReader( BLOCK ) ) ); - cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs ); - cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); - assertTokenStreamContents(ts, - new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" }, - new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 }, - new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 }); - } - - private Pattern pattern( String p ){ - return Pattern.compile( p ); - } } Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java (revision 940789) +++ solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java (working copy) @@ -21,61 +21,25 @@ import org.apache.lucene.analysis.WhitespaceTokenizer; import java.io.StringReader; -import java.util.regex.Pattern; +import 
java.util.HashMap; +import java.util.Map; /** - * @version $Id:$ + * Simple tests to ensure this factory is working */ -public class TestPatternReplaceFilter extends BaseTokenTestCase { +public class TestPatternReplaceFilterFactory extends BaseTokenTestCase { public void testReplaceAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; - TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), - Pattern.compile("a*b"), - "-", true); + PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory(); + Map args = new HashMap(); + args.put("pattern", "a*b"); + args.put("replacement", "-"); + factory.init(args); + TokenStream ts = factory.create + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); + assertTokenStreamContents(ts, new String[] { "-foo-foo-foo-", "-", "c-" }); } - - public void testReplaceFirst() throws Exception { - String input = "aabfooaabfooabfoob ab caaaaaaaaab"; - TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), - Pattern.compile("a*b"), - "-", false); - assertTokenStreamContents(ts, - new String[] { "-fooaabfooabfoob", "-", "c-" }); - } - - public void testStripFirst() throws Exception { - String input = "aabfooaabfooabfoob ab caaaaaaaaab"; - TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), - Pattern.compile("a*b"), - null, false); - assertTokenStreamContents(ts, - new String[] { "fooaabfooabfoob", "", "c" }); - } - - public void testStripAll() throws Exception { - String input = "aabfooaabfooabfoob ab caaaaaaaaab"; - TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), - Pattern.compile("a*b"), - null, true); - assertTokenStreamContents(ts, - new String[] { "foofoofoo", "", "c" }); - } - - public void testReplaceAllWithBackRef() throws Exception { - String input = 
"aabfooaabfooabfoob ab caaaaaaaaab"; - TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), - Pattern.compile("(a*)b"), - "$1\\$", true); - assertTokenStreamContents(ts, - new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" }); - } - } Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java (deleted) =================================================================== Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java (deleted) =================================================================== Index: solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (revision 940789) +++ solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.pattern.PatternTokenizer; import org.apache.solr.common.SolrException; Index: solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java (revision 940789) +++ solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java (working copy) @@ -17,6 +17,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.pattern.PatternReplaceFilter; import java.util.Map; import java.util.regex.Pattern; Index: solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java (deleted) =================================================================== Index: solr/src/java/org/apache/solr/analysis/PatternReplaceFilter.java (deleted) =================================================================== 
Index: solr/src/java/org/apache/solr/analysis/PatternTokenizer.java (deleted) =================================================================== Index: solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java (revision 940789) +++ solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java (working copy) @@ -22,6 +22,7 @@ import java.util.regex.PatternSyntaxException; import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter; /** * Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 940789) +++ lucene/contrib/CHANGES.txt (working copy) @@ -92,6 +92,9 @@ stemming. Add Turkish and Romanian stopwords lists to support this. (Robert Muir, Uwe Schindler, Simon Willnauer) + * LUCENE-2413: Deprecated PatternAnalyzer in contrib/analyzers, in favor of the + pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir) + New features * LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser. @@ -165,6 +168,8 @@ into subwords and performs optional transformations on subword groups. - o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which filters out Tokens at the same position and Term text as the previous token. + - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a + CharFilter, Tokenizer, and TokenFilter for transforming text with regexes. (... 
in progress) Build Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java (revision 0) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java (working copy) @@ -15,8 +15,9 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.pattern; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; @@ -26,12 +27,12 @@ /** * @version $Id:$ */ -public class TestPatternReplaceFilter extends BaseTokenTestCase { +public class TestPatternReplaceFilter extends BaseTokenStreamTestCase { public void testReplaceAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), Pattern.compile("a*b"), "-", true); assertTokenStreamContents(ts, @@ -41,7 +42,7 @@ public void testReplaceFirst() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), Pattern.compile("a*b"), "-", false); assertTokenStreamContents(ts, @@ -51,7 +52,7 @@ public void testStripFirst() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), Pattern.compile("a*b"), null, false); 
assertTokenStreamContents(ts, @@ -61,7 +62,7 @@ public void testStripAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), Pattern.compile("a*b"), null, true); assertTokenStreamContents(ts, @@ -71,7 +72,7 @@ public void testReplaceAllWithBackRef() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), Pattern.compile("(a*)b"), "$1\\$", true); assertTokenStreamContents(ts, Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (revision 0) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (working copy) @@ -15,15 +15,15 @@ * limitations under the License. 
*/ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.pattern; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; +import java.util.regex.Pattern; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.charfilter.MappingCharFilter; @@ -31,7 +31,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -public class TestPatternTokenizerFactory extends BaseTokenTestCase +public class TestPatternTokenizer extends BaseTokenStreamTestCase { public void testSplitting() throws Exception { @@ -48,16 +48,8 @@ { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" } }; - - Map args = new HashMap(); - for( String[] test : tests ) { - args.put( PatternTokenizerFactory.GROUP, test[0] ); - args.put( PatternTokenizerFactory.PATTERN, test[1] ); - - PatternTokenizerFactory tokenizer = new PatternTokenizerFactory(); - tokenizer.init( args ); - - TokenStream stream = tokenizer.create( new StringReader( test[2] ) ); + for( String[] test : tests ) { + TokenStream stream = new PatternTokenizer(new StringReader(test[2]), Pattern.compile(test[1]), Integer.parseInt(test[0])); String out = tsToString( stream ); // System.out.println( test[2] + " ==> " + out ); @@ -81,30 +73,21 @@ final String INPUT = "G&uuml;nther G&uuml;nther is here"; // create MappingCharFilter - MappingCharFilterFactory cfFactory = new MappingCharFilterFactory(); List mappingRules = new ArrayList(); mappingRules.add( "\"&uuml;\" => \"ü\"" ); NormalizeCharMap normMap = new NormalizeCharMap(); - cfFactory.parseRules( mappingRules, normMap ); + normMap.add("&uuml;", "ü"); CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); // create PatternTokenizer - Map args = new HashMap(); - 
args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" ); - PatternTokenizerFactory tokFactory = new PatternTokenizerFactory(); - tokFactory.init( args ); - TokenStream stream = tokFactory.create( charStream ); + TokenStream stream = new PatternTokenizer(charStream, Pattern.compile("[,;/\\s]+"), -1); assertTokenStreamContents(stream, new String[] { "Günther", "Günther", "is", "here" }, new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }); charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); - args.put( PatternTokenizerFactory.PATTERN, "Günther" ); - args.put( PatternTokenizerFactory.GROUP, "0" ); - tokFactory = new PatternTokenizerFactory(); - tokFactory.init( args ); - stream = tokFactory.create( charStream ); + stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0); assertTokenStreamContents(stream, new String[] { "Günther", "Günther" }, new int[] { 0, 13 }, Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (revision 0) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (working copy) @@ -15,39 +15,31 @@ * limitations under the License. 
*/ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.pattern; import java.io.IOException; import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; import java.util.regex.Pattern; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; /** - * - * @version $Id$ - * + * Tests {@link PatternReplaceCharFilter} */ -public class TestPatternReplaceCharFilter extends BaseTokenTestCase { +public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { // 1111 // 01234567890123 // this is test. public void testNothingChange() throws IOException { final String BLOCK = "this is test."; - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); - Map args = new HashMap(); - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); - args.put("replacement", "$1$2$3"); - factory.init(args); - CharStream cs = factory.create( + CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "this", "is", "test." 
}, new int[] { 0, 5, 8 }, @@ -58,13 +50,9 @@ // aa bb cc public void testReplaceByEmpty() throws IOException { final String BLOCK = "aa bb cc"; - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); - Map args = new HashMap(); - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); - factory.init(args); - CharStream cs = factory.create( + CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertFalse(ts.incrementToken()); } @@ -73,14 +61,9 @@ // aa#bb#cc public void test1block1matchSameLength() throws IOException { final String BLOCK = "aa bb cc"; - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); - Map args = new HashMap(); - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); - args.put("replacement", "$1#$2#$3"); - factory.init(args); - CharStream cs = factory.create( + CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa#bb#cc" }, new int[] { 0 }, @@ -95,7 +78,7 @@ final String BLOCK = "aa bb cc dd"; CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa##bb###cc", "dd" }, new int[] { 0, 9 }, @@ -109,7 +92,7 @@ final String BLOCK = " a a"; CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new 
WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa", "aa" }, new int[] { 1, 4 }, @@ -124,7 +107,7 @@ final String BLOCK = "aa bb cc dd"; CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa#bb", "dd" }, new int[] { 0, 12 }, @@ -139,7 +122,7 @@ final String BLOCK = " aa bb cc --- aa bb aa bb cc"; CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" }, new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 }, @@ -154,7 +137,7 @@ final String BLOCK = " aa bb cc --- aa bb aa. 
bb aa bb cc"; CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" }, new int[] { 2, 8, 11, 15, 21, 25, 28, 36 }, @@ -171,7 +154,7 @@ CharReader.get( new StringReader( BLOCK ) ) ); cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs ); cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs ); - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" }, new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 }, Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java (revision 940789) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java (working copy) @@ -62,8 +62,10 @@ * pat.tokenStream("content", "James is running round in the woods"), * "English")); * - * + * @deprecated use the pattern-based analysis in the analysis/pattern package instead. 
+ * This analyzer will be removed in a future release (4.1) */ +@Deprecated public final class PatternAnalyzer extends Analyzer { /** "\\W+"; Divides text at non-letters (NOT Character.isLetter(c)) */ Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java (revision 0) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.pattern; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -35,7 +35,6 @@ * string. *

* - * @version $Id:$ * @see Pattern */ public final class PatternReplaceFilter extends TokenFilter { Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (revision 0) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.pattern; import java.io.IOException; import java.io.Reader; @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.commons.io.IOUtils; /** * This tokenizer uses regex pattern matching to construct distinct tokens @@ -51,7 +50,6 @@ *

*

NOTE: This Tokenizer does not output tokens that are of zero length.

* - * @version $Id$ * @see Pattern */ public final class PatternTokenizer extends Tokenizer { @@ -59,7 +57,7 @@ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - private String str; + private final StringBuilder str = new StringBuilder(); private int index; private final Pattern pattern; @@ -71,7 +69,7 @@ super(input); this.pattern = pattern; this.group = group; - str = IOUtils.toString(input); + fillBuffer(str, input); matcher = pattern.matcher(str); index = 0; } @@ -84,11 +82,11 @@ // match a specific group while (matcher.find()) { - final String match = matcher.group(group); - if (match.length() == 0) continue; - termAtt.setEmpty().append(match); index = matcher.start(group); - offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group))); + final int endIndex = matcher.end(group); + if (index == endIndex) continue; + termAtt.setEmpty().append(str, index, endIndex); + offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex)); return true; } @@ -131,9 +129,19 @@ @Override public void reset(Reader input) throws IOException { super.reset(input); - str = IOUtils.toString(input); + fillBuffer(str, input); matcher.reset(str); index = 0; } - + + // TODO: we should see if we can make this tokenizer work without reading + // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ? 
+ final char[] buffer = new char[8192]; + private void fillBuffer(StringBuilder sb, Reader input) throws IOException { + int len; + sb.setLength(0); + while ((len = input.read(buffer)) > 0) { + sb.append(buffer, 0, len); + } + } } Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (revision 0) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.pattern; import java.io.IOException; import java.util.LinkedList; @@ -45,7 +45,6 @@ * highlight snippet="aa1<em>23bb</em>" *

* - * @version $Id$ * @since Solr 1.5 */ public class PatternReplaceCharFilter extends BaseCharFilter { Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html (revision 0) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html (revision 0) @@ -0,0 +1,22 @@ + + + + +Set of components for pattern-based (regex) analysis. + + Property changes on: lucene\contrib\analyzers\common\src\java\org\apache\lucene\analysis\pattern\package.html ___________________________________________________________________ Added: svn:eol-style + native