Index: solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java =================================================================== --- solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java (revision 1040997) +++ solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java (working copy) @@ -17,7 +17,6 @@ package org.apache.solr.handler.dataimport; import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; -import org.apache.lucene.analysis.CharReader; import java.io.IOException; import java.io.StringReader; @@ -74,7 +73,7 @@ StringBuilder out = new StringBuilder(); StringReader strReader = new StringReader(value); try { - HTMLStripCharFilter html = new HTMLStripCharFilter(CharReader.get(strReader.markSupported() ? strReader : new BufferedReader(strReader))); + HTMLStripCharFilter html = new HTMLStripCharFilter(strReader.markSupported() ? 
strReader : new BufferedReader(strReader)); char[] cbuf = new char[1024 * 10]; while (true) { int count = html.read(cbuf); Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (revision 1040997) +++ solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (working copy) @@ -22,8 +22,7 @@ import java.util.HashMap; import java.util.Map; -import org.apache.lucene.analysis.CharReader; -import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; @@ -42,8 +41,7 @@ args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); args.put("replacement", "$1$2$3"); factory.init(args); - CharStream cs = factory.create( - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = factory.create( new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "this", "is", "test." 
}, @@ -59,8 +57,7 @@ Map args = new HashMap(); args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); factory.init(args); - CharStream cs = factory.create( - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = factory.create( new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertFalse(ts.incrementToken()); } @@ -75,8 +72,7 @@ args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); args.put("replacement", "$1#$2#$3"); factory.init(args); - CharStream cs = factory.create( - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = factory.create( new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "aa#bb#cc" }, Index: solr/src/java/org/apache/solr/schema/BoolField.java =================================================================== --- solr/src/java/org/apache/solr/schema/BoolField.java (revision 1040997) +++ solr/src/java/org/apache/solr/schema/BoolField.java (working copy) @@ -82,7 +82,7 @@ } }; - return new TokenStreamInfo(tokenizer, tokenizer); + return new TokenStreamInfo(tokenizer, tokenizer, null); } }; Index: solr/src/java/org/apache/solr/schema/FieldType.java =================================================================== --- solr/src/java/org/apache/solr/schema/FieldType.java (revision 1040997) +++ solr/src/java/org/apache/solr/schema/FieldType.java (working copy) @@ -415,7 +415,7 @@ } }; - return new TokenStreamInfo(ts, ts); + return new TokenStreamInfo(ts, ts, null); } } Index: solr/src/java/org/apache/solr/analysis/CharFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/CharFilterFactory.java (revision 1040997) +++ solr/src/java/org/apache/solr/analysis/CharFilterFactory.java (working copy) @@ -17,9 +17,10 @@ package org.apache.solr.analysis; +import java.io.Reader; import java.util.Map; -import org.apache.lucene.analysis.CharStream; +import 
org.apache.lucene.analysis.CharFilter; /** * @@ -30,5 +31,5 @@ public interface CharFilterFactory { public void init(Map args); public Map getArgs(); - public CharStream create(CharStream input); + public CharFilter create(Reader input); } Index: solr/src/java/org/apache/solr/analysis/SolrAnalyzer.java =================================================================== --- solr/src/java/org/apache/solr/analysis/SolrAnalyzer.java (revision 1040997) +++ solr/src/java/org/apache/solr/analysis/SolrAnalyzer.java (working copy) @@ -36,11 +36,6 @@ return posIncGap; } - /** wrap the reader in a CharStream, if appropriate */ - public Reader charStream(Reader reader){ - return reader; - } - @Override public TokenStream tokenStream(String fieldName, Reader reader) { return getStream(fieldName, reader).getTokenStream(); @@ -49,12 +44,15 @@ public static class TokenStreamInfo { private final Tokenizer tokenizer; private final TokenStream tokenStream; - public TokenStreamInfo(Tokenizer tokenizer, TokenStream tokenStream) { + private final CharFilter charFilterChain; + public TokenStreamInfo(Tokenizer tokenizer, TokenStream tokenStream, CharFilter charFilterChain) { this.tokenizer = tokenizer; this.tokenStream = tokenStream; + this.charFilterChain = charFilterChain; } public Tokenizer getTokenizer() { return tokenizer; } public TokenStream getTokenStream() { return tokenStream; } + public CharFilter getCharFilterChain() { return charFilterChain; } } @@ -65,7 +63,12 @@ // if (true) return tokenStream(fieldName, reader); TokenStreamInfo tsi = (TokenStreamInfo)getPreviousTokenStream(); if (tsi != null) { - tsi.getTokenizer().reset(charStream(reader)); + CharFilter cf = tsi.getCharFilterChain(); + if (cf != null) { + cf.reset(reader); + reader = cf; + } + tsi.getTokenizer().reset(reader); // the consumer will currently call reset() on the TokenStream to hit all the filters. // this isn't necessarily guaranteed by the APIs... 
but is currently done // by lucene indexing in DocInverterPerField, and in the QueryParser Index: solr/src/java/org/apache/solr/analysis/TokenizerChain.java =================================================================== --- solr/src/java/org/apache/solr/analysis/TokenizerChain.java (revision 1040997) +++ solr/src/java/org/apache/solr/analysis/TokenizerChain.java (working copy) @@ -17,9 +17,8 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.CharStream; -import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.Tokenizer; import java.io.Reader; @@ -51,26 +50,20 @@ public TokenizerFactory getTokenizerFactory() { return tokenizer; } public TokenFilterFactory[] getTokenFilterFactories() { return filters; } - @Override - public Reader charStream(Reader reader){ - if( charFilters != null && charFilters.length > 0 ){ - CharStream cs = CharReader.get( reader ); - for (int i=0; i tokens = analyzeTokenStream(tokenStream); namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context)); @@ -234,7 +238,7 @@ return tokensNamedLists; } - private String writeCharStream(NamedList out, CharStream input ){ + private String writeCharStream(NamedList out, CharFilter input ){ final int BUFFER_SIZE = 1024; char[] buf = new char[BUFFER_SIZE]; int len = 0; Index: solr/src/webapp/web/admin/analysis.jsp =================================================================== --- solr/src/webapp/web/admin/analysis.jsp (revision 1040997) +++ solr/src/webapp/web/admin/analysis.jsp (working copy) @@ -21,8 +21,7 @@ org.apache.lucene.util.BytesRef, org.apache.lucene.analysis.TokenStream, org.apache.lucene.index.Payload, - org.apache.lucene.analysis.CharReader, - org.apache.lucene.analysis.CharStream, + org.apache.lucene.analysis.CharFilter, org.apache.lucene.analysis.tokenattributes.*, org.apache.solr.analysis.CharFilterFactory, 
org.apache.solr.analysis.TokenFilterFactory, @@ -191,8 +190,7 @@ if( cfiltfacs != null ){ String source = val; for(CharFilterFactory cfiltfac : cfiltfacs ){ - CharStream reader = CharReader.get(new StringReader(source)); - reader = cfiltfac.create(reader); + CharFilter reader = cfiltfac.create(new StringReader(source)); if(verbose){ writeHeader(out, cfiltfac.getClass(), cfiltfac.getArgs()); source = writeCharStream(out, reader); @@ -200,7 +198,12 @@ } } - TokenStream tstream = tfac.create(tchain.charStream(new StringReader(val))); + Reader reader = new StringReader(val); + if (cfiltfacs != null) { + for (int i = 0; i < cfiltfacs.length; i++) + reader = cfiltfacs[i].create(reader); + } + TokenStream tstream = tfac.create(reader); List tokens = getTokens(tstream); if (verbose) { writeHeader(out, tfac.getClass(), tfac.getArgs()); @@ -495,7 +498,7 @@ out.println(""); } - static String writeCharStream(JspWriter out, CharStream input) throws IOException { + static String writeCharStream(JspWriter out, CharFilter input) throws IOException { out.println(""); out.println(""); Index: modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (revision 1040997) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (working copy) @@ -20,8 +20,7 @@ import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CharReader; -import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; @@ -47,7 +46,7 @@ } public void testReaderReset() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) ); + 
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) ); char[] buf = new char[10]; int len = cs.read(buf, 0, 10); assertEquals( 1, len ); @@ -63,55 +62,55 @@ } public void testNothingChange() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}); } public void test1to1() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "h" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}); } public void test1to2() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "j" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}); } public void test1to3() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "k" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}); } public void test2to4() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "ll" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}); } public void test2to1() throws Exception { - CharStream cs = new 
MappingCharFilter( normMap, new StringReader( "aa" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "aa" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}); } public void test3to1() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}); } public void test4to2() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}); } public void test5to0() throws Exception { - CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "empty" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[0]); } @@ -135,7 +134,7 @@ // aa,20,22 => a,20,22 // public void testTokenStream() throws Exception { - CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h i j k ll cccc bbb aa" ) ) ); + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "h i j k ll cccc bbb aa" ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"i","i","jj","kkk","llll","cc","b","a"}, @@ -155,8 +154,8 @@ // ll,5,7 => llllllll,5,7 // h,8,9 => i,8,9 public void testChained() throws Exception { - CharStream cs = new MappingCharFilter( normMap, - new MappingCharFilter( normMap, CharReader.get( new StringReader( 
"aaaa ll h" ) ) ) ); + CharFilter cs = new MappingCharFilter( normMap, + new MappingCharFilter( normMap, new StringReader( "aaaa ll h" ) ) ); TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[]{"a","llllllll","i"}, Index: modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestCharFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestCharFilter.java (revision 1040997) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestCharFilter.java (working copy) @@ -17,38 +17,37 @@ package org.apache.lucene.analysis.charfilter; +import java.io.Reader; import java.io.StringReader; -import org.apache.lucene.analysis.CharReader; -import org.apache.lucene.analysis.CharStream; -import org.apache.lucene.analysis.charfilter.CharFilter; +import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.util.LuceneTestCase; public class TestCharFilter extends LuceneTestCase { public void testCharFilter1() throws Exception { - CharStream cs = new CharFilter1( CharReader.get( new StringReader("") ) ); + CharFilter cs = new CharFilter1( new StringReader("") ); assertEquals( "corrected offset is invalid", 1, cs.correctOffset( 0 ) ); } public void testCharFilter2() throws Exception { - CharStream cs = new CharFilter2( CharReader.get( new StringReader("") ) ); + CharFilter cs = new CharFilter2( new StringReader("") ); assertEquals( "corrected offset is invalid", 2, cs.correctOffset( 0 ) ); } public void testCharFilter12() throws Exception { - CharStream cs = new CharFilter2( new CharFilter1( CharReader.get( new StringReader("") ) ) ); + CharFilter cs = new CharFilter2( new CharFilter1( new StringReader("") ) ); assertEquals( "corrected offset is invalid", 3, cs.correctOffset( 0 ) ); } public void testCharFilter11() throws Exception { - CharStream cs = new CharFilter1( new 
CharFilter1( CharReader.get( new StringReader("") ) ) ); + CharFilter cs = new CharFilter1( new CharFilter1( new StringReader("") ) ); assertEquals( "corrected offset is invalid", 2, cs.correctOffset( 0 ) ); } static class CharFilter1 extends CharFilter { - protected CharFilter1(CharStream in) { + protected CharFilter1(Reader in) { super(in); } @@ -60,7 +59,7 @@ static class CharFilter2 extends CharFilter { - protected CharFilter2(CharStream in) { + protected CharFilter2(Reader in) { super(in); } Index: modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (revision 1040997) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (working copy) @@ -26,8 +26,6 @@ import java.util.HashSet; import java.util.Set; -import org.apache.lucene.analysis.CharReader; - import org.apache.lucene.util.LuceneTestCase; public class HTMLStripCharFilterTest extends LuceneTestCase { @@ -50,7 +48,7 @@ String gold = " this is some text here is a link and " + "another link . " + "This is an entity: & plus a <. Here is an &. 
"; - HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html))); + HTMLStripCharFilter reader = new HTMLStripCharFilter(new StringReader(html)); StringBuilder builder = new StringBuilder(); int ch = -1; char [] goldArray = gold.toCharArray(); @@ -68,7 +66,7 @@ //Some sanity checks, but not a full-fledged check public void testHTML() throws Exception { InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html"); - HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8"))); + HTMLStripCharFilter reader = new HTMLStripCharFilter(new InputStreamReader(stream, "UTF-8")); StringBuilder builder = new StringBuilder(); int ch = -1; while ((ch = reader.read()) != -1){ @@ -88,7 +86,7 @@ String gold = "\u0393"; Set set = new HashSet(); set.add("reserved"); - Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); + Reader reader = new HTMLStripCharFilter(new StringReader(test), set); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ @@ -105,7 +103,7 @@ String gold = " \u00DCbermensch = \u0393 bar \u0393"; Set set = new HashSet(); set.add("reserved"); - Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); + Reader reader = new HTMLStripCharFilter(new StringReader(test), set); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ @@ -122,7 +120,7 @@ String gold = " ! 
@ and ’"; Set set = new HashSet(); set.add("reserved"); - Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); + Reader reader = new HTMLStripCharFilter(new StringReader(test), set); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ @@ -138,7 +136,7 @@ String test = "aaa bbb eeee ffff "; Set set = new HashSet(); set.add("reserved"); - Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); + Reader reader = new HTMLStripCharFilter(new StringReader(test), set); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ @@ -155,7 +153,7 @@ public void testMalformedHTML() throws Exception { String test = "a > "; String gold = "a \"ü\"" ); NormalizeCharMap normMap = new NormalizeCharMap(); normMap.add("ü", "ü"); - CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); + CharFilter charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) ); // create PatternTokenizer TokenStream stream = new PatternTokenizer(charStream, Pattern.compile("[,;/\\s]+"), -1); @@ -86,7 +85,7 @@ new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }); - charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); + charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) ); stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0); assertTokenStreamContents(stream, new String[] { "Günther", "Günther" }, Index: modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (revision 1040997) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (working copy) @@ -22,8 +22,7 
@@ import java.util.regex.Pattern; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CharReader; -import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; @@ -37,8 +36,8 @@ // this is test. public void testNothingChange() throws IOException { final String BLOCK = "this is test."; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3", + new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "this", "is", "test." }, @@ -50,8 +49,8 @@ // aa bb cc public void testReplaceByEmpty() throws IOException { final String BLOCK = "aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "", + new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertFalse(ts.incrementToken()); } @@ -61,8 +60,8 @@ // aa#bb#cc public void test1block1matchSameLength() throws IOException { final String BLOCK = "aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3", + new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa#bb#cc" }, @@ -76,8 +75,8 @@ // aa##bb###cc dd public void test1block1matchLonger() throws IOException { final String BLOCK = "aa bb cc dd"; - 
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", + new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa##bb###cc", "dd" }, @@ -90,8 +89,8 @@ // aa aa public void test1block2matchLonger() throws IOException { final String BLOCK = " a a"; - CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("a"), "aa", + new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa", "aa" }, @@ -105,8 +104,8 @@ // aa#bb dd public void test1block1matchShorter() throws IOException { final String BLOCK = "aa bb cc dd"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", + new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa#bb", "dd" }, @@ -120,8 +119,8 @@ // aa bb cc --- aa bb aa bb cc public void test1blockMultiMatches() throws IOException { final String BLOCK = " aa bb cc --- aa bb aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", + new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" }, @@ -135,8 +134,8 
@@ // aa##bb cc --- aa##bb aa. bb aa##bb cc public void test2blocksMultiMatches() throws IOException { final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".", + new StringReader( BLOCK ) ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); assertTokenStreamContents(ts, new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" }, @@ -150,8 +149,8 @@ // aa b - c . --- b aa . c c b public void testChain() throws IOException { final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb"; - CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".", - CharReader.get( new StringReader( BLOCK ) ) ); + CharFilter cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".", + new StringReader( BLOCK ) ); cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs ); cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs ); TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); Index: modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java (revision 1040997) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java (working copy) @@ -21,9 +21,6 @@ import java.io.Reader; import java.util.LinkedList; -import org.apache.lucene.analysis.CharReader; -import org.apache.lucene.analysis.CharStream; - /** * Simplistic {@link CharFilter} that applies the mappings * contained in a {@link NormalizeCharMap} to the character @@ -38,15 +35,9 @@ private int charPointer; private int nextCharCounter; - /** Default constructor that takes a {@link CharStream}. 
*/ - public MappingCharFilter(NormalizeCharMap normMap, CharStream in) { - super(in); - this.normMap = normMap; - } - /** Easy-use constructor that takes a {@link Reader}. */ public MappingCharFilter(NormalizeCharMap normMap, Reader in) { - super(CharReader.get(in)); + super(in); this.normMap = normMap; } @@ -83,7 +74,7 @@ if (buffer != null && !buffer.isEmpty()) { return buffer.removeFirst().charValue(); } - return input.read(); + return in.read(); } private void pushChar(int c) { @@ -123,7 +114,7 @@ @Override public int read(char[] cbuf, int off, int len) throws IOException { char[] tmp = new char[len]; - int l = input.read(tmp, 0, len); + int l = in.read(tmp, 0, len); if (l != -1) { for(int i = 0; i < l; i++) pushLastChar(tmp[i]); @@ -137,4 +128,12 @@ } return l == 0 ? -1 : l; } + + @Override + public void reset(Reader reader) { + super.reset(reader); + buffer.clear(); + replacement = null; + charPointer = nextCharCounter = 0; + } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/CharFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/CharFilter.java (revision 1040997) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/CharFilter.java (working copy) @@ -1,82 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.analysis.charfilter; - -import java.io.IOException; - -import org.apache.lucene.analysis.CharStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Subclasses of CharFilter can be chained to filter CharStream. - * They can be used as {@link java.io.Reader} with additional offset - * correction. {@link Tokenizer}s will automatically use {@link #correctOffset} - * if a CharFilter/CharStream subclass is used. - */ -public abstract class CharFilter extends CharStream { - - protected CharStream input; - - protected CharFilter(CharStream in) { - input = in; - } - - /** - * Subclass may want to override to correct the current offset. - * - * @param currentOff current offset - * @return corrected offset - */ - protected int correct(int currentOff) { - return currentOff; - } - - /** - * Chains the corrected offset through the input - * CharFilter. 
- */ - @Override - public final int correctOffset(int currentOff) { - return input.correctOffset(correct(currentOff)); - } - - @Override - public void close() throws IOException { - input.close(); - } - - @Override - public int read(char[] cbuf, int off, int len) throws IOException { - return input.read(cbuf, off, len); - } - - @Override - public boolean markSupported(){ - return input.markSupported(); - } - - @Override - public void mark( int readAheadLimit ) throws IOException { - input.mark(readAheadLimit); - } - - @Override - public void reset() throws IOException { - input.reset(); - } -} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java (revision 1040997) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java (working copy) @@ -25,8 +25,6 @@ import java.util.Set; import org.apache.lucene.analysis.charfilter.BaseCharFilter; -import org.apache.lucene.analysis.CharReader; -import org.apache.lucene.analysis.CharStream; /** * A CharFilter that wraps another Reader and attempts to strip out HTML constructs. @@ -54,21 +52,21 @@ public static void main(String[] args) throws IOException { Reader in = new HTMLStripCharFilter( - CharReader.get(new InputStreamReader(System.in))); + new InputStreamReader(System.in)); int ch; while ( (ch=in.read()) != -1 ) System.out.print((char)ch); } - public HTMLStripCharFilter(CharStream source) { - super(source.markSupported() ? source : CharReader.get(new BufferedReader(source))); + public HTMLStripCharFilter(Reader source) { + super(source.markSupported() ? 
source : new BufferedReader(source)); } - public HTMLStripCharFilter(CharStream source, Set escapedTags){ + public HTMLStripCharFilter(Reader source, Set escapedTags){ this(source); this.escapedTags = escapedTags; } - public HTMLStripCharFilter(CharStream source, Set escapedTags, int readAheadLimit){ + public HTMLStripCharFilter(Reader source, Set escapedTags, int readAheadLimit){ this(source); this.escapedTags = escapedTags; this.readAheadLimit = readAheadLimit; @@ -87,7 +85,7 @@ return ch; } numRead++; - return input.read(); + return in.read(); } private int nextSkipWS() throws IOException { @@ -101,7 +99,7 @@ if (len>0) { return pushed.charAt(len-1); } - int ch = input.read(); + int ch = in.read(); push(ch); return ch; } @@ -163,11 +161,11 @@ private void saveState() throws IOException { lastMark = numRead; - input.mark(readAheadLimit); + in.mark(readAheadLimit); } private void restoreState() throws IOException { - input.reset(); + in.reset(); pushed.setLength(0); } @@ -754,11 +752,6 @@ return i; } - public void close() throws IOException { - input.close(); - } - - private static final HashMap entityTable; static { entityTable = new HashMap(); @@ -772,7 +765,14 @@ // special-case nbsp to a simple space instead of 0xa0 entityTable.put("nbsp",new Character(' ')); } - + + @Override + public void reset(Reader reader) { + super.reset(reader); + numWhitespace = numRead = numEaten = numReturned = lastMark = 0; + pushed.setLength(0); + sb.setLength(0); + } } /********************* htmlentity.py ********************** Index: modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java (revision 1040997) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java (working copy) @@ -17,7 +17,9 @@ package org.apache.lucene.analysis.charfilter; 
-import org.apache.lucene.analysis.CharStream; +import java.io.Reader; + +import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.util.ArrayUtil; /** @@ -32,7 +34,7 @@ private int diffs[]; private int size = 0; - public BaseCharFilter(CharStream in) { + public BaseCharFilter(Reader in) { super(in); } @@ -83,4 +85,10 @@ offsets[size] = off; diffs[size++] = cumulativeDiff; } + + @Override + public void reset(Reader reader) { + super.reset(reader); + size = 0; + } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (revision 1040997) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (working copy) @@ -18,12 +18,12 @@ package org.apache.lucene.analysis.pattern; import java.io.IOException; +import java.io.Reader; import java.util.LinkedList; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.analysis.charfilter.BaseCharFilter; -import org.apache.lucene.analysis.CharStream; /** * CharFilter that uses a regular expression for the target of replace string. 
@@ -62,22 +62,22 @@ private String replaceBlockBuffer; private int replaceBlockBufferOffset; - public PatternReplaceCharFilter( Pattern pattern, String replacement, CharStream in ){ + public PatternReplaceCharFilter( Pattern pattern, String replacement, Reader in ){ this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in ); } public PatternReplaceCharFilter( Pattern pattern, String replacement, - int maxBlockChars, CharStream in ){ + int maxBlockChars, Reader in ){ this( pattern, replacement, maxBlockChars, null, in ); } public PatternReplaceCharFilter( Pattern pattern, String replacement, - String blockDelimiters, CharStream in ){ + String blockDelimiters, Reader in ){ this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in ); } public PatternReplaceCharFilter( Pattern pattern, String replacement, - int maxBlockChars, String blockDelimiters, CharStream in ){ + int maxBlockChars, String blockDelimiters, Reader in ){ super( in ); this.pattern = pattern; this.replacement = replacement; @@ -122,7 +122,7 @@ public int read(char[] cbuf, int off, int len) throws IOException { char[] tmp = new char[len]; - int l = input.read(tmp, 0, len); + int l = in.read(tmp, 0, len); if (l != -1) { for(int i = 0; i < l; i++) pushLastChar(tmp[i]); @@ -142,7 +142,7 @@ nextCharCounter++; return buffer.removeFirst().charValue(); } - int c = input.read(); + int c = in.read(); if( c != -1 ) nextCharCounter++; return c; @@ -189,4 +189,13 @@ m.appendTail( replaceBlock ); return replaceBlock.toString(); } + + @Override + public void reset(Reader reader) { + super.reset(reader); + buffer.clear(); + nextCharCounter = blockBufferLength = replaceBlockBufferOffset = 0; + blockBuffer = null; + replaceBlockBuffer = null; + } } Index: lucene/src/java/org/apache/lucene/analysis/CharReader.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/CharReader.java (revision 1040997) +++ 
lucene/src/java/org/apache/lucene/analysis/CharReader.java (working copy) @@ -1,71 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.analysis; - -import java.io.IOException; -import java.io.Reader; - -/** - * CharReader is a Reader wrapper. It reads chars from - * Reader and outputs {@link CharStream}, defining an - * identify function {@link #correctOffset} method that - * simply returns the provided offset. - */ -public final class CharReader extends CharStream { - - private final Reader input; - - public static CharStream get(Reader input) { - return input instanceof CharStream ? 
- (CharStream)input : new CharReader(input); - } - - private CharReader(Reader in) { - input = in; - } - - @Override - public int correctOffset(int currentOff) { - return currentOff; - } - - @Override - public void close() throws IOException { - input.close(); - } - - @Override - public int read(char[] cbuf, int off, int len) throws IOException { - return input.read(cbuf, off, len); - } - - @Override - public boolean markSupported(){ - return input.markSupported(); - } - - @Override - public void mark( int readAheadLimit ) throws IOException { - input.mark(readAheadLimit); - } - - @Override - public void reset() throws IOException { - input.reset(); - } -} Index: lucene/src/java/org/apache/lucene/analysis/CharFilter.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/CharFilter.java (revision 0) +++ lucene/src/java/org/apache/lucene/analysis/CharFilter.java (revision 0) @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis; + +import java.io.FilterReader; +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; + +/** + * Subclasses of CharFilter can be chained to filter a Reader. + * They can be used as {@link java.io.Reader} with additional offset + * correction. {@link Tokenizer}s will automatically use {@link #correctOffset} + * if a CharFilter subclass is used. + */ +public abstract class CharFilter extends FilterReader { + public CharFilter(Reader in) { + super(in); + } + + /** + * Subclasses may want to override to correct the current offset. + * + * @param currentOff current offset + * @return corrected offset + */ + protected int correct(int currentOff) { + return currentOff; + } + + /** + * Chains the corrected offset through the input + * CharFilter. + */ + public final int correctOffset(int currentOff) { + final int corrected = correct(currentOff); + return (in instanceof CharFilter) ? ((CharFilter) in).correctOffset(corrected) : corrected; + } + + /** Expert: Reset the CharFilter chain to a new reader. Typically, an + * analyzer (in its reusableTokenStream method) will use + * this to re-use a previously created CharFilter. */ + public void reset(Reader reader) { + // chain + if (in instanceof CharFilter) + ((CharFilter) in).reset(reader); + else + this.in = reader; + } +} Property changes on: lucene\src\java\org\apache\lucene\analysis\CharFilter.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/analysis/CharStream.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/CharStream.java (revision 1040997) +++ lucene/src/java/org/apache/lucene/analysis/CharStream.java (working copy) @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements.
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.analysis; - -import java.io.Reader; - -/** - * CharStream adds {@link #correctOffset} - * functionality over {@link Reader}. All Tokenizers accept a - * CharStream instead of {@link Reader} as input, which enables - * arbitrary character based filtering before tokenization. - * The {@link #correctOffset} method fixed offsets to account for - * removal or insertion of characters, so that the offsets - * reported in the tokens match the character offsets of the - * original Reader. - */ -public abstract class CharStream extends Reader { - - /** - * Called by CharFilter(s) and Tokenizer to correct token offset. - * - * @param currentOff offset as seen in the output - * @return corrected offset based on the input - */ - public abstract int correctOffset(int currentOff); -} Index: lucene/src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/Tokenizer.java (revision 1040997) +++ lucene/src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -39,7 +39,7 @@ /** Construct a token stream processing the given input. 
*/ protected Tokenizer(Reader input) { - this.input = CharReader.get(input); + this.input = input; } /** Construct a tokenizer with null input using the given AttributeFactory. */ @@ -50,7 +50,7 @@ /** Construct a token stream processing the given input using the given AttributeFactory. */ protected Tokenizer(AttributeFactory factory, Reader input) { super(factory); - this.input = CharReader.get(input); + this.input = input; } /** Construct a token stream processing the given input using the given AttributeSource. */ @@ -61,7 +61,7 @@ /** Construct a token stream processing the given input using the given AttributeSource. */ protected Tokenizer(AttributeSource source, Reader input) { super(source); - this.input = CharReader.get(input); + this.input = input; } /** By default, closes the input Reader. */ @@ -82,7 +82,7 @@ * @see CharStream#correctOffset */ protected final int correctOffset(int currentOff) { - return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff; + return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff; } /** Expert: Reset the tokenizer to a new reader. Typically, an