Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 787231)
+++ CHANGES.txt	(working copy)
@@ -211,6 +211,11 @@
 23. LUCENE-1673: Deprecated NumberTools in favour of the new
     NumericRangeQuery and its new indexing format for numeric or
     date values.  (Uwe Schindler)
+
+24. LUCENE-1466: Changed Tokenizer.input to be a CharStream; added
+    CharFilter and MappingCharFilter, which allow chaining & mapping
+    of characters before tokenizers run.  (Koji Sekiguchi via Mike
+    McCandless)
     
 Bug fixes
 
Index: src/test/org/apache/lucene/analysis/TestMappingCharFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestMappingCharFilter.java	(revision 0)
+++ src/test/org/apache/lucene/analysis/TestMappingCharFilter.java	(revision 0)
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.StringReader;
+import java.util.List;
+
+/**
+ * Checks the terms and offsets of the tokens produced when a
+ * {@link WhitespaceTokenizer} reads its input through one or more
+ * {@link MappingCharFilter}s.
+ */
+public class TestMappingCharFilter extends BaseTokenTestCase {
+
+  NormalizeCharMap normMap;
+
+  public void setUp() throws Exception {
+    super.setUp();
+    normMap = new NormalizeCharMap();
+
+    // mappings whose replacement is shorter than the match
+    normMap.add( "aa", "a" );
+    normMap.add( "bbb", "b" );
+    normMap.add( "cccc", "cc" );
+
+    // same-length and lengthening mappings
+    normMap.add( "h", "i" );
+    normMap.add( "j", "jj" );
+    normMap.add( "k", "kkk" );
+    normMap.add( "ll", "llll" );
+
+    // mapping that removes the match altogether
+    normMap.add( "empty", "" );
+  }
+
+  /** Tokenizes <code>text</code> on whitespace after a single pass through the mapping filter. */
+  private List tokenize( String text ) throws Exception {
+    CharStream filtered = new MappingCharFilter( normMap, CharReader.get( new StringReader( text ) ) );
+    return getTokens( new WhitespaceTokenizer( filtered ) );
+  }
+
+  /** Asserts that <code>text</code> tokenizes to the tokens described by <code>expected</code> (see {@link #tokens}). */
+  private void assertMapsTo( String text, String expected ) throws Exception {
+    List actual = tokenize( text );
+    assertTokEqualOff( tokens( expected ), actual );
+  }
+
+  public void testNothingChange() throws Exception {
+    assertMapsTo( "x", "x" );
+  }
+
+  public void test1to1() throws Exception {
+    assertMapsTo( "h", "i" );
+  }
+
+  public void test1to2() throws Exception {
+    assertMapsTo( "j", "jj,1,0,1" );
+  }
+
+  public void test1to3() throws Exception {
+    assertMapsTo( "k", "kkk,1,0,1" );
+  }
+
+  public void test2to4() throws Exception {
+    assertMapsTo( "ll", "llll,1,0,2" );
+  }
+
+  public void test2to1() throws Exception {
+    assertMapsTo( "aa", "a,1,0,2" );
+  }
+
+  public void test3to1() throws Exception {
+    assertMapsTo( "bbb", "b,1,0,3" );
+  }
+
+  public void test4to2() throws Exception {
+    assertMapsTo( "cccc", "cc,1,0,4" );
+  }
+
+  public void test5to0() throws Exception {
+    assertEquals( 0, tokenize( "empty" ).size() );
+  }
+
+  //
+  //                1111111111222
+  //      01234567890123456789012
+  //(in)  h i j k ll cccc bbb aa
+  //
+  //                1111111111222
+  //      01234567890123456789012
+  //(out) i i jj kkk llll cc b a
+  //
+  //    h, 0, 1 =>    i, 0, 1
+  //    i, 2, 3 =>    i, 2, 3
+  //    j, 4, 5 =>   jj, 4, 5
+  //    k, 6, 7 =>  kkk, 6, 7
+  //   ll, 8,10 => llll, 8,10
+  // cccc,11,15 =>   cc,11,15
+  //  bbb,16,19 =>    b,16,19
+  //   aa,20,22 =>    a,20,22
+  //
+  public void testTokenStream() throws Exception {
+    assertMapsTo( "h i j k ll cccc bbb aa",
+        "i,1,0,1 i,1,2,3 jj,1,4,5 kkk,1,6,7 llll,1,8,10 cc,1,11,15 b,1,16,19 a,1,20,22" );
+  }
+
+  //
+  //
+  //        0123456789
+  //(in)    aaaa ll h
+  //(out-1) aa llll i
+  //(out-2) a llllllll i
+  //
+  // aaaa,0,4 => a,0,4
+  //   ll,5,7 => llllllll,5,7
+  //    h,8,9 => i,8,9
+  public void testChained() throws Exception {
+    CharStream once = new MappingCharFilter( normMap, CharReader.get( new StringReader( "aaaa ll h" ) ) );
+    CharStream twice = new MappingCharFilter( normMap, once );
+    List actual = getTokens( new WhitespaceTokenizer( twice ) );
+    assertTokEqualOff( tokens( "a,1,0,4 llllllll,1,5,7 i,1,8,9" ), actual );
+  }
+}
Index: src/test/org/apache/lucene/analysis/BaseTokenTestCase.java
===================================================================
--- src/test/org/apache/lucene/analysis/BaseTokenTestCase.java	(revision 0)
+++ src/test/org/apache/lucene/analysis/BaseTokenTestCase.java	(revision 0)
@@ -0,0 +1,224 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Shared helpers for analysis tests: building expected {@link Token} lists from
+ * a compact string format, draining a {@link TokenStream}, and comparing lists.
+ */
+public abstract class BaseTokenTestCase extends LuceneTestCase {
+
+  /**
+   * Drains <code>in</code>, returning its terms joined by single spaces, and then
+   * closes it.
+   *
+   * @param in the stream to consume
+   * @return the space-separated terms; the empty string if there were no tokens
+   * @throws IOException if reading from or closing the stream fails
+   */
+  public static String tsToString(TokenStream in) throws IOException {
+    StringBuilder out = new StringBuilder();
+    String separator = "";
+    // A single loop stops at the first null, so an exhausted stream is never
+    // asked for another token.
+    for (Token t = in.next(); null != t; t = in.next()) {
+      out.append(separator).append(t.termBuffer(), 0, t.termLength());
+      separator = " ";
+    }
+    in.close();
+    return out.toString();
+  }
+/*
+  public List<String> tok2str(Iterable<Token> tokLst) {
+    ArrayList<String> lst = new ArrayList<String>();
+    for ( Token t : tokLst ) {
+      lst.add( new String(t.termBuffer(), 0, t.termLength()));
+    }
+    return lst;
+  }
+*/
+
+  /**
+   * Asserts that each token in either list has a counterpart in the other list
+   * with the same term and position; offsets are not compared.
+   */
+  public void assertTokEqual(List/*<Token>*/ a, List/*<Token>*/ b) {
+    assertTokEq(a,b,false);
+    assertTokEq(b,a,false);
+  }
+
+  /**
+   * Like {@link #assertTokEqual}, but start and end offsets must match as well.
+   */
+  public void assertTokEqualOff(List/*<Token>*/ a, List/*<Token>*/ b) {
+    assertTokEq(a,b,true);
+    assertTokEq(b,a,true);
+  }
+
+  /**
+   * Fails unless every token of <code>a</code> is found in <code>b</code> at the
+   * same position (the running sum of position increments).
+   *
+   * @param checkOff whether start and end offsets must match too
+   */
+  private void assertTokEq(List/*<Token>*/ a, List/*<Token>*/ b, boolean checkOff) {
+    int pos=0;
+    for (Iterator iter = a.iterator(); iter.hasNext();) {
+      Token tok = (Token)iter.next();
+      pos += tok.getPositionIncrement();
+      // -1 tells tokAt to ignore that offset
+      if (!tokAt(b, new String(tok.termBuffer(), 0, tok.termLength()), pos
+              , checkOff ? tok.startOffset() : -1
+              , checkOff ? tok.endOffset() : -1
+              ))
+      {
+        fail(a + "!=" + b);
+      }
+    }
+  }
+
+  /**
+   * Returns true if <code>lst</code> holds a token with term <code>val</code> at
+   * position <code>tokPos</code>.
+   *
+   * @param startOff required start offset, or -1 to accept any
+   * @param endOff required end offset, or -1 to accept any
+   */
+  public boolean tokAt(List/*<Token>*/ lst, String val, int tokPos, int startOff, int endOff) {
+    int pos=0;
+    for (Iterator iter = lst.iterator(); iter.hasNext();) {
+      Token tok = (Token)iter.next();
+      pos += tok.getPositionIncrement();
+      if (pos==tokPos && new String(tok.termBuffer(), 0, tok.termLength()).equals(val)
+          && (startOff==-1 || tok.startOffset()==startOff)
+          && (endOff  ==-1 || tok.endOffset()  ==endOff  )
+           )
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
+
+  /**
+   * Return a list of tokens according to a test string format:
+   * <pre>
+   * a b c  =&gt;  returns List&lt;Token&gt; [a,b,c]
+   * a/b   =&gt; tokens a and b share the same spot (b.positionIncrement=0)
+   * a,3/b/c =&gt; a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
+   * a,1,10,11  =&gt; "a" with positionIncrement=1, startOffset=10, endOffset=11
+   * </pre>
+   * Defaults: positionIncrement=1, startOffset=0, endOffset=startOffset+term length.
+   * Tokens after a "/" always get offsets 0,0; every token has type "TEST".
+   */
+  public List/*<Token>*/ tokens(String str) {
+    String[] arr = str.split(" ");
+    List/*<Token>*/ result = new ArrayList/*<Token>*/();
+    for (int i=0; i<arr.length; i++) {
+      String[] toks = arr[i].split("/");
+      String[] params = toks[0].split(",");
+
+      int posInc;
+      int start;
+      int end;
+
+      if (params.length > 1) {
+        posInc = Integer.parseInt(params[1]);
+      } else {
+        posInc = 1;
+      }
+
+      if (params.length > 2) {
+        start = Integer.parseInt(params[2]);
+      } else {
+        start = 0;
+      }
+
+      if (params.length > 3) {
+        end = Integer.parseInt(params[3]);
+      } else {
+        end = start + params[0].length();
+      }
+
+      Token t = new Token(params[0],start,end,"TEST");
+      t.setPositionIncrement(posInc);
+      
+      result.add(t);
+      for (int j=1; j<toks.length; j++) {
+        t = new Token(toks[j],0,0,"TEST");
+        t.setPositionIncrement(0);
+        result.add(t);
+      }
+    }
+    return result;
+  }
+
+  //------------------------------------------------------------------------
+  // These may be useful beyond test cases...
+  //------------------------------------------------------------------------
+
+  /** Collects tokens from <code>tstream</code>, in order, until its next() returns null. */
+  static List/*<Token>*/ getTokens(TokenStream tstream) throws IOException {
+    List/*<Token>*/ tokens = new ArrayList/*<Token>*/();
+    while (true) {
+      Token t = tstream.next();
+      if (t==null) break;
+      tokens.add(t);
+    }
+    return tokens;
+  }
+/*
+  public static class IterTokenStream extends TokenStream {
+    Iterator<Token> toks;
+    public IterTokenStream(Token... toks) {
+      this.toks = Arrays.asList(toks).iterator();
+    }
+    public IterTokenStream(Iterable<Token> toks) {
+      this.toks = toks.iterator();
+    }
+    public IterTokenStream(Iterator<Token> toks) {
+      this.toks = toks;
+    }
+    public IterTokenStream(String ... text) {
+      int off = 0;
+      ArrayList<Token> t = new ArrayList<Token>( text.length );
+      for( String txt : text ) {
+        t.add( new Token( txt, off, off+txt.length() ) );
+        off += txt.length() + 2;
+      }
+      this.toks = t.iterator();
+    }
+    @Override
+    public Token next() {
+      if (toks.hasNext()) {
+        return toks.next();
+      }
+      return null;
+    }
+  }
+*/
+}
Index: src/java/org/apache/lucene/analysis/MappingCharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/MappingCharFilter.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/MappingCharFilter.java	(revision 0)
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+/**
+ * {@link CharFilter} that applies the mappings contained in
+ * a {@link NormalizeCharMap} to the character stream.  At each position
+ * the longest matching mapping is applied; characters that start no
+ * mapping pass through unchanged.
+ *
+ * @version $Id$
+ */
+public class MappingCharFilter extends BaseCharFilter {
+
+  private final NormalizeCharMap normMap;
+  // Characters pushed back or read ahead; nextChar() drains these before the input.
+  //private LinkedList<Character> buffer;
+  private LinkedList buffer;
+  // Replacement text currently being emitted, and the index of its next character.
+  private String replacement;
+  private int charPointer;
+  // Incremented by nextChar() and decremented by pushChar(); positions offset corrections.
+  private int nextCharCounter;
+
+  public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
+    super(in);
+    this.normMap = normMap;
+  }
+
+  /**
+   * Returns the next character of the mapped stream, or -1 at end of input.
+   *
+   * @throws IOException if reading from the wrapped stream fails
+   */
+  public int read() throws IOException {
+    while(true) {
+      // Finish emitting the current replacement before consuming more input.
+      if (replacement != null && charPointer < replacement.length()) {
+        return replacement.charAt(charPointer++);
+      }
+
+      int firstChar = nextChar();
+      if (firstChar == -1) return -1;
+      NormalizeCharMap nm = normMap.submap != null ?
+        (NormalizeCharMap)normMap.submap.get(Character.valueOf((char) firstChar)) : null;
+      if (nm == null) return firstChar;     // no mapping starts with this character
+      NormalizeCharMap result = match(nm);
+      if (result == null) return firstChar; // only a prefix of some mapping matched
+      replacement = result.normStr;
+      charPointer = 0;
+      if (result.diff != 0) {
+        int prevCumulativeDiff = getLastCumulativeDiff();
+        if (result.diff < 0) {
+          // Replacement is longer than the match: one correction per extra character.
+          for(int i = 0; i < -result.diff ; i++)
+            addOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
+        } else {
+          // Replacement is shorter than the match: a single correction suffices.
+          addOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff);
+        }
+      }
+      // Loop to emit the replacement; an empty replacement falls through to the next character.
+    }
+  }
+
+  /** Takes the next unmapped character from the buffer if it has any, else from the input. */
+  private int nextChar() throws IOException {
+    nextCharCounter++;
+    if (buffer != null && !buffer.isEmpty()) {
+      return ((Character)buffer.removeFirst()).charValue();
+    }
+    return input.read();
+  }
+
+  /** Un-reads a character taken by nextChar(): it becomes the next one returned. */
+  private void pushChar(int c) {
+    nextCharCounter--;
+    if(buffer == null)
+      buffer = new LinkedList();
+    buffer.addFirst(Character.valueOf((char) c));
+  }
+
+  /** Queues a read-ahead character behind everything already buffered. */
+  private void pushLastChar(int c) {
+    if (buffer == null) {
+      buffer = new LinkedList();
+    }
+    buffer.addLast(Character.valueOf((char) c));
+  }
+
+  /**
+   * Tries to extend the current match by one more character, preferring the
+   * longest mapping; a character that does not extend the match is pushed back.
+   *
+   * @param map node reached by the characters matched so far
+   * @return the node whose normStr is to be emitted, or null if no mapping matches
+   */
+  private NormalizeCharMap match(NormalizeCharMap map) throws IOException {
+    NormalizeCharMap result = null;
+    if (map.submap != null) {
+      int chr = nextChar();
+      if (chr != -1) {
+        NormalizeCharMap subMap = (NormalizeCharMap) map.submap.get(Character.valueOf((char) chr));
+        if (subMap != null) {
+          result = match(subMap);
+        }
+        if (result == null) {
+          pushChar(chr);
+        }
+      }
+    }
+    // No longer mapping matched: settle for the one ending here, if there is one.
+    if (result == null && map.normStr != null) {
+      result = map;
+    }
+    return result;
+  }
+
+  /**
+   * Fills <code>cbuf</code> with up to <code>len</code> mapped characters.
+   *
+   * @return the number of characters stored (0 if <code>len</code> is 0), or -1 at end of stream
+   */
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    // A zero-length request is not end of stream, so it must not reach the -1 below.
+    if (len == 0) return 0;
+    // Read ahead in bulk; read() then takes these characters back out of the buffer.
+    char[] tmp = new char[len];
+    int l = input.read(tmp, 0, len);
+    if (l != -1) {
+      for(int i = 0; i < l; i++)
+        pushLastChar(tmp[i]);
+    }
+    l = 0;
+    for(int i = off; i < off + len; i++) {
+      int c = read();
+      if (c == -1) break;
+      cbuf[i] = (char) c;
+      l++;
+    }
+    return l == 0 ? -1 : l;
+  }
+
+  public boolean markSupported() {
+    return false;
+  }
+
+  public void mark(int readAheadLimit) throws IOException {
+    throw new IOException("mark/reset not supported");
+  }
+
+  public void reset() throws IOException {
+    throw new IOException("mark/reset not supported");
+  }
+}
Index: src/java/org/apache/lucene/analysis/CharReader.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharReader.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/CharReader.java	(revision 0)
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Adapts a plain {@link Reader} to {@link CharStream}: read and close delegate to the
+ * wrapped Reader and {@link #correctOffset} returns its argument unchanged. Use
+ * {@link #get} to obtain one; a Reader that already is a CharStream is returned as-is.
+ * @version $Id$
+ */
+public final class CharReader extends CharStream {
+
+  protected Reader input;
+  
+  public static CharStream get(Reader input) {
+    return input instanceof CharStream ?
+      (CharStream)input : new CharReader(input);
+  }
+
+  private CharReader(Reader in) {
+    input = in;
+  }
+
+  public int correctOffset(int currentOff) {
+    return currentOff;
+  }
+
+  public void close() throws IOException {
+    input.close();
+  }
+
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return input.read(cbuf, off, len);
+  }
+}
Index: src/java/org/apache/lucene/analysis/BaseCharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/BaseCharFilter.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/BaseCharFilter.java	(revision 0)
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Base utility class for implementing a {@link CharFilter}.
+ * Subclasses record offset corrections with {@link #addOffCorrectMap}
+ * while filtering; {@link #correct} then applies them.
+ * @version $Id$
+ */
+public abstract class BaseCharFilter extends CharFilter {
+
+  //private List<OffCorrectMap> pcmList;
+  private List pcmList;
+
+  public BaseCharFilter(CharStream in) {
+    super(in);
+  }
+
+  /** Retrieve the corrected offset.  Note that this method
+   *  is slow if you correct positions far before the most
+   *  recently added position. */
+  protected int correct(int currentOff) {
+    if (pcmList != null) {
+      // Newest entries are last: scan backwards for the first one at or before currentOff.
+      int idx = pcmList.size();
+      while (--idx >= 0) {
+        OffCorrectMap entry = (OffCorrectMap) pcmList.get(idx);
+        if (entry.off <= currentOff) {
+          return currentOff + entry.cumulativeDiff;
+        }
+      }
+    }
+    return currentOff;
+  }
+
+  /** Returns the cumulativeDiff of the most recently added correction, or 0 if there is none. */
+  protected int getLastCumulativeDiff() {
+    if (pcmList == null || pcmList.isEmpty()) {
+      return 0;
+    }
+    OffCorrectMap last = (OffCorrectMap) pcmList.get(pcmList.size() - 1);
+    return last.cumulativeDiff;
+  }
+
+  /** Appends a correction: {@link #correct} adds cumulativeDiff to offsets at or after off. */
+  protected void addOffCorrectMap(int off, int cumulativeDiff) {
+    if (pcmList == null) {
+      pcmList = new ArrayList();
+    }
+    pcmList.add(new OffCorrectMap(off, cumulativeDiff));
+  }
+
+  /** One recorded correction: the offset it starts at and the amount to add from there on. */
+  static class OffCorrectMap {
+
+    int off;
+    int cumulativeDiff;
+
+    OffCorrectMap(int off, int cumulativeDiff) {
+      this.off = off;
+      this.cumulativeDiff = cumulativeDiff;
+    }
+
+    public String toString() {
+      return "(" + off + "," + cumulativeDiff + ")";
+    }
+  }
+}
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(revision 787231)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(working copy)
@@ -59,7 +59,7 @@
           buffer = termAtt.resizeTermBuffer(1+buffer.length);
       }
       termAtt.setTermLength(upto);
-      offsetAtt.setOffset(0, upto);
+      offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(upto));
       return true;
     }
     return false;
@@ -81,8 +81,8 @@
           buffer = reusableToken.resizeTermBuffer(1+buffer.length);
       }
       reusableToken.setTermLength(upto);
-      reusableToken.setStartOffset(0);
-      reusableToken.setEndOffset(upto);
+      reusableToken.setStartOffset(input.correctOffset(0));
+      reusableToken.setEndOffset(input.correctOffset(upto));
       
       return reusableToken;
     }
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(revision 787231)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(working copy)
@@ -20,6 +20,7 @@
 import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -91,7 +92,7 @@
   private boolean replaceInvalidAcronym;
     
   void setInput(Reader reader) {
-    this.input = reader;
+    input = CharReader.get(reader);
   }
 
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
@@ -126,7 +127,7 @@
    */
   public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
-    this.input = input;
+    setInput(input);
     this.scanner = new StandardTokenizerImpl(input);
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
@@ -161,7 +162,7 @@
         posIncrAtt.setPositionIncrement(posIncr);
         scanner.getText(termAtt);
         final int start = scanner.yychar();
-        offsetAtt.setOffset(start, start+termAtt.termLength());
+        offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength()));
         // This 'if' should be removed in the next release. For now, it converts
         // invalid acronyms to HOST. When removed, only the 'else' part should
         // remain.
@@ -194,19 +195,19 @@
       int posIncr = 1;
 
       while(true) {
-	int tokenType = scanner.getNextToken();
+        int tokenType = scanner.getNextToken();
 
-	if (tokenType == StandardTokenizerImpl.YYEOF) {
-	    return null;
-	}
+        if (tokenType == StandardTokenizerImpl.YYEOF) {
+          return null;
+        }
 
         if (scanner.yylength() <= maxTokenLength) {
           reusableToken.clear();
           reusableToken.setPositionIncrement(posIncr);
           scanner.getText(reusableToken);
           final int start = scanner.yychar();
-          reusableToken.setStartOffset(start);
-          reusableToken.setEndOffset(start+reusableToken.termLength());
+          reusableToken.setStartOffset(input.correctOffset(start));
+          reusableToken.setEndOffset(input.correctOffset(start+reusableToken.termLength()));
           // This 'if' should be removed in the next release. For now, it converts
           // invalid acronyms to HOST. When removed, only the 'else' part should
           // remain.
@@ -234,13 +235,13 @@
      * @see org.apache.lucene.analysis.TokenStream#reset()
      */
     public void reset() throws IOException {
-	super.reset();
-	scanner.yyreset(input);
+      super.reset();
+      scanner.yyreset(input);
     }
 
     public void reset(Reader reader) throws IOException {
-        input = reader;
-        reset();
+      setInput(reader);
+      reset();
     }
 
   /**
Index: src/java/org/apache/lucene/analysis/CharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharFilter.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/CharFilter.java	(revision 0)
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+
+/**
+ * Subclasses of CharFilter can be chained to filter CharStream.
+ * As written here, read and close are forwarded unchanged to the wrapped
+ * CharStream, and {@link #correct} leaves offsets as they are.
+ * @version $Id$
+ */
+public abstract class CharFilter extends CharStream {
+
+  protected CharStream input;
+
+  protected CharFilter(CharStream in) {
+    input = in;
+  }
+
+  /**
+   * Subclass may want to override to correct the current offset.
+   * This default implementation returns the offset unchanged.
+   * @param currentOff current offset
+   * @return corrected offset, which correctOffset hands on to the wrapped stream
+   */
+  protected int correct(int currentOff) {
+    return currentOff;
+  }
+
+  /**
+   * Applies this filter's {@link #correct} first, then passes the result to the
+   * wrapped stream's correctOffset, so each filter in a chain gets its turn.
+   */
+  public final int correctOffset(int currentOff) {
+    return input.correctOffset(correct(currentOff));
+  }
+
+  public void close() throws IOException {
+    input.close();
+  }
+
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return input.read(cbuf, off, len);
+  }
+}
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java	(revision 787231)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java	(working copy)
@@ -90,7 +90,7 @@
     }
 
     termAtt.setTermLength(length);
-    offsetAtt.setOffset(start, start+length);
+    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
     return true;
   }
 
@@ -134,8 +134,8 @@
     }
 
     reusableToken.setTermLength(length);
-    reusableToken.setStartOffset(start);
-    reusableToken.setEndOffset(start+length);
+    reusableToken.setStartOffset(input.correctOffset(start));
+    reusableToken.setEndOffset(input.correctOffset(start+length));
     return reusableToken;
   }
 
Index: src/java/org/apache/lucene/analysis/Tokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/Tokenizer.java	(revision 787231)
+++ src/java/org/apache/lucene/analysis/Tokenizer.java	(working copy)
@@ -45,16 +45,20 @@
 
 public abstract class Tokenizer extends TokenStream {
   /** The text source for this Tokenizer. */
-  protected Reader input;
+  protected CharStream input;
 
   /** Construct a tokenizer with null input. */
   protected Tokenizer() {}
 
   /** Construct a token stream processing the given input. */
   protected Tokenizer(Reader input) {
+    this.input = CharReader.get(input);
+  }
+
+  protected Tokenizer(CharStream input) {
     this.input = input;
   }
-
+  
   /** By default, closes the input Reader. */
   public void close() throws IOException {
     input.close();
@@ -64,6 +68,10 @@
    *  analyzer (in its reusableTokenStream method) will use
    *  this to re-use a previously created tokenizer. */
   public void reset(Reader input) throws IOException {
+    this.input = CharReader.get(input);
+  }
+
+  public void reset(CharStream input) throws IOException {
     this.input = input;
   }
 }
Index: src/java/org/apache/lucene/analysis/CharStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharStream.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/CharStream.java	(revision 0)
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.Reader;
+
+/**
+ * CharStream is a {@link Reader} that adds {@link #correctOffset(int)}, used to map
+ * offsets in the characters read from this stream to corrected token offsets.
+ * @version $Id$
+ */
+public abstract class CharStream extends Reader {
+
+  /**
+   * Called by CharFilter(s) and Tokenizer to correct token offset.
+   * A stream that changes nothing returns the offset as given (see CharReader).
+   * @param currentOff current offset
+   * @return corrected token offset
+   */
+  public abstract int correctOffset(int currentOff);
+}
Index: src/java/org/apache/lucene/analysis/NormalizeCharMap.java
===================================================================
--- src/java/org/apache/lucene/analysis/NormalizeCharMap.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/NormalizeCharMap.java	(revision 0)
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Holds a map of String input to String output, to be used
+ * with {@link MappingCharFilter}.
+ * @version $Id$
+ */
+public class NormalizeCharMap {
+  // Each instance is one trie node; add() descends one node per char of the match string.
+  Map submap;      // Character -> child NormalizeCharMap; null until a child is added
+  String normStr;  // replacement for the match ending at this node; null if none
+  int diff;        // match length minus replacement length; set together with normStr
+
+  /** Maps singleMatch to replacement; throws RuntimeException if already mapped. */
+  public void add(String singleMatch, String replacement) {
+    NormalizeCharMap currMap = this;
+    for(int i = 0; i < singleMatch.length(); i++) {
+      Character key = Character.valueOf(singleMatch.charAt(i)); // one boxed key for get and put
+      if (currMap.submap == null) {
+        currMap.submap = new HashMap(1);
+      }
+      NormalizeCharMap map = (NormalizeCharMap) currMap.submap.get(key);
+      if (map == null) {
+        map = new NormalizeCharMap();
+        currMap.submap.put(key, map);
+      }
+      currMap = map;
+    }
+    if (currMap.normStr != null) {
+      throw new RuntimeException("MappingCharFilter: there is already a mapping for " + singleMatch);
+    }
+    currMap.normStr = replacement;
+    currMap.diff = singleMatch.length() - replacement.length();
+  }
+}
Index: contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
===================================================================
--- contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java	(revision 787231)
+++ contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java	(working copy)
@@ -17,6 +17,7 @@
 
 package org.apache.lucene.wikipedia.analysis;
 
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 
@@ -107,7 +108,7 @@
   private Iterator tokens = null;
 
   void setInput(Reader reader) {
-    this.input = reader;
+    this.input = CharReader.get(reader);
   }
 
   /**
@@ -190,8 +191,8 @@
     //trim the buffer
     String s = buffer.toString().trim();
     reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
-    reusableToken.setStartOffset(theStart);
-    reusableToken.setEndOffset(theStart + s.length());
+    reusableToken.setStartOffset(input.correctOffset(theStart));
+    reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
     reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
     //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
     if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -229,8 +230,8 @@
     //trim the buffer
     String s = buffer.toString().trim();
     reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
-    reusableToken.setStartOffset(theStart);
-    reusableToken.setEndOffset(theStart + s.length());
+    reusableToken.setStartOffset(input.correctOffset(theStart));
+    reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
     reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
     //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
     if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -243,8 +244,8 @@
   private void setupToken(final Token reusableToken) {
     scanner.getText(reusableToken);
     final int start = scanner.yychar();
-    reusableToken.setStartOffset(start);
-    reusableToken.setEndOffset(start + reusableToken.termLength());
+    reusableToken.setStartOffset(input.correctOffset(start));
+    reusableToken.setEndOffset(input.correctOffset(start + reusableToken.termLength()));
   }
 
   /*
@@ -258,7 +259,7 @@
   }
 
   public void reset(Reader reader) throws IOException {
-    input = reader;
+    setInput(reader);
     reset();
   }
 
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java	(revision 787231)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java	(working copy)
@@ -45,6 +45,7 @@
   private Token t = new Token();
 
   public SentenceTokenizer(Reader reader) {
+    super(reader);
     bufferInput = new BufferedReader(reader, 2048);
   }
 
@@ -91,7 +92,7 @@
       return null;
     else {
       t.clear();
-      t.reinit(buffer.toString(), tokenStart, tokenEnd, "sentence");
+      t.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence");
       return t;
     }
   }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(revision 787231)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(working copy)
@@ -55,7 +55,7 @@
 
 
     public ChineseTokenizer(Reader in) {
-        input = in;
+      super(in);
     }
 
     private int offset = 0, bufferIndex=0, dataLen=0;
@@ -81,7 +81,7 @@
         if (length>0) {
             //System.out.println(new String(buffer, 0,
             //length));
-          return token.reinit(buffer, 0, length, start, start+length);
+          return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length));
         }
         else
             return null;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(revision 787231)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(working copy)
@@ -85,6 +85,6 @@
 
     int oldPos = pos;
     pos++;
-    return reusableToken.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize);
+    return reusableToken.reinit(inStr, oldPos, gramSize, input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
   }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(revision 787231)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(working copy)
@@ -140,8 +140,8 @@
     int start = side == Side.FRONT ? 0 : inLen - gramSize;
     int end = start + gramSize;
     reusableToken.setTermBuffer(inStr, start, gramSize);
-    reusableToken.setStartOffset(start);
-    reusableToken.setEndOffset(end);
+    reusableToken.setStartOffset(input.correctOffset(start));
+    reusableToken.setEndOffset(input.correctOffset(end));
     gramSize++;
     return reusableToken;
   }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(revision 787231)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(working copy)
@@ -96,7 +96,7 @@
      * @param in I/O reader
      */
     public CJKTokenizer(Reader in) {
-        input = in;
+      super(in);
     }
 
     //~ Methods ----------------------------------------------------------------
@@ -253,7 +253,7 @@
       
         if (length > 0) {
             return reusableToken.reinit
-                (buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
+                (buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), TOKEN_TYPE_NAMES[tokenType]);
         } else if (dataLen == -1) {
           return null;
         }
