Index: src/test/org/apache/lucene/analysis/TestCharFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestCharFilter.java	(revision 0)
+++ src/test/org/apache/lucene/analysis/TestCharFilter.java	(revision 0)
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+public class TestCharFilter extends TestCase {
+
+  public void testCharFilter1() throws Exception {
+    CharStream cs = new CharFilter1( new CharReader( new StringReader("") ) );
+    assertEquals( "corrected offset is invalid", 1, cs.correctOffset( 0 ) );
+  }
+
+  public void testCharFilter2() throws Exception {
+    CharStream cs = new CharFilter2( new CharReader( new StringReader("") ) );
+    assertEquals( "corrected offset is invalid", 2, cs.correctOffset( 0 ) );
+  }
+
+  public void testCharFilter12() throws Exception {
+    CharStream cs = new CharFilter2( new CharFilter1( new CharReader( new StringReader("") ) ) );
+    assertEquals( "corrected offset is invalid", 3, cs.correctOffset( 0 ) );
+  }
+
+  public void testCharFilter11() throws Exception {
+    CharStream cs = new CharFilter1( new CharFilter1( new CharReader( new StringReader("") ) ) );
+    assertEquals( "corrected offset is invalid", 2, cs.correctOffset( 0 ) );
+  }
+
+  static class CharFilter1 extends CharFilter {
+
+    protected CharFilter1(CharStream in) {
+      super(in);
+    }
+
+    protected int correct(int currentOff) {
+      return currentOff + 1;
+    }
+  }
+
+  static class CharFilter2 extends CharFilter {
+
+    protected CharFilter2(CharStream in) {
+      super(in);
+    }
+
+    protected int correct(int currentOff) {
+      return currentOff + 2;
+    }
+  }
+}
Index: src/java/org/apache/lucene/analysis/MappingCharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/MappingCharFilter.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/MappingCharFilter.java	(revision 0)
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+/**
+ * Simplistic CharFilter that applies the mappings contained in a NormalizeMap to the character stream.
+ * @version $Id$
+ *
+ */
+public class MappingCharFilter extends BaseCharFilter {
+
+  private final NormalizeMap normMap;
+  //private LinkedList<Character> buffer;
+  private LinkedList buffer;
+  private String replacement;
+  private int charPointer;
+  private int nextCharCounter;
+
+  public MappingCharFilter( NormalizeMap normMap, CharStream in ){
+    super( in );
+    this.normMap = normMap;
+  }
+
+  public int read() throws IOException {
+    while( true ){
+      if( replacement != null && charPointer < replacement.length() )
+        return replacement.charAt( charPointer++ );
+
+      int firstChar = nextChar();
+      if( firstChar == -1 ) return -1;
+      NormalizeMap nm = normMap.submap != null ?
+        (NormalizeMap)normMap.submap.get( Character.valueOf( (char)firstChar ) ) : null;
+      if( nm == null ) return firstChar;
+      NormalizeMap result = match( nm );
+      if( result == null ) return firstChar;
+      replacement = result.normStr;
+      charPointer = 0;
+      if( result.diff != 0 ){
+        int prevCumulativeDiff = getLastCumulativeDiff();
+        if( result.diff < 0 ){
+          for( int i = 0; i < -result.diff ; i++ )
+            addOffCorrectMap( nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i );
+        }
+        else{
+          addOffCorrectMap( nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff ) ;
+        }
+      }
+    }
+  }
+
+  private int nextChar() throws IOException {
+    nextCharCounter++;
+    if( buffer != null && !buffer.isEmpty() )
+      return ((Character)buffer.removeFirst()).charValue();
+    return input.read();
+  }
+
+  private void pushChar( int c ){
+    nextCharCounter--;
+    if( buffer == null )
+      buffer = new LinkedList();
+    buffer.addFirst( new Character( (char)c ) );
+  }
+
+  private void pushLastChar( int c ){
+    if( buffer == null )
+      buffer = new LinkedList();
+    buffer.addLast( new Character( (char)c ) );
+  }
+
+  private NormalizeMap match( NormalizeMap map ) throws IOException {
+    NormalizeMap result = null;
+    if( map.submap != null ){
+      int chr = nextChar();
+      if( chr != -1 ){
+        NormalizeMap subMap = (NormalizeMap)map.submap.get( Character.valueOf( (char)chr ) );
+        if( subMap != null ){
+          result = match( subMap );
+        }
+        if( result == null )
+          pushChar( chr );
+      }
+    }
+    if( result == null && map.normStr != null )
+      result = map;
+    return result;
+  }
+
+  public int read( char[] cbuf, int off, int len ) throws IOException {
+    char[] tmp = new char[len];
+    int l = input.read( tmp, 0, len );
+    if( l != -1 ){
+      for( int i = 0; i < l; i++ )
+        pushLastChar( tmp[i] );
+    }
+    l = 0;
+    for( int i = off; i < off + len; i++ ){
+      int c = read();
+      if( c == -1 ) break;
+      cbuf[i] = (char)c;
+      l++;
+    }
+    return l == 0 ? -1 : l;
+  }
+
+  public boolean markSupported(){
+    return false;
+  }
+
+  public void mark( int readAheadLimit ) throws IOException {
+    throw new IOException( "mark/reset not supported" );
+  }
+
+  public void reset() throws IOException {
+    throw new IOException( "mark/reset not supported" );
+  }
+}
Index: src/java/org/apache/lucene/analysis/CharReader.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharReader.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/CharReader.java	(revision 0)
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * CharReader is a Reader wrapper. It reads chars from the underlying Reader and exposes them as a CharStream.
+ *
+ * @version $Id$
+ *
+ */
+public final class CharReader extends CharStream {
+
+  protected Reader input;
+
+  public CharReader( Reader in ){
+    input = in;
+  }
+
+  public int correctOffset(int currentOff) {
+    return currentOff;
+  }
+
+  public void close() throws IOException {
+    input.close();
+  }
+
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return input.read(cbuf, off, len );
+  }
+}
Index: src/java/org/apache/lucene/analysis/BaseCharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/BaseCharFilter.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/BaseCharFilter.java	(revision 0)
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Base utility class for implementing a CharFilter; records offset corrections as (off, cumulativeDiff) pairs.
+ * @version $Id$
+ *
+ */
+public abstract class BaseCharFilter extends CharFilter {
+
+  //private List<OffCorrectMap> pcmList;
+  private List pcmList;
+  
+  public BaseCharFilter( CharStream in ){
+    super(in);
+  }
+
+  protected int correct( int currentOff ){
+    if( pcmList == null || pcmList.isEmpty() ) return currentOff;
+    for( int i = pcmList.size() - 1; i >= 0; i-- ){
+      if( currentOff >= ((OffCorrectMap)pcmList.get( i )).off )
+        return currentOff + ((OffCorrectMap)pcmList.get( i )).cumulativeDiff;
+    }
+    return currentOff;
+  }
+  
+  protected int getLastCumulativeDiff(){
+    return pcmList == null || pcmList.isEmpty() ?
+      0 : ((OffCorrectMap)pcmList.get( pcmList.size() - 1 )).cumulativeDiff;
+  }
+  
+  protected void addOffCorrectMap( int off, int cumulativeDiff ){
+    if( pcmList == null ) pcmList = new ArrayList();
+    pcmList.add( new OffCorrectMap( off, cumulativeDiff ) );
+  }
+
+  static class OffCorrectMap {
+
+    int off;
+    int cumulativeDiff;
+
+    OffCorrectMap( int off, int cumulativeDiff ){
+      this.off = off;
+      this.cumulativeDiff = cumulativeDiff;
+    }
+
+    public String toString(){
+      StringBuffer sb = new StringBuffer();
+      sb.append('(');
+      sb.append(off);
+      sb.append(',');
+      sb.append(cumulativeDiff);
+      sb.append(')');
+      return sb.toString();
+    }
+  }
+}
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(revision 755894)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(working copy)
@@ -59,8 +59,8 @@
           buffer = termAtt.resizeTermBuffer(1+buffer.length);
       }
       termAtt.setTermLength(upto);
-      offsetAtt.setStartOffset(0);
-      offsetAtt.setEndOffset(upto);
+      offsetAtt.setStartOffset(input.correctOffset(0));
+      offsetAtt.setEndOffset(input.correctOffset(upto));
       return true;
     }
     return false;
@@ -82,8 +82,8 @@
           buffer = reusableToken.resizeTermBuffer(1+buffer.length);
       }
       reusableToken.setTermLength(upto);
-      reusableToken.setStartOffset(0);
-      reusableToken.setEndOffset(upto);
+      reusableToken.setStartOffset(input.correctOffset(0));
+      reusableToken.setEndOffset(input.correctOffset(upto));
       
       return reusableToken;
     }
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(revision 755894)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(working copy)
@@ -20,6 +20,8 @@
 import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -91,7 +93,10 @@
   private boolean replaceInvalidAcronym;
     
   void setInput(Reader reader) {
-    this.input = reader;
+    if( reader instanceof CharStream )
+      this.input = (CharStream)reader;
+    else
+      this.input = new CharReader(reader);
   }
 
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
@@ -126,7 +131,7 @@
    */
   public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
-    this.input = input;
+    setInput( input );
     this.scanner = new StandardTokenizerImpl(input);
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
@@ -161,8 +166,8 @@
         posIncrAtt.setPositionIncrement(posIncr);
         scanner.getText(termAtt);
         final int start = scanner.yychar();
-        offsetAtt.setStartOffset(start);
-        offsetAtt.setEndOffset(start+termAtt.termLength());
+        offsetAtt.setStartOffset(input.correctOffset(start));
+        offsetAtt.setEndOffset(input.correctOffset(start+termAtt.termLength()));
         // This 'if' should be removed in the next release. For now, it converts
         // invalid acronyms to HOST. When removed, only the 'else' part should
         // remain.
@@ -195,19 +200,19 @@
       int posIncr = 1;
 
       while(true) {
-	int tokenType = scanner.getNextToken();
+        int tokenType = scanner.getNextToken();
 
-	if (tokenType == StandardTokenizerImpl.YYEOF) {
-	    return null;
-	}
+        if (tokenType == StandardTokenizerImpl.YYEOF) {
+          return null;
+        }
 
         if (scanner.yylength() <= maxTokenLength) {
           reusableToken.clear();
           reusableToken.setPositionIncrement(posIncr);
           scanner.getText(reusableToken);
           final int start = scanner.yychar();
-          reusableToken.setStartOffset(start);
-          reusableToken.setEndOffset(start+reusableToken.termLength());
+          reusableToken.setStartOffset(input.correctOffset(start));
+          reusableToken.setEndOffset(input.correctOffset(start+reusableToken.termLength()));
           // This 'if' should be removed in the next release. For now, it converts
           // invalid acronyms to HOST. When removed, only the 'else' part should
           // remain.
@@ -235,13 +240,13 @@
      * @see org.apache.lucene.analysis.TokenStream#reset()
      */
     public void reset() throws IOException {
-	super.reset();
-	scanner.yyreset(input);
+      super.reset();
+      scanner.yyreset(input);
     }
 
     public void reset(Reader reader) throws IOException {
-        input = reader;
-        reset();
+      setInput( reader );
+      reset();
     }
 
   /**
Index: src/java/org/apache/lucene/analysis/CharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharFilter.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/CharFilter.java	(revision 0)
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+
+/**
+ *
+ * Subclasses of CharFilter can be chained to filter a CharStream.
+ *
+ * @version $Id$
+ *
+ */
+public abstract class CharFilter extends CharStream {
+
+  protected CharStream input;
+
+  protected CharFilter( CharStream in ){
+    input = in;
+  }
+
+  /**
+   *
+   * Subclass may want to override to correct the current offset.
+   *
+   * @param currentOff current offset
+   * @return corrected offset
+   */
+  protected int correct( int currentOff ){
+    return currentOff;
+  }
+
+  public final int correctOffset(int currentOff) {
+    return input.correctOffset( correct( currentOff ) );
+  }
+
+  public void close() throws IOException {
+    input.close();
+  }
+
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return input.read(cbuf, off, len);
+  }
+}
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java	(revision 755894)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java	(working copy)
@@ -90,8 +90,8 @@
     }
 
     termAtt.setTermLength(length);
-    offsetAtt.setStartOffset(start);
-    offsetAtt.setEndOffset(start+length);
+    offsetAtt.setStartOffset(input.correctOffset(start));
+    offsetAtt.setEndOffset(input.correctOffset(start+length));
     return true;
   }
 
@@ -135,8 +135,8 @@
     }
 
     reusableToken.setTermLength(length);
-    reusableToken.setStartOffset(start);
-    reusableToken.setEndOffset(start+length);
+    reusableToken.setStartOffset(input.correctOffset(start));
+    reusableToken.setEndOffset(input.correctOffset(start+length));
     return reusableToken;
   }
 
Index: src/java/org/apache/lucene/analysis/Tokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/Tokenizer.java	(revision 755894)
+++ src/java/org/apache/lucene/analysis/Tokenizer.java	(working copy)
@@ -45,16 +45,24 @@
 
 public abstract class Tokenizer extends TokenStream {
   /** The text source for this Tokenizer. */
-  protected Reader input;
+  protected CharStream input;
 
   /** Construct a tokenizer with null input. */
   protected Tokenizer() {}
 
   /** Construct a token stream processing the given input. */
   protected Tokenizer(Reader input) {
+    if (input instanceof CharStream) {
+      this.input = (CharStream)input;
+    } else {
+      this.input = new CharReader(input);
+    }
+  }
+
+  protected Tokenizer(CharStream input) {
     this.input = input;
   }
-
+  
   /** By default, closes the input Reader. */
   public void close() throws IOException {
     input.close();
@@ -64,7 +72,11 @@
    *  analyzer (in its reusableTokenStream method) will use
    *  this to re-use a previously created tokenizer. */
   public void reset(Reader input) throws IOException {
-    this.input = input;
+    if (input instanceof CharStream) {
+      this.input = (CharStream)input;
+    } else {
+      this.input = new CharReader(input);
+    }
   }
 }
 
Index: src/java/org/apache/lucene/analysis/CharStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharStream.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/CharStream.java	(revision 0)
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.Reader;
+
+/**
+ * CharStream adds {@link #correctOffset(int)} functionality over {@link Reader}.
+ *
+ * @version $Id$
+ *
+ */
+public abstract class CharStream extends Reader {
+
+  /**
+   * Called by CharFilter(s) and Tokenizer to correct a token offset.
+   *
+   * @param currentOff current offset
+   * @return corrected token offset
+   */
+  public abstract int correctOffset( int currentOff );
+}
Index: src/java/org/apache/lucene/analysis/NormalizeMap.java
===================================================================
--- src/java/org/apache/lucene/analysis/NormalizeMap.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/NormalizeMap.java	(revision 0)
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Holds a map of String input to String output, to be used with MappingCharFilter.
+ * @version $Id$
+ *
+ */
+public class NormalizeMap {
+
+  //Map<Character, NormalizeMap> submap;
+  Map submap;
+  String normStr;
+  int diff;
+
+  public void add( String singleMatch, String replacement ){
+    NormalizeMap currMap = this;
+    for( int i = 0; i < singleMatch.length(); i++ ){
+      char c = singleMatch.charAt( i );
+      if( currMap.submap == null ){
+        currMap.submap = new HashMap( 1 );
+      }
+      NormalizeMap map = (NormalizeMap)currMap.submap.get( Character.valueOf( c ) );
+      if( map == null ){
+        map = new NormalizeMap();
+        currMap.submap.put( new Character( c ), map );
+      }
+      currMap = map;
+    }
+    if( currMap.normStr != null ){
+      throw new RuntimeException( "MappingCharFilter: there is already a mapping for " + singleMatch );
+    }
+    currMap.normStr = replacement;
+    currMap.diff = singleMatch.length() - replacement.length();
+  }
+}
Index: contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
===================================================================
--- contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java	(revision 755894)
+++ contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java	(working copy)
@@ -17,6 +17,8 @@
 
 package org.apache.lucene.wikipedia.analysis;
 
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 
@@ -107,7 +109,10 @@
   private Iterator tokens = null;
 
   void setInput(Reader reader) {
-    this.input = reader;
+    if( reader instanceof CharStream )
+      this.input = (CharStream)reader;
+    else
+      this.input = new CharReader(reader);
   }
 
   /**
@@ -190,8 +195,8 @@
     //trim the buffer
     String s = buffer.toString().trim();
     reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
-    reusableToken.setStartOffset(theStart);
-    reusableToken.setEndOffset(theStart + s.length());
+    reusableToken.setStartOffset(input.correctOffset(theStart));
+    reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
     reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
     //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
     if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -229,8 +234,8 @@
     //trim the buffer
     String s = buffer.toString().trim();
     reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
-    reusableToken.setStartOffset(theStart);
-    reusableToken.setEndOffset(theStart + s.length());
+    reusableToken.setStartOffset(input.correctOffset(theStart));
+    reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
     reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
     //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
     if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -243,8 +248,8 @@
   private void setupToken(final Token reusableToken) {
     scanner.getText(reusableToken);
     final int start = scanner.yychar();
-    reusableToken.setStartOffset(start);
-    reusableToken.setEndOffset(start + reusableToken.termLength());
+    reusableToken.setStartOffset(input.correctOffset(start));
+    reusableToken.setEndOffset(input.correctOffset(start + reusableToken.termLength()));
   }
 
   /*
@@ -258,7 +263,7 @@
   }
 
   public void reset(Reader reader) throws IOException {
-    input = reader;
+    setInput( reader );
     reset();
   }
 
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(revision 755894)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(working copy)
@@ -55,7 +55,7 @@
 
 
     public ChineseTokenizer(Reader in) {
-        input = in;
+      super(in);
     }
 
     private int offset = 0, bufferIndex=0, dataLen=0;
@@ -81,7 +81,7 @@
         if (length>0) {
             //System.out.println(new String(buffer, 0,
             //length));
-          return token.reinit(buffer, 0, length, start, start+length);
+          return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length));
         }
         else
             return null;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(revision 755894)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(working copy)
@@ -85,6 +85,6 @@
 
     int oldPos = pos;
     pos++;
-    return reusableToken.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize);
+    return reusableToken.reinit(inStr, oldPos, gramSize, input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
   }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(revision 755894)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(working copy)
@@ -140,8 +140,8 @@
     int start = side == Side.FRONT ? 0 : inLen - gramSize;
     int end = start + gramSize;
     reusableToken.setTermBuffer(inStr, start, gramSize);
-    reusableToken.setStartOffset(start);
-    reusableToken.setEndOffset(end);
+    reusableToken.setStartOffset(input.correctOffset(start));
+    reusableToken.setEndOffset(input.correctOffset(end));
     gramSize++;
     return reusableToken;
   }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(revision 755894)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(working copy)
@@ -85,7 +85,7 @@
      * @param in I/O reader
      */
     public CJKTokenizer(Reader in) {
-        input = in;
+      super(in);
     }
 
     //~ Methods ----------------------------------------------------------------
@@ -239,6 +239,6 @@
             }
         }
 
-        return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
+        return reusableToken.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), tokenType);
     }
 }
