Index: modules/analysis/CHANGES.txt
===================================================================
--- modules/analysis/CHANGES.txt	(revision 1079618)
+++ modules/analysis/CHANGES.txt	(working copy)
@@ -4,6 +4,8 @@
    
 API Changes
 
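+ * LUCENE-1227,LUCENE-2947: NGramTokenizer now reads its input one character at a time, so it handles input of any length (previously a single read of at most 1024 characters was used). The set of whitespace characters is configurable, and each run of whitespace is collapsed into a single '_'. N-grams of size two or more also carry a '_' marker at the start and end of the input. The default min/max n-gram sizes changed from 1/2 to 2/4. (David Byrne)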
+
  * LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor 
    of the pattern package (CharFilter, Tokenizer, TokenFilter).  (Robert Muir)
 
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java	(revision 1079618)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java	(working copy)
@@ -19,6 +19,9 @@
 
 
 import java.io.StringReader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 
@@ -26,18 +29,16 @@
  * Tests {@link NGramTokenizer} for correctness.
  */
 public class NGramTokenizerTest extends BaseTokenStreamTestCase {
-    private StringReader input;
     
     @Override
     public void setUp() throws Exception {
         super.setUp();
-        input = new StringReader("abcde");
     }
 
     public void testInvalidInput() throws Exception {
         boolean gotException = false;
         try {        
-            new NGramTokenizer(input, 2, 1);
+            new NGramTokenizer(new StringReader("foo"), 2, 1);
         } catch (IllegalArgumentException e) {
             gotException = true;
         }
@@ -47,7 +48,7 @@
     public void testInvalidInput2() throws Exception {
         boolean gotException = false;
         try {        
-            new NGramTokenizer(input, 0, 1);
+            new NGramTokenizer(new StringReader("foo"), 0, 1);
         } catch (IllegalArgumentException e) {
             gotException = true;
         }
@@ -55,34 +56,79 @@
     }
 
     public void testUnigrams() throws Exception {
-        NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
-        assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
+        NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 1);
+        assertTokenStreamContents(tokenizer, 
+          new String[]{"a","b","c","d","e"},
+          new int[]{0,1,2,3,4},
+          new int[]{1,2,3,4,5}, 5 /* abcde */);
     }
 
     public void testBigrams() throws Exception {
-        NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
-        assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
+        NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 2, 2);
+        assertTokenStreamContents(tokenizer,
+          new String[]{"_a","ab","bc","cd","de","e_"},
+          new int[]{0,0,1,2,3,4},
+          new int[]{1,2,3,4,5,5}, 5 /* abcde */);
     }
 
     public void testNgrams() throws Exception {
-        NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
+        NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 3);
         assertTokenStreamContents(tokenizer,
-          new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, 
-          new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
-          new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+          new String[]{"a","b","c","d","e", "_a","ab","bc","cd","de","e_", "_ab","abc","bcd","cde","de_"}, 
+          new int[]{0,1,2,3,4, 0,0,1,2,3,4, 0,0,1,2,3},
+          new int[]{1,2,3,4,5, 1,2,3,4,5,5, 2,3,4,5,5},
           5 /* abcde */
         );
     }
 
     public void testOversizedNgrams() throws Exception {
-        NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
+        NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 7, 7);
         assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
     }
     
     public void testReset() throws Exception {
-      NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
-      assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
-      tokenizer.reset(new StringReader("abcde"));
-      assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
+      NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcd"));
+      assertTokenStreamContents(tokenizer,
+        new String[]{"_a","ab","bc","cd","d_", "_ab","abc","bcd","cd_", "_abc","abcd","bcd_"},
+        new int[]{0,0,1,2,3, 0,0,1,2, 0,0,1},
+        new int[]{1,2,3,4,4, 2,3,4,4, 3,4,4}, 4);
+      tokenizer.reset(new StringReader("abcd"));
+      assertTokenStreamContents(tokenizer,
+        new String[]{"_a","ab","bc","cd","d_", "_ab","abc","bcd","cd_", "_abc","abcd","bcd_"},
+        new int[]{0,0,1,2,3, 0,0,1,2, 0,0,1},
+        new int[]{1,2,3,4,4, 2,3,4,4, 3,4,4}, 4);
     }
+
+    public void testInteriorWhitespace() throws Exception {
+        NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("a\tb    c"),2,2);
+        assertTokenStreamContents(tokenizer,
+          new String[]{"_a","a_","_b","b_","_c","c_"}, 
+          new int[]{0,0,1,2,3,7},
+          new int[]{1,2,3,4,8,8},
+          8
+        );
+    }
+
+    public void testExteriorWhitespace() throws Exception {
+        NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("  abc\n\n"),2,2);
+        assertTokenStreamContents(tokenizer,
+          new String[]{"__","_a","ab","bc","c_","__"}, 
+          new int[]{0,0,2,3,4,5},
+          new int[]{1,3,4,5,6,7},
+          7
+        );
+    }
+
+    public void testCustomWhitespace() throws Exception {
+        Character w[] = {'|',';'};
+        Set<Character> whitespace = new HashSet<Character>(Arrays.asList(w));
+        NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("a||b;c"),2,3,whitespace);
+        assertTokenStreamContents(tokenizer,
+          new String[]{"_a","a_","_b","b_","_c","c_", "_a_","a_b","_b_","b_c","_c_"}, 
+          new int[]{0,0,1,3,4,5, 0,0,1,3,4},
+          new int[]{1,2,4,5,6,6, 2,4,5,6,6},
+          6
+        );
+    }
+
 }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(revision 1079618)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(working copy)
@@ -17,40 +17,74 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Set;
+
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.AttributeSource;
 
-import java.io.IOException;
-import java.io.Reader;
-
 /**
  * Tokenizes the input into n-grams of the given size(s).
  */
 public final class NGramTokenizer extends Tokenizer {
-  public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
-  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
 
-  private int minGram, maxGram;
+  public static final int DEFAULT_MIN_NGRAM_SIZE = 2;
+  public static final int DEFAULT_MAX_NGRAM_SIZE = 4;
+    
+  public static final Set<Character> DEFAULT_WHITESPACE_CHARS;
+  static {
+    Character whitespace[] = { ' ', '\t', '\n' };
+    DEFAULT_WHITESPACE_CHARS = new HashSet<Character>(Arrays.asList(whitespace));
+  }
+  
   private int gramSize;
-  private int pos = 0;
-  private int inLen;
-  private String inStr;
-  private boolean started = false;
+  private int minGram;
+  private int maxGram;
+  private int tmp;
+
+  private LinkedList<Integer> charsQueue;
+  private LinkedList<Integer> offsetQueue;
+  private Set<Character> whitespace;
+      
+  private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private boolean collapse;
 
   /**
+   * Creates NGramTokenizer with default min and max n-grams and the default whitespace characters.
+   * @param input {@link Reader} holding the input to be tokenized
+   */
+  public NGramTokenizer(Reader input) {
+    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_WHITESPACE_CHARS);
+  }
+
+  /**
    * Creates NGramTokenizer with given min and max n-grams.
    * @param input {@link Reader} holding the input to be tokenized
    * @param minGram the smallest n-gram to generate
    * @param maxGram the largest n-gram to generate
    */
   public NGramTokenizer(Reader input, int minGram, int maxGram) {
+    this(input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS);
+  }
+
+  /**
+   * Creates NGramTokenizer with given min and max n-grams and a custom set of whitespace characters.
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   * @param whitespace characters treated as whitespace; each run of them is collapsed into a single '_'
+   */
+  public NGramTokenizer(Reader input, int minGram, int maxGram, Set<Character> whitespace) {
     super(input);
-    init(minGram, maxGram);
+    init(minGram, maxGram, whitespace);
   }
 
   /**
@@ -61,8 +95,20 @@
    * @param maxGram the largest n-gram to generate
    */
   public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
+    this(source, input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS);
+  }
+
+  /**
+   * Creates NGramTokenizer with given min and max n-grams and a custom set of whitespace characters.
+   * @param source {@link AttributeSource} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   * @param whitespace characters treated as whitespace; each run of them is collapsed into a single '_'
+   */
+  public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram, Set<Character> whitespace) {
     super(source, input);
-    init(minGram, maxGram);
+    init(minGram, maxGram, whitespace);
   }
 
   /**
@@ -73,19 +119,23 @@
    * @param maxGram the largest n-gram to generate
    */
   public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
-    super(factory, input);
-    init(minGram, maxGram);
+    this(factory, input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS);
   }
 
   /**
-   * Creates NGramTokenizer with default min and max n-grams.
+   * Creates NGramTokenizer with given min and max n-grams and a custom set of whitespace characters.
+   * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
    * @param input {@link Reader} holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   * @param whitespace characters treated as whitespace; each run of them is collapsed into a single '_'
    */
-  public NGramTokenizer(Reader input) {
-    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+  public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram, Set<Character> whitespace) {
+    super(factory, input);
+    init(minGram, maxGram, whitespace);
   }
   
-  private void init(int minGram, int maxGram) {
+  private void init(int minGram, int maxGram, Set<Character> whitespace) {
     if (minGram < 1) {
       throw new IllegalArgumentException("minGram must be greater than zero");
     }
@@ -94,42 +144,66 @@
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
+    this.whitespace = whitespace;
+    this.maxGram = maxGram;
+    this.minGram = minGram;
+    gramSize = minGram;
+    resetPosition();
   }
 
-  /** Returns the next token in the stream, or null at EOS. */
   @Override
   public final boolean incrementToken() throws IOException {
-    clearAttributes();
-    if (!started) {
-      started = true;
-      gramSize = minGram;
-      char[] chars = new char[1024];
-      input.read(chars);
-      inStr = new String(chars).trim();  // remove any trailing empty strings 
-      inLen = inStr.length();
+    if (gramSize > maxGram)
+      return false;
+
+    if (charsQueue.isEmpty()) {
+      charsQueue.offer((int)'_');
+      offsetQueue.offer(0);
+      for (int x = 0; x < gramSize-1; x++) {
+        int count = nextChar();
+        if (tmp == -1) {
+          count += offsetQueue.getLast()-1;
+          offsetAtt.setOffset(correctOffset(count), correctOffset(count));
+          return false;
+        } else {
+          charsQueue.offer(tmp);
+          offsetQueue.offer(offsetQueue.getLast()+count);
+        }
+      }
+    } else if (tmp == -1) {
+      ++gramSize;
+      resetPosition();
+      input.reset();
+      return incrementToken();
+    } else {
+      int count = nextChar();
+      if (tmp == -1) {
+        charsQueue.offer((int)'_');
+        offsetQueue.offer(offsetQueue.getLast()+count-1);
+      } else {
+        charsQueue.offer(tmp);
+        offsetQueue.offer(offsetQueue.getLast()+count);
+      }
+      charsQueue.poll();
+      offsetQueue.poll();
+
+      int dist = offsetQueue.get(1)-offsetQueue.getFirst()-1;
+      if (dist > 0)
+        offsetQueue.set(0, offsetQueue.getFirst()+dist);
     }
 
-    if (pos+gramSize > inLen) {            // if we hit the end of the string
-      pos = 0;                           // reset to beginning of string
-      gramSize++;                        // increase n-gram size
-      if (gramSize > maxGram)            // we are done
-        return false;
-      if (pos+gramSize > inLen)
-        return false;
+    final StringBuilder sb = new StringBuilder();
+    for (int i : charsQueue)
+      sb.append((char) i);
+
+    if (offsetQueue.getFirst() != offsetQueue.getLast()) {
+      clearAttributes();
+      termAtt.setEmpty().append(sb.toString());
+      offsetAtt.setOffset(correctOffset(offsetQueue.getFirst()), correctOffset(offsetQueue.getLast()));
+      return true;
+    } else {
+      return incrementToken();
     }
-
-    int oldPos = pos;
-    pos++;
-    termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
-    offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
-    return true;
-  }
-  
-  @Override
-  public final void end() {
-    // set final offset
-    final int finalOffset = inLen;
-    this.offsetAtt.setOffset(finalOffset, finalOffset);
   }    
   
   @Override
@@ -141,7 +215,35 @@
   @Override
   public void reset() throws IOException {
     super.reset();
-    started = false;
-    pos = 0;
+    gramSize = minGram;
+    resetPosition();
   }
+
+  private void resetPosition() {
+    charsQueue = new LinkedList<Integer>();
+    offsetQueue = new LinkedList<Integer>();
+    offsetQueue.offer(0);
+    tmp = 0;
+    collapse = false;
+  }
+    
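+  /** Reads the next character into {@code tmp}, replacing a whitespace character with '_' and skipping the rest of its run on the following call; returns the number of reads performed, including one that hits end of input. */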
+  private int nextChar() throws IOException {
+    int count = 0;
+    if (collapse) {
+      collapse = false;
+      do {
+        tmp = input.read();
+        ++count;
+      } while (whitespace.contains(new Character((char)tmp)));
+    } else {
+      tmp = input.read();
+      ++count;
+      if (whitespace.contains(new Character((char)tmp))) {
+        tmp = '_';
+        collapse = true;
+      }
+    }
+    return count;
+  }
 }
