Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java	(révision 1490549)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java	(copie de travail)
@@ -21,6 +21,7 @@
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -115,23 +116,59 @@
     checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
   }
 
-  private void testNGrams(int minGram, int maxGram, int length) throws IOException {
+  private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {
     final String s = RandomStrings.randomAsciiOfLength(random(), length);
-    final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram);
+    testNGrams(minGram, maxGram, s, nonTokenChars);
+  }
+
+  private static int[] toCodePoints(CharSequence s) {
+    final int[] codePoints = new int[Character.codePointCount(s, 0, s.length())];
+    for (int i = 0, j = 0; i < s.length(); ++j) {
+      codePoints[j] = Character.codePointAt(s, i);
+      i += Character.charCount(codePoints[j]);
+    }
+    return codePoints;
+  }
+
+  private static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars) throws IOException {
+    // convert the string to code points
+    final int[] codePoints = toCodePoints(s);
+    final int[] offsets = new int[codePoints.length + 1];
+    for (int i = 0; i < codePoints.length; ++i) {
+      offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
+    }
+    final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram) {
+      @Override
+      protected boolean isTokenChar(int chr) {
+        return nonTokenChars.indexOf(chr) < 0;
+      }
+    };
     final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
     final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
     final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
     final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
     grams.reset();
-    for (int start = 0; start < s.length(); ++start) {
-      for (int end = start + minGram; end <= start + maxGram && end <= s.length(); ++end) {
+    for (int start = 0; start < codePoints.length; ++start) {
+      nextGram:
+      for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
+        for (int i = 0; i < nonTokenChars.length(); ) {
+          final int cp = nonTokenChars.codePointAt(i);
+          for (int j = start; j < end; ++j) {
+            if (codePoints[j] == cp) {
+              continue nextGram;
+            }
+          }
+          i += Character.charCount(cp);
+        }
         assertTrue(grams.incrementToken());
-        assertEquals(s.substring(start, end), termAtt.toString());
+        assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
         assertEquals(1, posIncAtt.getPositionIncrement());
-        assertEquals(start, offsetAtt.startOffset());
-        assertEquals(end, offsetAtt.endOffset());
+        assertEquals(1, posLenAtt.getPositionLength());
+        assertEquals(offsets[start], offsetAtt.startOffset());
+        assertEquals(offsets[end], offsetAtt.endOffset());
       }
     }
+    assertFalse(grams.incrementToken());
     grams.end();
     assertEquals(s.length(), offsetAtt.startOffset());
     assertEquals(s.length(), offsetAtt.endOffset());
@@ -141,14 +178,47 @@
     // test sliding
     final int minGram = _TestUtil.nextInt(random(), 1, 100);
     final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
-    testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
+    testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
   }
 
   public void testLargeMaxGram() throws IOException {
     // test sliding with maxGram > 1024
-    final int minGram = _TestUtil.nextInt(random(), 1200, 1300);
+    final int minGram = _TestUtil.nextInt(random(), 1290, 1300);
     final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
-    testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
+    testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
   }
 
+  public void testPreTokenization() throws IOException {
+    final int minGram = _TestUtil.nextInt(random(), 1, 100);
+    final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
+    testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "a");
+  }
+
+  public void testHeavyPreTokenization() throws IOException {
+    final int minGram = _TestUtil.nextInt(random(), 1, 100);
+    final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
+    testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "abcdef");
+  }
+
+  public void testFewTokenChars() throws IOException {
+    final char[] chrs = new char[_TestUtil.nextInt(random(), 4000, 5000)];
+    Arrays.fill(chrs, ' ');
+    for (int i = 0; i < chrs.length; ++i) {
+      if (random().nextFloat() < 0.1) {
+        chrs[i] = 'a';
+      }
+    }
+    final int minGram = _TestUtil.nextInt(random(), 1, 2);
+    final int maxGram = _TestUtil.nextInt(random(), minGram, 2);
+    testNGrams(minGram, maxGram, new String(chrs), " ");
+  }
+
+  public void testFullUTF8Range() throws IOException {
+    final int minGram = _TestUtil.nextInt(random(), 1, 100);
+    final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
+    final String s = _TestUtil.randomUnicodeString(random(), 4 * 1024);
+    testNGrams(minGram, maxGram, s, "");
+    testNGrams(minGram, maxGram, s, "abcdef");
+  }
+
 }
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(révision 1490549)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(copie de travail)
@@ -40,23 +40,33 @@
  * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
  * </table>
  * <a name="version"/>
- * <p>Before Lucene 4.4, this class had a different behavior:<ul>
- * <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
- * <li>The last whitespaces of the 1024 chars block were trimmed.</li>
- * <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
- * <p>Although highly discouraged, it is still possible to use the old behavior
- * through {@link Lucene43NGramTokenizer}.
+ * <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
+ * <li>tokenize in a streaming fashion to support streams which are larger
+ * than 1024 chars (limit of the previous version),
+ * <li>count grams based on Unicode code points instead of Java chars (and
+ * never split in the middle of surrogate pairs),
+ * <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
+ * before computing n-grams.</ul>
+ * <p>Additionally, this class doesn't trim trailing whitespaces and emits
+ * tokens in a different order: tokens are now emitted by increasing start
+ * offsets, while they used to be emitted by increasing lengths (which
+ * prevented support for large input streams).
+ * <p>Although <b style="color:red">highly</b> discouraged, it is still possible
+ * to use the old behavior through {@link Lucene43NGramTokenizer}.
  */
-public final class NGramTokenizer extends Tokenizer {
+public class NGramTokenizer extends Tokenizer {
   public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
   public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
 
-  private char[] buffer;
-  private int bufferStart, bufferEnd; // remaining slice of the buffer
+  private char[] charBuffer;
+  private int[] buffer; // like charBuffer, but converted to code points
+  private int bufferStart, bufferEnd; // remaining slice in buffer
   private int offset;
   private int gramSize;
   private int minGram, maxGram;
   private boolean exhausted;
+  private int lastCheckedChar; // last offset in the buffer that we checked
+  private int lastNonTokenChar; // last offset that we found to not be a token char
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
@@ -109,64 +119,141 @@
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
-    buffer = new char[maxGram + 1024];
+    charBuffer = new char[2 * maxGram + 1024]; // 2 * maxGram in case all code points require 2 chars, plus 1024 of buffering so we don't keep polling the Reader
+    buffer = new int[charBuffer.length];
+    // Make the term att large enough
+    termAtt.resizeBuffer(2 * maxGram);
   }
 
-  /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public boolean incrementToken() throws IOException {
+  public final boolean incrementToken() throws IOException {
     clearAttributes();
 
-    // compact
-    if (bufferStart >= buffer.length - maxGram) {
-      System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
-      bufferEnd -= bufferStart;
-      bufferStart = 0;
+    while (true) {
+      // compact
+      if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
+        System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
+        bufferEnd -= bufferStart;
+        lastCheckedChar -= bufferStart;
+        lastNonTokenChar -= bufferStart;
+        bufferStart = 0;
 
-      // fill in remaining space
-      if (!exhausted) {
-        // TODO: refactor to a shared readFully
-        while (bufferEnd < buffer.length) {
-          final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
-          if (read == -1) {
-            exhausted = true;
-            break;
+        // fill in remaining space
+        final int read = read(buffer.length - bufferEnd);
+        // convert to code points
+        for (int i = 0; i < read; ++bufferEnd) {
+          final int charCount;
+          if (i == read - 1) {
+            // last char of the buffer: don't try to decode a code point since its second char may not have been read yet
+            charCount = 1;
+            buffer[bufferEnd] = (charBuffer[i]);
+          } else {
+            final int cp = Character.codePointAt(charBuffer, i);
+            charCount = Character.charCount(cp);
+            buffer[bufferEnd] = cp;
           }
-          bufferEnd += read;
+          i += charCount;
         }
       }
+
+      // should we go to the next offset?
+      if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
+        if (bufferStart + 1 + minGram > bufferEnd) {
+          assert exhausted;
+          return false;
+        }
+        consume();
+        gramSize = minGram;
+      }
+
+      updateLastNonTokenChar();
+
+      // retry if the token to be emitted was going to not only contain token chars
+      if (lastNonTokenChar >= bufferStart && lastNonTokenChar < bufferStart + gramSize) {
+        consume();
+        gramSize = minGram;
+        continue;
+      }
+
+      final char[] termAttBuffer = termAtt.buffer();
+      int length = 0;
+      for (int i = 0; i < gramSize; ++i) {
+        final int cp = buffer[bufferStart + i];
+        // it's ok to do so since we resized the char term att in the constructor
+        Character.toChars(cp, termAttBuffer, length);
+        length += Character.charCount(cp);
+      }
+      termAtt.setLength(length);
+      posIncAtt.setPositionIncrement(1);
+      posLenAtt.setPositionLength(1);
+      offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
+      ++gramSize;
+      return true;
     }
+  }
 
-    // should we go to the next offset?
-    if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
-      bufferStart++;
-      offset++;
-      gramSize = minGram;
+  private void updateLastNonTokenChar() {
+    final int termEnd = bufferStart + gramSize - 1;
+    if (termEnd > lastCheckedChar) {
+      for (int i = termEnd; i > lastCheckedChar; --i) {
+        if (!isTokenChar(buffer[i])) {
+          lastNonTokenChar = i;
+          break;
+        }
+      }
+      lastCheckedChar = termEnd;
     }
+  }
 
-    // are there enough chars remaining?
-    if (bufferStart + gramSize > bufferEnd) {
-      return false;
+  private int read(int upTo) throws IOException {
+    int read = 0;
+    // read up to upTo - 1 chars
+    while (read < upTo - 1) {
+      final int r = input.read(charBuffer, read, upTo - read - 1);
+      if (r == -1) {
+        exhausted = true;
+        break;
+      }
+      read += r;
     }
+    assert read < upTo;
+    // if the last char is a high surrogate, read one char again
+    if (read > 0 && Character.isHighSurrogate(charBuffer[read - 1])) {
+      final int lowSurrogate = exhausted ? -1 : input.read();
+      if (lowSurrogate != -1) {
+        charBuffer[read] = (char) lowSurrogate;
+        ++read;
+      } // else: broken input stream!
+    }
+    return read;
+  }
 
-    termAtt.copyBuffer(buffer, bufferStart, gramSize);
-    posIncAtt.setPositionIncrement(1);
-    posLenAtt.setPositionLength(1);
-    offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
-    ++gramSize;
+  /** Consume one code point. */
+  private void consume() {
+    offset += Character.charCount(buffer[bufferStart++]);
+  }
+
+  /** Only collect characters which satisfy this condition. */
+  protected boolean isTokenChar(int chr) {
     return true;
   }
 
   @Override
-  public void end() {
-    final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
+  public final void end() {
+    assert bufferStart <= bufferEnd;
+    int endOffset = offset;
+    for (int i = bufferStart; i < bufferEnd; ++i) {
+      endOffset += Character.charCount(buffer[i]);
+    }
+    endOffset = correctOffset(endOffset);
     offsetAtt.setOffset(endOffset, endOffset);
   }
 
   @Override
-  public void reset() throws IOException {
+  public final void reset() throws IOException {
     super.reset();
     bufferStart = bufferEnd = buffer.length;
+    lastNonTokenChar = lastCheckedChar = bufferStart - 1;
     offset = 0;
     gramSize = minGram;
     exhausted = false;
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java	(révision 1490549)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java	(copie de travail)
@@ -42,6 +42,10 @@
  * {@link Version#LUCENE_44} in the constructor but this is not recommended as
  * it will lead to broken {@link TokenStream}s that will cause highlighting
  * bugs.
+ * <p>If you were using this {@link TokenFilter} to perform partial highlighting,
+ * this won't work anymore since this filter doesn't update offsets. You should
+ * modify your analysis chain to use {@link NGramTokenizer}, and potentially
+ * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
  */
 public final class NGramTokenFilter extends TokenFilter {
   public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt	(révision 1490549)
+++ lucene/CHANGES.txt	(copie de travail)
@@ -47,6 +47,10 @@
   (a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
   whitespaces. (Adrien Grand)
 
+* LUCENE-5042: NGramTokenizer now correctly handles supplementary characters
+  and has the ability to pre-tokenize the input reader similarly to
+  CharTokenizer. (Adrien Grand)
+
 * LUCENE-4967: NRTManager is replaced by
   ControlledRealTimeReopenThread, for controlling which requests must
   see which indexing changes, so that it can work with any
