Index: lucene/src/test/org/apache/lucene/index/index.36.surrogates.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: lucene/src/test/org/apache/lucene/index/index.36.surrogates.zip
===================================================================
--- lucene/src/test/org/apache/lucene/index/index.36.surrogates.zip	(revision 1232535)
+++ lucene/src/test/org/apache/lucene/index/index.36.surrogates.zip	(working copy)

Property changes on: lucene/src/test/org/apache/lucene/index/index.36.surrogates.zip
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
Index: lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java	(revision 1232535)
+++ lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java	(working copy)
@@ -734,5 +734,16 @@
       dir.close();
     }
   }
+  
+  public static final String surrogatesIndexName = "index.36.surrogates.zip";
 
+  public void testSurrogates() throws Exception {
+    File oldIndexDir = _TestUtil.getTempDir("surrogates");
+    _TestUtil.unzip(getDataFile(surrogatesIndexName), oldIndexDir); // unpack the zipped index fixture
+    Directory dir = newFSDirectory(oldIndexDir);
+    // TODO: more tests; for now this only runs checkIndex over the unzipped index
+    _TestUtil.checkIndex(dir);
+    dir.close();
+  }
+
 }
Index: lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java
===================================================================
--- lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java	(revision 1232535)
+++ lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java	(working copy)
@@ -18,6 +18,7 @@
  */
 
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.TermVectorsWriter;
@@ -365,4 +366,9 @@
     IOUtils.close(tvx, tvd, tvf);
     tvx = tvd = tvf = null;
   }
+
+  @Override
+  public Comparator<BytesRef> getComparator() throws IOException {
+    return BytesRef.getUTF8SortedAsUnicodeComparator(); // terms are sorted with this before being fed to this writer
+  }
 }
Index: lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java
===================================================================
--- lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java	(revision 1232652)
+++ lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java	(working copy)
@@ -18,6 +18,7 @@
  */
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Map;
@@ -265,11 +266,13 @@
   private class TVTerms extends Terms {
     private final int numTerms;
     private final long tvfFPStart;
+    private final boolean unicodeSortOrder;
 
     public TVTerms(long tvfFP) throws IOException {
       tvf.seek(tvfFP);
       numTerms = tvf.readVInt();
       tvfFPStart = tvf.getFilePointer();
+      unicodeSortOrder = sortTermsByUnicode();
     }
 
     @Override
@@ -283,7 +286,7 @@
       } else {
         termsEnum = new TVTermsEnum();
       }
-      termsEnum.reset(numTerms, tvfFPStart);
+      termsEnum.reset(numTerms, tvfFPStart, unicodeSortOrder);
       return termsEnum;
     }
 
@@ -310,28 +313,33 @@
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      // TODO: really indexer hardwires
-      // this...?  I guess codec could buffer and re-sort...
-      return BytesRef.getUTF8SortedAsUnicodeComparator();
+      if (unicodeSortOrder) {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      } else {
+        return BytesRef.getUTF8SortedAsUTF16Comparator();
+      }
     }
   }
 
+  static class TermAndPostings { // one buffered term of a term vector, together with its postings data
+    BytesRef term; // complete term bytes (shared prefix already expanded)
+    int freq; // frequency read for this term; also the length of the arrays below
+    int[] positions; // absolute positions (deltas already summed); null unless positions are stored
+    int[] startOffsets; // absolute start offsets; null unless offsets are stored
+    int[] endOffsets; // absolute end offsets; null unless offsets are stored
+  }
+  
   private class TVTermsEnum extends TermsEnum {
+    private boolean unicodeSortOrder;
     private final IndexInput origTVF;
     private final IndexInput tvf;
     private int numTerms;
-    private int nextTerm;
-    private int freq;
-    private BytesRef lastTerm = new BytesRef();
-    private BytesRef term = new BytesRef();
+    private int currentTerm;
     private boolean storePositions;
     private boolean storeOffsets;
-    private long tvfFP;
+    
+    private TermAndPostings[] termAndPostings;
 
-    private int[] positions;
-    private int[] startOffsets;
-    private int[] endOffsets;
-
     // NOTE: tvf is pre-positioned by caller
     public TVTermsEnum() throws IOException {
       this.origTVF = Lucene3xTermVectorsReader.this.tvf;
@@ -342,37 +350,81 @@
       return tvf == origTVF;
     }
 
-    public void reset(int numTerms, long tvfFPStart) throws IOException {
+    public void reset(int numTerms, long tvfFPStart, boolean unicodeSortOrder) throws IOException {
       this.numTerms = numTerms;
-      nextTerm = 0;
+      currentTerm = -1;
       tvf.seek(tvfFPStart);
       final byte bits = tvf.readByte();
       storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
       storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
-      tvfFP = 1+tvfFPStart;
-      positions = null;
-      startOffsets = null;
-      endOffsets = null;
+      this.unicodeSortOrder = unicodeSortOrder;
+      readVectors();
+      if (unicodeSortOrder) {
+        Arrays.sort(termAndPostings, new Comparator<TermAndPostings>() {
+          public int compare(TermAndPostings left, TermAndPostings right) {
+            return left.term.compareTo(right.term);
+          }
+        });
+      }
     }
+    
+    private void readVectors() throws IOException { // eagerly decodes all numTerms terms from tvf into termAndPostings
+      termAndPostings = new TermAndPostings[numTerms];
+      BytesRef lastTerm = new BytesRef(); // previous term; supplies the shared prefix of the next one
+      for (int i = 0; i < numTerms; i++) {
+        TermAndPostings t = new TermAndPostings();
+        BytesRef term = new BytesRef(); // fresh instance per term, since every term is kept in the array
+        term.copyBytes(lastTerm);
+        final int start = tvf.readVInt(); // number of leading bytes kept from the previous term
+        final int deltaLen = tvf.readVInt(); // number of suffix bytes that follow in the file
+        term.length = start + deltaLen;
+        term.grow(term.length);
+        tvf.readBytes(term.bytes, start, deltaLen); // suffix is written right after the kept prefix
+        t.term = term;
+        int freq = tvf.readVInt();
+        t.freq = freq;
+        
+        if (storePositions) {
+          int positions[] = new int[freq];
+          int pos = 0;
+          for(int posUpto=0;posUpto<freq;posUpto++) {
+            pos += tvf.readVInt(); // positions are stored as deltas from the previous position
+            positions[posUpto] = pos;
+          }
+          t.positions = positions;
+        }
 
+        if (storeOffsets) {
+          int startOffsets[] = new int[freq];
+          int endOffsets[] = new int[freq];
+          int offset = 0; // end offset of the previous occurrence (0 before the first)
+          for(int posUpto=0;posUpto<freq;posUpto++) {
+            startOffsets[posUpto] = offset + tvf.readVInt(); // start is a delta from the previous end offset
+            offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt(); // end is a delta from this start
+          }
+          t.startOffsets = startOffsets;
+          t.endOffsets = endOffsets;
+        }
+        lastTerm.copyBytes(term); // remember this term as the prefix source for the next iteration
+        termAndPostings[i] = t;
+      }
+    }
+
     // NOTE: slow!  (linear scan)
     @Override
-    public SeekStatus seekCeil(BytesRef text, boolean useCache)
-      throws IOException {
-      if (nextTerm != 0 && text.compareTo(term) < 0) {
-        nextTerm = 0;
-        tvf.seek(tvfFP);
-      }
-
-      while (next() != null) {
-        final int cmp = text.compareTo(term);
+    public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException {
+      Comparator<BytesRef> comparator = getComparator();
+      for (int i = 0; i < numTerms; i++) {
+        int cmp = comparator.compare(text, termAndPostings[i].term);
         if (cmp < 0) {
+          currentTerm = i;
           return SeekStatus.NOT_FOUND;
         } else if (cmp == 0) {
+          currentTerm = i;
           return SeekStatus.FOUND;
         }
       }
-
+      currentTerm = termAndPostings.length;
       return SeekStatus.END;
     }
 
@@ -383,47 +435,15 @@
 
     @Override
     public BytesRef next() throws IOException {
-      if (nextTerm >= numTerms) {
+      if (++currentTerm >= numTerms) {
         return null;
       }
-      term.copyBytes(lastTerm);
-      final int start = tvf.readVInt();
-      final int deltaLen = tvf.readVInt();
-      term.length = start + deltaLen;
-      term.grow(term.length);
-      tvf.readBytes(term.bytes, start, deltaLen);
-      freq = tvf.readVInt();
-
-      if (storePositions) {
-        // TODO: we could maybe reuse last array, if we can
-        // somehow be careful about consumer never using two
-        // D&PEnums at once...
-        positions = new int[freq];
-        int pos = 0;
-        for(int posUpto=0;posUpto<freq;posUpto++) {
-          pos += tvf.readVInt();
-          positions[posUpto] = pos;
-        }
-      }
-
-      if (storeOffsets) {
-        startOffsets = new int[freq];
-        endOffsets = new int[freq];
-        int offset = 0;
-        for(int posUpto=0;posUpto<freq;posUpto++) {
-          startOffsets[posUpto] = offset + tvf.readVInt();
-          offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt();
-        }
-      }
-
-      lastTerm.copyBytes(term);
-      nextTerm++;
-      return term;
+      return term();
     }
 
     @Override
     public BytesRef term() {
-      return term;
+      return termAndPostings[currentTerm].term;
     }
 
     @Override
@@ -438,7 +458,7 @@
 
     @Override
     public long totalTermFreq() {
-      return freq;
+      return termAndPostings[currentTerm].freq;
     }
 
     @Override
@@ -449,7 +469,7 @@
       } else {
         docsEnum = new TVDocsEnum();
       }
-      docsEnum.reset(liveDocs, freq);
+      docsEnum.reset(liveDocs, termAndPostings[currentTerm]);
       return docsEnum;
     }
 
@@ -469,15 +489,17 @@
       } else {
         docsAndPositionsEnum = new TVDocsAndPositionsEnum();
       }
-      docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
+      docsAndPositionsEnum.reset(liveDocs, termAndPostings[currentTerm]);
       return docsAndPositionsEnum;
     }
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      // TODO: really indexer hardwires
-      // this...?  I guess codec could buffer and re-sort...
-      return BytesRef.getUTF8SortedAsUnicodeComparator();
+      if (unicodeSortOrder) {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      } else {
+        return BytesRef.getUTF8SortedAsUTF16Comparator();
+      }
     }
   }
 
@@ -518,9 +540,9 @@
       }
     }
 
-    public void reset(Bits liveDocs, int freq) {
+    public void reset(Bits liveDocs, TermAndPostings termAndPostings) {
       this.liveDocs = liveDocs;
-      this.freq = freq;
+      this.freq = termAndPostings.freq;
       this.doc = -1;
       didNext = false;
     }
@@ -569,11 +591,11 @@
       }
     }
 
-    public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
+    public void reset(Bits liveDocs, TermAndPostings termAndPostings) {
       this.liveDocs = liveDocs;
-      this.positions = positions;
-      this.startOffsets = startOffsets;
-      this.endOffsets = endOffsets;
+      this.positions = termAndPostings.positions;
+      this.startOffsets = termAndPostings.startOffsets;
+      this.endOffsets = termAndPostings.endOffsets;
       this.doc = -1;
       didNext = false;
       nextPos = 0;
@@ -668,5 +690,14 @@
       }
     }
   }
+  
+  // If this returns true, we do the surrogates shuffle so that the
+  // terms are sorted by unicode sort order.  This should be
+  // true when segments are used for "normal" searching;
+  // it's only false during testing, to create a pre-flex
+  // index, using the test-only PreFlexRW.
+  protected boolean sortTermsByUnicode() {
+    return true;
+  }
 }
 
Index: lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsWriter.java
===================================================================
--- lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsWriter.java	(revision 1232535)
+++ lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsWriter.java	(working copy)
@@ -18,6 +18,7 @@
  */
 
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.index.FieldInfo;
@@ -170,6 +171,11 @@
     }
   }
   
+  @Override
+  public Comparator<BytesRef> getComparator() throws IOException {
+    return BytesRef.getUTF8SortedAsUnicodeComparator(); // terms are sorted with this before being fed to this writer
+  }
+  
   private void write(String s) throws IOException {
     SimpleTextUtil.write(out, s, scratch);
   }
Index: lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java
===================================================================
--- lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java	(revision 1232535)
+++ lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java	(working copy)
@@ -19,6 +19,7 @@
 
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.DocsEnum;
@@ -273,4 +274,8 @@
       assert termCount == numTerms;
     }
   }
+  
+  /** Returns the {@link BytesRef} Comparator with which terms
+   *  are sorted before they are fed to this API. */
+  public abstract Comparator<BytesRef> getComparator() throws IOException;
 }
Index: lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java	(revision 1232535)
+++ lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java	(working copy)
@@ -118,9 +118,7 @@
     TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
     final TermVectorsWriter tv = termsWriter.writer;
 
-    // TODO: we may want to make this sort in same order
-    // as Codec's terms dict?
-    final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
+    final int[] termIDs = termsHashPerField.sortPostings(tv.getComparator());
 
     tv.startField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets);
     
Index: lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsWriter.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsWriter.java	(revision 1232652)
+++ lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsWriter.java	(working copy)
@@ -18,6 +18,7 @@
  */
 
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader;
@@ -32,7 +33,6 @@
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.StringHelper;
 
-// TODO: surrogates dance!
 public final class PreFlexRWTermVectorsWriter extends TermVectorsWriter {
   private final Directory directory;
   private final String segment;
@@ -213,4 +213,9 @@
     IOUtils.close(tvx, tvd, tvf);
     tvx = tvd = tvf = null;
   }
+  
+  @Override
+  public Comparator<BytesRef> getComparator() throws IOException {
+    return BytesRef.getUTF8SortedAsUTF16Comparator(); // pre-flex writer: terms are fed in UTF16 sort order
+  }
 }
Index: lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsFormat.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsFormat.java	(revision 1232652)
+++ lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsFormat.java	(working copy)
@@ -19,10 +19,15 @@
 
 import java.io.IOException;
 
+import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsFormat;
+import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.LuceneTestCase;
 
 public class PreFlexRWTermVectorsFormat extends Lucene3xTermVectorsFormat {
 
@@ -30,4 +35,30 @@
   public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
     return new PreFlexRWTermVectorsWriter(directory, segment, context);
   }
+
+  @Override
+  public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
+    return new Lucene3xTermVectorsReader(directory, segmentInfo, fieldInfos, context) {
+      @Override
+      protected boolean sortTermsByUnicode() {
+        // We carefully peek into the stack trace above us: if
+        // we are part of a "merge", we must sort by UTF16:
+        boolean unicodeSortOrder = true;
+
+        StackTraceElement[] trace = new Exception().getStackTrace();
+        for (int i = 0; i < trace.length; i++) {
+          //System.out.println(trace[i].getClassName());
+          if ("merge".equals(trace[i].getMethodName())) {
+            unicodeSortOrder = false;
+            if (LuceneTestCase.VERBOSE) {
+              System.out.println("NOTE: PreFlexRW codec: forcing legacy UTF16 vector term sort order");
+            }
+            break;
+          }
+        }
+
+        return unicodeSortOrder;
+      }
+    };
+  }
 }
