Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestDocumentWriter.java	(revision 414705)
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java	(working copy)
@@ -16,11 +16,15 @@
  * limitations under the License.
  */
 
+import java.util.LinkedList;
+import java.util.List;
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.TokenSelector;
 import org.apache.lucene.document.*;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;
@@ -54,6 +58,16 @@
     Analyzer analyzer = new WhitespaceAnalyzer();
     Similarity similarity = Similarity.getDefault();
     DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
+    writer.setTermVectorTokenSelector(new TokenSelector(){
+      public boolean accept(String field, Token t) {
+        return Character.isLowerCase(t.termText().charAt(0));
+      }
+    });
+    writer.setPositionsTokenSelector(new TokenSelector(){
+      public boolean accept(String field, Token t) {
+        return Character.isLowerCase(t.termText().charAt(0));
+      }
+    });
     String segName = "test";
     writer.addDocument(segName, testDoc);
     //After adding the document, we should be able to read it back in
@@ -84,6 +98,31 @@
     fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
     assertTrue(fields != null && fields.length == 1);
     assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
+    
+    fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY);
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT));
+    assertTrue(fields[0].isTermVectorStored());
+    TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY);
+    assertTrue(tv != null);
+    String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+");
+    String[] tvwords = tv.getTerms();
+    List uniques = new LinkedList();
+    int omitted = 0;
+    for (int i=0; i<words.length; i++)
+      if (!uniques.contains(words[i])) {
+        uniques.add(words[i]);
+        if (!Character.isLowerCase(words[i].charAt(0)))
+          omitted++;
+      }
+    assertTrue(omitted!=0);
+    assertTrue(omitted!=uniques.size());
+    assertEquals(uniques.size()-omitted, tvwords.length);
+    for (int i=0; i<uniques.size(); i++) {
+      for (int j=0; j<tvwords.length; j++)
+        if (uniques.get(i).equals(tvwords[j]))
+          assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0)));
+    }      
 
     // test that the norm file is not present if omitNorms is true
     for (int i = 0; i < reader.fieldInfos.size(); i++) {
Index: src/java/org/apache/lucene/analysis/TokenSelector.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenSelector.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/TokenSelector.java	(revision 0)
@@ -0,0 +1,24 @@
+/*
+ * TokenSelector.java
+ *
+ * Created on June 13, 2006, 12:18 PM
+ *
+ */
+
+package org.apache.lucene.analysis;
+
+/**
+ * An interface for selecting a subset of a token stream
+ *
+ * @author Chuck Wiliams
+ */
+public interface TokenSelector {
+    
+  /** Determine if a token should be selected
+   * @param fieldName field in which token was found
+   * @param token a token
+   * @return true iff token should be selected
+   */
+  public boolean accept(String fieldName, Token token);
+    
+}
Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java
===================================================================
--- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java	(revision 0)
+++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java	(revision 0)
@@ -0,0 +1,44 @@
+/*
+ * PerFieldTokenSelectorWrapper.java
+ *
+ * Created on June 13, 2006, 4:09 PM
+ *
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Expert: TokenSelector that implements a mapping from field names to TokenSelectors
+ *
+ * @author Chuck Williams
+ */
+public class PerFieldTokenSelectorWrapper implements TokenSelector {
+  
+  private Map selectors = new HashMap();
+  private TokenSelector defaultSelector;
+  
+  /** Expert: create a PerFieldTokenSelector with given default selector (null means select all) */
+  public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) {
+    this.defaultSelector = defaultSelector;
+  }
+  
+  /** Add a token selector for the named field */
+  public void addSelector(String fieldName, TokenSelector selector) {
+    selectors.put(fieldName, selector);
+  }
+  
+  /** Determine if token is accepted by fieldName */
+  public boolean accept(String fieldName, Token token) {
+    TokenSelector selector = (TokenSelector) selectors.get(fieldName);
+    if (selector!=null)
+        return selector.accept(fieldName, token);
+    else if (defaultSelector!=null)
+        return defaultSelector.accept(fieldName, token);
+    else
+        return true;
+  }
+    
+}
\ No newline at end of file
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java	(revision 414705)
+++ src/java/org/apache/lucene/index/IndexWriter.java	(working copy)
@@ -17,6 +17,7 @@
  */
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenSelector;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.Directory;
@@ -100,8 +101,10 @@
    */
   public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
   
-  private Directory directory;  // where this index resides
-  private Analyzer analyzer;    // how to analyze text
+  private Directory directory;                      // where this index resides
+  private Analyzer analyzer;                        // how to analyze text
+  private TokenSelector termVectorTokenSelector;    // subset of token stream stored in term vectors
+  private TokenSelector positionsTokenSelector;     // subset of token stream for which positions are stored
 
   private Similarity similarity = Similarity.getDefault(); // how to normalize
 
@@ -153,6 +156,38 @@
     return this.similarity;
   }
 
+  /** Expert:  Set the TokenSelector used to determine subset of tokens stored in term vectors.
+   * @param selector the term vector TokenSelector
+   */
+  public void setTermVectorTokenSelector(TokenSelector selector) {
+    this.termVectorTokenSelector = selector;
+  }
+  
+  /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors.
+   * @return the TokenSelector used to determine term vector tokens
+   */
+  public TokenSelector getTermVectorTokenSelector() {
+    return termVectorTokenSelector;
+  }
+
+  /** Expert:  Set the TokenSelector used to determine subset of tokens for which positions are stored.
+   *           (At least one position is always stored for each term in each doc to ensure the term stays in
+   *            the index so long as any docs reference it)
+   * @param selector the positions TokenSelector
+   */
+  public void setPositionsTokenSelector(TokenSelector selector) {
+    this.positionsTokenSelector = selector;
+  }
+  
+  /** Expert: Set the TokenSelector used to determine subset of tokens for which freq and positions are stored..
+   *          (At least one position is always stored for each term in each doc to ensure the term stays in
+   *           the index so long as any docs reference it)
+   * @return the positions TokenSelector
+   */
+  public TokenSelector getPositionsTokenSelector() {
+    return positionsTokenSelector;
+  }
+
   /** Expert: Set the interval between indexed terms.  Large values cause less
    * memory to be used by IndexReader, but slow random-access to terms.  Small
    * values cause more memory to be used by an IndexReader, and speed
@@ -471,6 +506,8 @@
   public void addDocument(Document doc, Analyzer analyzer) throws IOException {
     DocumentWriter dw =
       new DocumentWriter(ramDirectory, analyzer, this);
+    dw.setTermVectorTokenSelector(termVectorTokenSelector);
+    dw.setPositionsTokenSelector(positionsTokenSelector);
     dw.setInfoStream(infoStream);
     String segmentName = newSegmentName();
     dw.addDocument(segmentName, doc);
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java	(revision 414705)
+++ src/java/org/apache/lucene/index/DocumentWriter.java	(working copy)
@@ -17,6 +17,7 @@
  */
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenSelector;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
@@ -35,6 +36,8 @@
 
 final class DocumentWriter {
   private Analyzer analyzer;
+  private TokenSelector termVectorTokenSelector;
+  private TokenSelector positionsTokenSelector;
   private Directory directory;
   private Similarity similarity;
   private FieldInfos fieldInfos;
@@ -142,9 +145,9 @@
         if (!field.isTokenized()) {		  // un-tokenized field
           String stringValue = field.stringValue();
           if(field.isStoreOffsetWithTermVector())
-            addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
+            addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false);
           else
-            addPosition(fieldName, stringValue, position++, null);
+            addPosition(fieldName, stringValue, position++, null, false, false);
           offset += stringValue.length();
           length++;
         } else 
@@ -165,10 +168,16 @@
             for (Token t = stream.next(); t != null; t = stream.next()) {
               position += (t.getPositionIncrement() - 1);
               
-              if(field.isStoreOffsetWithTermVector())
-                addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
-              else
-                addPosition(fieldName, t.termText(), position++, null);
+              boolean omittv = false, omitpos = false;
+              if (termVectorTokenSelector!=null && !termVectorTokenSelector.accept(field.name(), t))
+                  omittv  = true;
+              if (positionsTokenSelector !=null && !positionsTokenSelector. accept(field.name(), t))
+                  omitpos = true;
+              
+              addPosition(fieldName, t.termText(), position++,
+                          field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())
+                                                                         : null,
+                          omittv, omitpos);
               
               lastToken = t;
               if (++length > maxFieldLength) {
@@ -196,20 +205,24 @@
 
   private final Term termBuffer = new Term("", ""); // avoid consing
 
-  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
+  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset,
+                                 boolean omitFromTermVector, boolean omitPosition) {
     termBuffer.set(field, text);
     //System.out.println("Offset: " + offset);
     Posting ti = (Posting) postingTable.get(termBuffer);
     if (ti != null) {				  // word seen before
       int freq = ti.freq;
-      if (ti.positions.length == freq) {	  // positions array is full
-        int[] newPositions = new int[freq * 2];	  // double size
-        int[] positions = ti.positions;
-        for (int i = 0; i < freq; i++)		  // copy old positions to new
-          newPositions[i] = positions[i];
-        ti.positions = newPositions;
+      
+      if (!omitPosition) {
+        if (ti.positions.length == freq) {        // positions array is full
+          int[] newPositions = new int[freq * 2]; // double size
+          int[] positions = ti.positions;
+          for (int i = 0; i < freq; i++)          // copy old positions to new
+            newPositions[i] = positions[i];
+          ti.positions = newPositions;
+        }
+        ti.positions[freq] = position;            // add new position
       }
-      ti.positions[freq] = position;		  // add new position
 
       if (offset != null) {
         if (ti.offsets.length == freq){
@@ -223,10 +236,12 @@
         }
         ti.offsets[freq] = offset;
       }
-      ti.freq = freq + 1;			  // update frequency
-    } else {					  // word not seen before
+      
+      if (!omitPosition)
+        ti.freq = freq + 1;                       // update frequency
+    } else {                                      // word not seen before
       Term term = new Term(field, text, false);
-      postingTable.put(term, new Posting(term, position, offset));
+      postingTable.put(term, new Posting(term, position, offset, omitFromTermVector));
     }
   }
 
@@ -351,7 +366,7 @@
             termVectorWriter.closeField();
           }
         }
-        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
+        if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) {
             termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
         }
       }
@@ -390,6 +405,16 @@
     this.infoStream = infoStream;
   }
 
+  /** If non-null, this will be used to select which tokens are stored in term vectors */
+  void setTermVectorTokenSelector(TokenSelector selector) {
+    this.termVectorTokenSelector = selector;
+  }
+
+  /** If non-null, this will be used to select which tokens have positions stored in the index. */
+  void setPositionsTokenSelector(TokenSelector selector) {
+    this.positionsTokenSelector = selector;
+  }
+
 }
 
 final class Posting {				  // info about a Term in a doc
@@ -397,17 +422,17 @@
   int freq;					  // its frequency in doc
   int[] positions;				  // positions it occurs at
   TermVectorOffsetInfo [] offsets;
+  boolean omitFromTermVector;                     // if true, omit from term vector
 
-  Posting(Term t, int position, TermVectorOffsetInfo offset) {
+  Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) {
     term = t;
     freq = 1;
     positions = new int[1];
     positions[0] = position;
-    if(offset != null){
-    offsets = new TermVectorOffsetInfo[1];
-    offsets[0] = offset;
+    if(offset != null) {
+      offsets = new TermVectorOffsetInfo[1];
+      offsets[0] = offset;
     }
-    else
-      offsets = null;
+    this.omitFromTermVector = omitFromTermVector;
   }
 }
