Index: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java	(revision 0)
+++ lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java	(working copy)
@@ -0,0 +1,146 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+
+public class TestGraphTokenizers extends LuceneTestCase {
+
+  /** Builds a token at offsets 0..0 with the given position
+   *  increment and position length; the setup itself is done
+   *  by the five-argument overload. */
+  private static Token token(String term, int posInc, int posLength) {
+    return token(term, posInc, posLength, 0, 0);
+  }
+
+  private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
+    final Token tok = new Token(term, startOffset, endOffset);  // offsets go straight to the constructor
+    tok.setPositionLength(posLength);
+    tok.setPositionIncrement(posInc);
+    return tok;
+  }
+
+  /** One token: the automaton must have the same language as makeString("abc"). */
+  public void testSingleToken() throws Exception {
+    final Token[] tokens = new Token[] {
+      token("abc", 1, 1),
+    };
+    final Automaton actual =
+      TokenStreamToAutomaton.toAutomaton(new CannedTokenStream(tokens));
+    final Automaton expected = BasicAutomata.makeString("abc");
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  // for debugging!  Dumps a.toDot() as UTF-8 to a fixed scratch path; always closes the file.
+  private static void toDot(Automaton a) throws IOException {
+    final Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"), "UTF-8");
+    try {
+      w.write(a.toDot());
+    } finally { w.close(); }
+  }
+
+  private static final Automaton POS_SEP = BasicAutomata.makeCharRange(TokenStreamToAutomaton.POS_SEP,
+                                                                       TokenStreamToAutomaton.POS_SEP);
+  /** Two consecutive tokens: expects "abc", then a POS_SEP arc, then "def". */
+  public void testTwoTokens() throws Exception {
+    final Token[] tokens = new Token[] {
+      token("abc", 1, 1),
+      token("def", 1, 1),
+    };
+    final Automaton actual =
+      TokenStreamToAutomaton.toAutomaton(new CannedTokenStream(tokens));
+    final Automaton expected = BasicOperations.concatenate(
+      BasicAutomata.makeString("abc"),
+      POS_SEP,
+      BasicAutomata.makeString("def"));
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  /** Two tokens stacked at the same position (a "sausage"):
+   *  expects the union of the two terms. */
+  public void testOverlappedTokensSausage() throws Exception {
+    final Token[] tokens = new Token[] {
+      token("abc", 1, 1),
+      token("xyz", 0, 1)
+    };
+    final Automaton actual =
+      TokenStreamToAutomaton.toAutomaton(new CannedTokenStream(tokens));
+    final Automaton expected = BasicOperations.union(
+      BasicAutomata.makeString("abc"),
+      BasicAutomata.makeString("xyz"));
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  /** "xyz" has position length 2 alongside "abc" then "def":
+   *  expects xyz | abc POS_SEP def. */
+  public void testOverlappedTokensLattice() throws Exception {
+    final Token[] tokens = new Token[] {
+      token("abc", 1, 1),
+      token("xyz", 0, 2),
+      token("def", 1, 1),
+    };
+    final Automaton actual =
+      TokenStreamToAutomaton.toAutomaton(new CannedTokenStream(tokens));
+    final Automaton spanning = BasicAutomata.makeString("xyz");
+    final Automaton stepwise = BasicOperations.concatenate(
+      BasicAutomata.makeString("abc"), POS_SEP, BasicAutomata.makeString("def"));
+    final Automaton expected = BasicOperations.union(spanning, stepwise);
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  /** "xyz" has position length 3 alongside "abc", "def", "ghi":
+   *  expects xyz | abc POS_SEP def POS_SEP ghi. */
+  public void testOverlappedTokensLattice2() throws Exception {
+    final Token[] tokens = new Token[] {
+      token("abc", 1, 1),
+      token("xyz", 0, 3),
+      token("def", 1, 1),
+      token("ghi", 1, 1),
+    };
+    final Automaton actual =
+      TokenStreamToAutomaton.toAutomaton(new CannedTokenStream(tokens));
+    final Automaton spanning = BasicAutomata.makeString("xyz");
+    final Automaton stepwise = BasicOperations.concatenate(
+      BasicAutomata.makeString("abc"), POS_SEP,
+      BasicAutomata.makeString("def"), POS_SEP,
+      BasicAutomata.makeString("ghi"));
+    final Automaton expected = BasicOperations.union(spanning, stepwise);
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testToDot() throws Exception {
+    final StringWriter out = new StringWriter();
+    final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
+    new TokenStreamToDot("abcd", ts, new PrintWriter(out)).toDot();
+    assertTrue(out.toString().contains("abc / abcd"));
+  }
+}
+

Property changes on: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/core/src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/Token.java	(revision 1296808)
+++ lucene/core/src/java/org/apache/lucene/analysis/Token.java	(working copy)
@@ -22,6 +22,7 @@
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc
@@ -121,13 +122,15 @@
 */
 public class Token extends CharTermAttributeImpl 
                    implements TypeAttribute, PositionIncrementAttribute,
-                              FlagsAttribute, OffsetAttribute, PayloadAttribute {
+                              FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute {
 
   private int startOffset,endOffset;
   private String type = DEFAULT_TYPE;
   private int flags;
   private Payload payload;
   private int positionIncrement = 1;
+  // nocommit should we NOT do this...?
+  private int positionLength = 1;
 
   /** Constructs a Token will null text. */
   public Token() {
@@ -270,6 +273,18 @@
     return positionIncrement;
   }
 
+  /** Sets the position length. NOTE(review): confirm clear()/copyTo()/equals()/hashCode() also cover this field. */
+  @Override
+  public void setPositionLength(int positionLength) {
+    this.positionLength = positionLength;
+  }
+
+  /** Returns the position length set via {@link #setPositionLength} (the field is initially 1). */
+  @Override
+  public int getPositionLength() {
+    return positionLength;
+  }
+
   /** Returns this Token's starting offset, the position of the first character
     corresponding to this token in the source text.
 
Index: lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java	(revision 1296808)
+++ lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java	(working copy)
@@ -90,8 +90,8 @@
    * <p>
    * Complexity: linear in total number of states.
    */
-  static public Automaton concatenate(List<Automaton> l) {
-    if (l.isEmpty()) return BasicAutomata.makeEmptyString();
+  static public Automaton concatenate(Automaton... l) {
+    if (l.length == 0) return BasicAutomata.makeEmptyString();
     boolean all_singleton = true;
     for (Automaton a : l)
       if (!a.isSingleton()) {
@@ -109,8 +109,8 @@
       Set<Integer> ids = new HashSet<Integer>();
       for (Automaton a : l)
         ids.add(System.identityHashCode(a));
-      boolean has_aliases = ids.size() != l.size();
-      Automaton b = l.get(0);
+      boolean has_aliases = ids.size() != l.length;
+      Automaton b = l[0];
       if (has_aliases) b = b.cloneExpanded();
       else b = b.cloneExpandedIfRequired();
       Set<State> ac = b.getAcceptStates();
@@ -191,7 +191,7 @@
     while (min-- > 0)
       as.add(a);
     as.add(repeat(a));
-    return concatenate(as);
+    return concatenate(as.toArray(new Automaton[as.size()]));
   }
   
   /**
@@ -213,7 +213,7 @@
       List<Automaton> as = new ArrayList<Automaton>();
       while (min-- > 0)
         as.add(a);
-      b = concatenate(as);
+      b = concatenate(as.toArray(new Automaton[as.size()]));
     }
     if (max > 0) {
       Automaton d = a.clone();
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java	(revision 1296808)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java	(working copy)
@@ -22,6 +22,7 @@
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 
 /**
  * emits a canned set of {@link Token}
@@ -31,6 +32,7 @@
   private int upto = 0;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   
   public CannedTokenStream(Token[] tokens) {
@@ -47,6 +49,7 @@
       termAtt.setEmpty();
       termAtt.append(token.toString());
       posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+      posLengthAtt.setPositionLength(token.getPositionLength());
       offsetAtt.setOffset(token.startOffset(), token.endOffset());
       return true;
     } else {
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java	(revision 0)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java	(working copy)
@@ -0,0 +1,134 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+/** Consumes a TokenStream and creates an {@link Automaton}. */
+public class TokenStreamToAutomaton {
+
+  // nocommit: what bytes to steal!
+
+  // We create transition w/ this label when posInc is 1:
+  public static final int POS_SEP = 0;
+
+  // nocommit move to oal.util.automaton?
+  // nocommit: toFST?  then we can translate atts into FST weights
+
+  /** Pulls the graph (including {@link
+   *  PositionLengthAttribute}) from the provided {@link
+   *  TokenStream}, and creates the corresponding
+   *  automaton where arcs are bytes from each term.
+   *  A {@link #POS_SEP} arc is added when a token advances
+   *  to a position where an earlier token ended; states still
+   *  pending when the stream is exhausted become accept states. */
+  public static Automaton toAutomaton(TokenStream in) throws IOException {
+    final Automaton a = new Automaton();
+
+    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
+    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
+    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    final BytesRef term = termBytesAtt.getBytesRef();
+
+    in.reset();
+
+    // Only temporarily holds states ahead of our current
+    // position:
+    // nocommit maybe linked list...?
+    final Map<Integer,State> posToState = new HashMap<Integer,State>();
+
+    State currentFromState = null;
+    int pos = -1;
+    while (in.incrementToken()) {
+      int posInc = posIncAtt.getPositionIncrement();
+      if (currentFromState == null && posInc == 0) {
+        // TODO: hmm are TS's still allowed to do this...?
+        posInc = 1;
+      }
+
+      if (posInc > 0) {
+        // New node:
+        pos += posInc;
+        final State nextFromState;
+        final State lastEndState = posToState.get(pos);
+        if (lastEndState == null) {
+          // nocommit invalid assert!!  if a syn matched
+          // over what is now a hole this assert falsely
+          // trips... make test!!
+          assert currentFromState == null;
+          nextFromState = a.getInitialState();
+        } else {
+          nextFromState = new State();
+          posToState.remove(pos);
+          // nocommit if posInc > 1 what to do...?  multiple SEP?
+          lastEndState.addTransition(new Transition(POS_SEP, nextFromState));
+        }
+        currentFromState = nextFromState;
+      }
+
+      // nocommit: make test for this:
+
+      // nocommit does posLengthAtt make it possible to
+      // create broken graph?  ie what if posInc skips over
+      // the node created by a previous posLengthAtt!?
+      // hrm.  actually: we must handle this case!  it means
+      // eg a syn matched a stop word but then stop word was
+      // deleted...
+
+      final int endPos = pos + posLengthAtt.getPositionLength();
+
+      termBytesAtt.fillBytesRef();
+      State endState = posToState.get(endPos);
+      if (endState == null) {
+        endState = new State();
+        posToState.put(endPos, endState);
+      }
+
+      State lastState = currentFromState;
+
+      for(int byteIDX=0;byteIDX<term.length;byteIDX++) {
+        final State nextState;
+        if (byteIDX == term.length-1) {
+          nextState = endState;
+        } else {
+          nextState = new State();
+        }
+
+        lastState.addTransition(new Transition(term.bytes[term.offset + byteIDX] & 0xff, nextState));
+        lastState = nextState;
+      }
+    }
+
+    // nocommit is this... right?
+    for(State endState : posToState.values()) {
+      endState.setAccept(true);
+    }
+
+    return a;
+  }
+}

Property changes on: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
