Index: lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java Fri Jun 24 06:46:54 2011 -0400 @@ -0,0 +1,52 @@ +package org.apache.lucene.store; + +import org.apache.lucene.util.BytesRef; + +/** + * @lucene.experimental + */ +public class ByteArrayDataOutput extends DataOutput { + private byte[] bytes; + + private int pos; + private int limit; + + public ByteArrayDataOutput(byte[] bytes) { + reset(bytes); + } + + public ByteArrayDataOutput(byte[] bytes, int offset, int len) { + reset(bytes, offset, len); + } + + public ByteArrayDataOutput() { + reset(BytesRef.EMPTY_BYTES); + } + + public void reset(byte[] bytes) { + reset(bytes, 0, bytes.length); + } + + public void reset(byte[] bytes, int offset, int len) { + this.bytes = bytes; + pos = offset; + limit = offset + len; + } + + public int getPosition() { + return pos; + } + + @Override + public void writeByte(byte b) { + assert pos < limit; + bytes[pos++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + assert pos + length <= limit; + System.arraycopy(b, offset, bytes, pos, length); + pos += length; + } +} Index: lucene/src/java/org/apache/lucene/util/CharsRef.java --- lucene/src/java/org/apache/lucene/util/CharsRef.java Thu Jun 23 10:34:27 2011 -0400 +++ lucene/src/java/org/apache/lucene/util/CharsRef.java Fri Jun 24 06:46:54 2011 -0400 @@ -1,5 +1,7 @@ package org.apache.lucene.util; +import java.util.Comparator; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -166,7 +168,11 @@ * the {@link CharsRef} to copy */ public void copy(CharsRef other) { - chars = ArrayUtil.grow(chars, other.length); + if (chars == null) { + chars = new char[other.length]; + } else { + chars = ArrayUtil.grow(chars, other.length); + } System.arraycopy(other.chars, other.offset, chars, 0, other.length); length = other.length; offset = 0; @@ -212,4 +218,56 @@ public CharSequence subSequence(int start, int end) { return new CharsRef(chars, offset + start, offset + end - 1); } + + private final static Comparator utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator(); + + public static Comparator getUTF16SortedAsUTF8Comparator() { + return utf16SortedAsUTF8SortOrder; + } + + private static class UTF16SortedAsUTF8Comparator implements Comparator { + // Only singleton + private UTF16SortedAsUTF8Comparator() {}; + + public int compare(CharsRef a, CharsRef b) { + if (a == b) + return 0; + + final char[] aChars = a.chars; + int aUpto = a.offset; + final char[] bChars = b.chars; + int bUpto = b.offset; + + final int aStop = aUpto + Math.min(a.length, b.length); + + while (aUpto < aStop) { + char aChar = aChars[aUpto++]; + char bChar = bChars[bUpto++]; + if (aChar != bChar) { + // http://icu-project.org/docs/papers/utf16_code_point_order.html + + /* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */ + if (aChar >= 0xd800 && bChar >= 0xd800) { + if (aChar >= 0xe000) { + aChar -= 0x800; + } else { + aChar += 0x2000; + } + + if (bChar >= 0xe000) { + bChar -= 0x800; + } else { + bChar += 0x2000; + } + } + + /* now aChar and bChar are in code point order */ + return (int)aChar - (int)bChar; /* int must be 32 bits wide */ + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + } } \ No newline at end of file Index: lucene/src/test/org/apache/lucene/util/TestCharsRef.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
lucene/src/test/org/apache/lucene/util/TestCharsRef.java Fri Jun 24 06:46:54 2011 -0400 @@ -0,0 +1,41 @@ +package org.apache.lucene.util; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestCharsRef extends LuceneTestCase { + public void testUTF16InUTF8Order() { + final int numStrings = atLeast(1000); + BytesRef utf8[] = new BytesRef[numStrings]; + CharsRef utf16[] = new CharsRef[numStrings]; + + for (int i = 0; i < numStrings; i++) { + String s = _TestUtil.randomUnicodeString(random); + utf8[i] = new BytesRef(s); + utf16[i] = new CharsRef(s); + } + + Arrays.sort(utf8); + Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator()); + + for (int i = 0; i < numStrings; i++) { + assertEquals(utf8[i].utf8ToString(), utf16[i].toString()); + } + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilter.java Fri Jun 24 06:46:54 2011 -0400 @@ -0,0 +1,369 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.FST; + +// nocommit -- explain that this is greedy and what that means + +/* Token filter that runs the terms of the wrapped stream through the FST of an FSTSynonymMap: parse() looks for matches and buffers results in futureInputs and futureOutputs, and incrementToken() replays those buffers before asking parse() for more. NOTE(review): in this copy of the patch some code appears to have been lost wherever a less-than sign was followed by a greater-than sign (the constructor loop, the per-char loop inside parse(), and the head of the method that decodes outputs); recover those parts from the original patch before relying on this listing. Debug System.out.println calls are still present throughout. */ public final class FSTSynonymFilter extends TokenFilter { + private final FSTSynonymMap synonyms; + private final boolean ignoreCase; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + + // How many future input tokens have already been matched + // to a synonym; because the matching is "greedy" we don't + // try to do any more matching for such tokens: + private int inputMatchCount; + + // How many future input tokens we've stored from 
past + // lookaheads: + private int inputPendingCount; + + /* One buffered input token: the captured attribute state plus a copy of its term chars; keepOrig is read by incrementToken() to decide whether the original token is replayed. */ private static class PendingInput { + AttributeSource.State state; + final CharsRef term = new CharsRef(); + boolean keepOrig; + + public void reset() { + keepOrig = false; + state = null; + } + }; + + // Rolling buffer, holding pending input tokens we had to + // clone because we needed to look ahead, indexed by + // position: + private final PendingInput[] futureInputs; + + // Holds pending output synonyms for one future position: + private static class PendingOutputs { + CharsRef[] outputs; + int upto; + int count; + + public PendingOutputs() { + outputs = new CharsRef[1]; + } + + public void reset() { + upto = count = 0; + } + + public void add(char[] output, int offset, int len) { + /* NOTE(review): growth is triggered when upto equals count (replay cursor equals fill count), not when count reaches outputs.length, so the write to outputs[count] below is not guarded against a full array - confirm intended. */ if (upto == count) { + final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(outputs, 0, next, 0, count); + outputs = next; + } + if (outputs[count] == null) { + outputs[count] = new CharsRef(); + } + outputs[count].copy(output, offset, len); + count++; + } + }; + + private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + // Rolling buffer, holding stack of pending synonym + // outputs, indexed by position: + private final PendingOutputs[] futureOutputs; + + // Current position in the two rolling buffers: + private int rollUpto; + + private boolean end; + + private final FST.Arc scratchArc; + + private final FST fst; + + private final BytesRef scratchBytes = new BytesRef(); + private final CharsRef scratchChars = new CharsRef(); + + // nocommit be sure we test no syns case... fst will be null... + + // nocommit be sure we strongly disallow empty input! hmm + // empty output is just silly too? + + /** + * @param input input tokenstream + * @param synonyms synonym map + * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. 
+ * Note, if you set this to true, its your responsibility to lowercase + * the input entries when you create the {@link FSTSynonymMap} + */ + + public FSTSynonymFilter(TokenStream input, FSTSynonymMap synonyms, boolean ignoreCase) { + super(input); + this.synonyms = synonyms; + this.ignoreCase = ignoreCase; + this.fst = synonyms.fst; + + futureInputs = new PendingInput[synonyms.maxHorizontalContext]; + futureOutputs = new PendingOutputs[synonyms.maxHorizontalContext]; + for(int pos=0;pos(); + } + + /* + This is the core of this TokenFilter: it locates the + synonym matches and buffers up the results into + futureInputs/Outputs. + + NOTE: this calls input.incrementToken and does not + capture the state if no further tokens were checked. So + caller must then forward state to our caller, or capture: + */ + + private void parse() throws IOException { + System.out.println("S: parse"); + + assert inputMatchCount == 0; + + final int rollUptoLimit = rollAdd(rollUpto, inputPendingCount); + + int upto = rollUpto; + + // Holds the longest match we've seen so far: + BytesRef matchOutput = null; + int matchInputLength = 0; + + BytesRef pendingOutput = fst.outputs.getNoOutput(); + fst.getFirstArc(scratchArc); + + // nocommit maybe not safe...: + // nocommit make sure we test all single output syns + assert scratchArc.output == fst.outputs.getNoOutput(); + //pendingOutput = fst.outputs.add(scratchArc, scratchArc.output); + + int tokenCount = 0; + + boolean firstIncr = true; + + // nocommit -- make sure the "no matches" case doesn't + // captureState! + + // nocommit -- maybe even in the single input token + // matches, don't captureState? 
+ + byToken: + while(true) { + + // Pull next token's chars: + final char[] buffer; + final int bufferLen; + + if (upto == rollUptoLimit) { + // We used up our lookahead + if (!input.incrementToken()) { + end = true; + break; + } else { + if (!firstIncr) { + // Must now capture input state: + final PendingInput pendingInput = futureInputs[rollUpto]; + System.out.println(" captureState slot=" + rollUpto); + rollUpto = rollIncr(rollUpto); + pendingInput.state = captureState(); + pendingInput.term.copy(termAtt.buffer(), 0, termAtt.length()); + } + firstIncr = false; + + buffer = termAtt.buffer(); + bufferLen = termAtt.length(); + } + } else { + // Still in our lookahead + buffer = futureInputs[upto].term.chars; + bufferLen = futureInputs[upto].term.length; + upto = rollIncr(upto); + } + tokenCount++; + + System.out.println(" token=" + new String(buffer, 0, bufferLen)); + + // Run each char in this token through the FST: + for(int bufUpto=0;bufUpto>> 1; + for(int outputIDX=0;outputIDX 0) { + futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen); + } + lastStart = 1+chIDX; + futureInputs[outputUpto].keepOrig |= keepOrig; + outputUpto = rollIncr(outputUpto); + } + } + } + } + + // ++ mod maxHorizontalContext + private int rollIncr(int count) { + count++; + if (count == synonyms.maxHorizontalContext) { + return 0; + } else { + return count; + } + } + + // adds mod maxHorizontalContext + private int rollAdd(int x, int y) { + return (x+y) % synonyms.maxHorizontalContext; + } + + @Override + public boolean incrementToken() throws IOException { + + int pendingPosIncr = 0; + System.out.println("S: incr inputMatchCount=" + inputMatchCount); + + while(true) { + + // First play back any buffered future inputs/outputs. 
+ while (inputMatchCount != 0) { + + // At each position, we first output the original + // token + + final PendingInput input = futureInputs[rollUpto]; + assert input.state != null; + + if (input.keepOrig) { + restoreState(input.state); + posIncrAtt.setPositionIncrement(1+pendingPosIncr); + input.reset(); + return true; + } else { + final PendingOutputs outputs = futureOutputs[rollUpto]; + if (outputs.upto < outputs.count) { + // Still have pending outputs to replay at this + // position + final CharsRef output = outputs.outputs[outputs.upto++]; + if (outputs.upto == outputs.count) { + outputs.reset(); + } + clearAttributes(); + // nocommit what other token state to apply? type? + posIncrAtt.setPositionIncrement(pendingPosIncr); + termAtt.copyBuffer(output.chars, output.offset, output.length); + return true; + } else { + inputPendingCount--; + /* NOTE(review): plain increment; the advance of rollUpto in parse() goes through rollIncr, which wraps at synonyms.maxHorizontalContext - confirm that no wrap is needed here. */ rollUpto++; + pendingPosIncr++; + + inputMatchCount--; + } + } + } + + // Find new synonym matches: + parse(); + + if (inputMatchCount == 0) { + return !end; + } + } + } + + @Override + public void reset() throws IOException { + /* NOTE(review): only the end flag is cleared here; inputMatchCount, inputPendingCount, rollUpto and both rolling buffers keep their values - confirm that is sufficient for reuse. */ super.reset(); + end = false; + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMap.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMap.java Fri Jun 24 06:46:54 2011 -0400 @@ -0,0 +1,45 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.fst.FST; + +/** + * @lucene.experimental + */ +public class FSTSynonymMap { + /** for multiword support, you must separate words with this separator */ + public static final char WORD_SEPARATOR = 0; + /** map> */ + public final FST fst; + /** map */ + public final BytesRefHash words; + /** maxHorizontalContext: maximum context we need on the tokenstream */ + public final int maxHorizontalContext; + /** maxVerticalContext: maximum number of synonym entries for a single input */ + public final int maxVerticalContext; + + public FSTSynonymMap(FST fst, BytesRefHash words, int maxHorizontalContext, int maxVerticalContext) { + this.fst = fst; + this.words = words; + this.maxHorizontalContext = maxHorizontalContext; + this.maxVerticalContext = maxVerticalContext; + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymMapBuilder.java Fri Jun 24 06:46:54 2011 -0400 @@ -0,0 +1,168 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; + +/** + * Builds an FSTSynonymMap. + *

+ * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap + * @lucene.experimental + */ +public class FSTSynonymMapBuilder { + private final TreeMap workingSet = new TreeMap(CharsRef.getUTF16SortedAsUTF8Comparator()); + private final BytesRefHash words = new BytesRefHash(); + private final BytesRef utf8Scratch = new BytesRef(8); + private int maxHorizontalContext; + private int maxVerticalContext; + + private static class MapEntry { + boolean includeOrig; + // we could sort for better sharing ultimately, but it could confuse people + ArrayList ords = new ArrayList(); + } + + /** Sugar: just joins the provided terms with {@link + * FSTSynonymMap#WORD_SEPARATOR}. reuse and its chars + * must not be null. */ + public static CharsRef join(String[] words, CharsRef reuse) { + int upto = 0; + char[] buffer = reuse.chars; + for(String word : words) { + if (upto > 0) { + if (upto >= buffer.length) { + reuse.grow(upto); + buffer = reuse.chars; + } + buffer[upto++] = FSTSynonymMap.WORD_SEPARATOR; + } + + final int wordLen = word.length(); + final int needed = upto + wordLen; + if (needed > buffer.length) { + reuse.grow(needed); + buffer = reuse.chars; + } + + word.getChars(0, wordLen, buffer, upto); + upto += wordLen; + } + + return reuse; + } + + /** + * Add a phrase->phrase synonym mapping. 
+ * Phrases are character sequences where words are separated with character zero (\u0000) + * + * @param input input phrase + * @param numInputWords number of input words in the input phrase + * @param output output phrase + * @param includeOrig true if the original should be included + */ + public void add(CharsRef input, int numInputWords, CharsRef output, boolean includeOrig) { + // first convert to UTF-8 + final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch); + // lookup in hash + int ord = words.add(utf8Scratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + } + + MapEntry e = workingSet.get(input); + if (e == null) { + e = new MapEntry(); + System.out.println("INPUT=" + input); + workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map + } + + e.ords.add(ord); + e.includeOrig |= includeOrig; + maxVerticalContext = Math.max(maxVerticalContext, e.ords.size()); + // nocommit must also max in the numOutputWords here too: + maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords); + } + + /** + * Helper for {@link #add(CharsRef, int, CharsRef, boolean)}, except it counts + * the words in the input phrase for you. + *

+ * Chances are your parser is/can likely count this itself so it should just + * use the other method if so. + */ + public void add(CharsRef input, CharsRef output, boolean includeOrig) { + int numInputWords = 1; + for (int i = 0; i < input.length; i++) { + if (input.charAt(i) == FSTSynonymMap.WORD_SEPARATOR) { + numInputWords++; + } + } + add(input, numInputWords, output, includeOrig); + } + + /** + * Builds an {@link FSTSynonymMap} and returns it. + */ + public FSTSynonymMap build() throws IOException { + ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + // TODO: are we using the best sharing options? + Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); + + BytesRef scratch = new BytesRef(64); + ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); + + for (Map.Entry e : workingSet.entrySet()) { + CharsRef input = e.getKey(); + MapEntry output = e.getValue(); + + int numEntries = output.ords.size(); + + // output size, assume the worst case + int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry + + scratch.grow(estimatedSize); + scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length); + + // now write our output data: + scratchOutput.writeVInt(numEntries << 1 | (output.includeOrig ? 
0 : 1)); + for (int i = 0; i < numEntries; i++) { + scratchOutput.writeVInt(output.ords.get(i)); + } + + scratch.length = scratchOutput.getPosition() - scratch.offset; + builder.add(input, scratch); + } + + FST fst = builder.finish(); + return new FSTSynonymMap(fst, words, maxHorizontalContext, maxVerticalContext); + } +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFSTSynonymMapFilter.java Fri Jun 24 06:46:54 2011 -0400 @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.synonym; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.util.CharsRef; + +public class TestFSTSynonymMapFilter extends BaseTokenStreamTestCase { + + public void testBasic() throws Exception { + final FSTSynonymMapBuilder b = new FSTSynonymMapBuilder(); + b.add(new CharsRef("a"), 1, new CharsRef("foo"), true); + + Reader r = new StringReader("x a b"); + TokenStream tokens = new MockTokenizer(r, + MockTokenizer.WHITESPACE, + true); + tokens = new FSTSynonymFilter(tokens, + b.build(), + true); + tokens.reset(); + + final CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class); + final PositionIncrementAttribute posIncrAtt = tokens.addAttribute(PositionIncrementAttribute.class); + while(tokens.incrementToken()) { + System.out.println("TOKEN: " + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement()); + } + System.out.println("DONE"); + } +} \ No newline at end of file