Index: modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (revision 1067173)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (working copy)
@@ -26,7 +26,6 @@
import java.util.NoSuchElementException;
import java.util.Set;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
@@ -38,6 +37,8 @@
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeSource;
/**
@@ -110,13 +111,16 @@
*
*
The filter also has basic support for calculating weights for the shingles
* based on the weights of the tokens from the input stream, output shingle size, etc.
- * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
+ * See {@link #calculateShingleWeight(AttributeSource, java.util.List, int, java.util.List, java.util.List)}.
*
* NOTE: This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
* the ones located in org.apache.lucene.analysis.tokenattributes.
*/
public final class ShingleMatrixFilter extends TokenStream {
+ // final and unchangeable, used only as "marker state"
+ private static final AttributeSource REQUEST_NEXT_TOKEN = new AttributeSource();
+
public static Character defaultSpacerCharacter = Character.valueOf('_');
public static TokenSettingsCodec defaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec();
public static boolean ignoringSinglePrefixOrSuffixShingleByDefault = false;
@@ -128,41 +132,41 @@
public static abstract class TokenSettingsCodec {
/**
- * Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
+ * Retrieves information on how a {@link AttributeSource} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
* @param token
* @return {@link ShingleMatrixFilter.TokenPositioner}
* @throws IOException
*/
- public abstract TokenPositioner getTokenPositioner(Token token) throws IOException;
+ public abstract TokenPositioner getTokenPositioner(AttributeSource token) throws IOException;
/**
- * Sets information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
+ * Sets information on how a {@link AttributeSource} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
*
* @param token
* @param tokenPositioner
*/
- public abstract void setTokenPositioner(Token token, ShingleMatrixFilter.TokenPositioner tokenPositioner);
+ public abstract void setTokenPositioner(AttributeSource token, ShingleMatrixFilter.TokenPositioner tokenPositioner);
/**
* Have this method return 1f in order to 'disable' weights.
* @param token
* @return the weight of parameter token
*/
- public abstract float getWeight(Token token);
+ public abstract float getWeight(AttributeSource token);
/**
* Have this method do nothing in order to 'disable' weights.
* @param token
* @param weight
*/
- public abstract void setWeight(Token token, float weight);
+ public abstract void setWeight(AttributeSource token, float weight);
}
/**
- * Used to describe how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
- * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(org.apache.lucene.analysis.Token)
- * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(org.apache.lucene.analysis.Token,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner)
+ * Used to describe how a {@link AttributeSource} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
+ * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(AttributeSource)
+ * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(AttributeSource,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner)
*/
public static class TokenPositioner {
public static final TokenPositioner newColumn = new TokenPositioner(0);
@@ -191,21 +195,11 @@
private Character spacerCharacter = defaultSpacerCharacter;
- private TokenStream input;
+ private final TokenStream input;
- private CharTermAttribute termAtt;
- private PositionIncrementAttribute posIncrAtt;
- private PayloadAttribute payloadAtt;
- private OffsetAttribute offsetAtt;
- private TypeAttribute typeAtt;
- private FlagsAttribute flagsAtt;
+ private Matrix matrix;
- private CharTermAttribute in_termAtt;
- private PositionIncrementAttribute in_posIncrAtt;
- private PayloadAttribute in_payloadAtt;
- private OffsetAttribute in_offsetAtt;
- private TypeAttribute in_typeAtt;
- private FlagsAttribute in_flagsAtt;
+ private AttributeSource reusableToken;
/**
@@ -222,6 +216,7 @@
* @param settingsCodec codec used to read input token weight and matrix positioning.
*/
public ShingleMatrixFilter(Matrix matrix, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) {
+ super();
this.matrix = matrix;
this.minimumShingleSize = minimumShingleSize;
this.maximumShingleSize = maximumShingleSize;
@@ -229,22 +224,18 @@
this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
this.settingsCodec = settingsCodec;
- termAtt = addAttribute(CharTermAttribute.class);
- posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- payloadAtt = addAttribute(PayloadAttribute.class);
- offsetAtt = addAttribute(OffsetAttribute.class);
- typeAtt = addAttribute(TypeAttribute.class);
- flagsAtt = addAttribute(FlagsAttribute.class);
-
// set the input to be an empty token stream, we already have the data.
this.input = new EmptyTokenStream();
- in_termAtt = input.addAttribute(CharTermAttribute.class);
- in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
- in_payloadAtt = input.addAttribute(PayloadAttribute.class);
- in_offsetAtt = input.addAttribute(OffsetAttribute.class);
- in_typeAtt = input.addAttribute(TypeAttribute.class);
- in_flagsAtt = input.addAttribute(FlagsAttribute.class);
+ addAttribute(CharTermAttribute.class);
+ addAttribute(PositionIncrementAttribute.class);
+ addAttribute(PayloadAttribute.class);
+ addAttribute(OffsetAttribute.class);
+ addAttribute(TypeAttribute.class);
+ addAttribute(FlagsAttribute.class);
+
+ // create the reusable scratch token up front as a clone of this stream's attributes
+ reusableToken = cloneAttributes();
}
/**
@@ -305,25 +296,26 @@
* @param settingsCodec codec used to read input token weight and matrix positioning.
*/
public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) {
+ super(input.getAttributeFactory());
this.input = input;
this.minimumShingleSize = minimumShingleSize;
this.maximumShingleSize = maximumShingleSize;
this.spacerCharacter = spacerCharacter;
this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
this.settingsCodec = settingsCodec;
- termAtt = addAttribute(CharTermAttribute.class);
- posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- payloadAtt = addAttribute(PayloadAttribute.class);
- offsetAtt = addAttribute(OffsetAttribute.class);
- typeAtt = addAttribute(TypeAttribute.class);
- flagsAtt = addAttribute(FlagsAttribute.class);
- in_termAtt = input.addAttribute(CharTermAttribute.class);
- in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
- in_payloadAtt = input.addAttribute(PayloadAttribute.class);
- in_offsetAtt = input.addAttribute(OffsetAttribute.class);
- in_typeAtt = input.addAttribute(TypeAttribute.class);
- in_flagsAtt = input.addAttribute(FlagsAttribute.class);
+ input.addAttribute(CharTermAttribute.class);
+ input.addAttribute(PositionIncrementAttribute.class);
+ input.addAttribute(PayloadAttribute.class);
+ input.addAttribute(OffsetAttribute.class);
+ input.addAttribute(TypeAttribute.class);
+ input.addAttribute(FlagsAttribute.class);
+
+ Iterator<Class<? extends Attribute>> atts = input.getAttributeClassesIterator();
+ while (atts.hasNext()) // make sure all att impls in the token exist here
+ addAttribute(atts.next());
+ // create the reusable scratch token up front as a clone of this stream's attributes
+ reusableToken = cloneAttributes();
}
// internal filter instance variables
@@ -332,7 +324,7 @@
private Iterator<Matrix.Column.Row[]> permutations;
/** the current permutation of tokens used to produce shingles */
- private List<Token> currentPermuationTokens;
+ private List<AttributeSource> currentPermuationTokens;
/** index to what row a token in currentShingleTokens represents*/
private List<Matrix.Column.Row> currentPermutationRows;
@@ -343,7 +335,7 @@
* a set containing shingles that has been the result of a call to {@link #incrementToken()},
* used to avoid producing the same shingle more than once.
*/
- private Set<List<Token>> shinglesSeen = new HashSet<List<Token>>();
+ private Set<List<AttributeSource>> shinglesSeen = new HashSet<List<AttributeSource>>();
@Override
@@ -353,10 +345,6 @@
input.reset();
}
- private Matrix matrix;
-
- private Token reusableToken = new Token();
-
@Override
public final boolean incrementToken() throws IOException {
if (matrix == null) {
@@ -370,56 +358,32 @@
// this loop exists in order to avoid recursive calls to the next method
// as the complexity of a large matrix
// then would require a multi gigabyte sized stack.
- Token token;
+ AttributeSource token;
do {
token = produceNextToken(reusableToken);
- } while (token == request_next_token);
+ } while (token == REQUEST_NEXT_TOKEN);
if (token == null) return false;
clearAttributes();
- termAtt.copyBuffer(token.buffer(), 0, token.length());
- posIncrAtt.setPositionIncrement(token.getPositionIncrement());
- flagsAtt.setFlags(token.getFlags());
- offsetAtt.setOffset(token.startOffset(), token.endOffset());
- typeAtt.setType(token.type());
- payloadAtt.setPayload(token.getPayload());
+ token.copyTo(this);
return true;
}
- private Token getNextInputToken(Token token) throws IOException {
+ private AttributeSource getNextInputToken() throws IOException {
if (!input.incrementToken()) return null;
- token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length());
- token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
- token.setFlags(in_flagsAtt.getFlags());
- token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
- token.setType(in_typeAtt.type());
- token.setPayload(in_payloadAtt.getPayload());
- return token;
+ return input.cloneAttributes();
}
- private Token getNextToken(Token token) throws IOException {
- if (!this.incrementToken()) return null;
- token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
- token.setPositionIncrement(posIncrAtt.getPositionIncrement());
- token.setFlags(flagsAtt.getFlags());
- token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
- token.setType(typeAtt.type());
- token.setPayload(payloadAtt.getPayload());
- return token;
- }
-
- private static final Token request_next_token = new Token();
-
/**
* This method exists in order to avoid recursive calls to the method
* as the complexity of a fairly small matrix then easily would require
* a gigabyte sized stack per thread.
*
* @param reusableToken
- * @return null if exhausted, instance request_next_token if one more call is required for an answer, or instance parameter resuableToken.
+ * @return null if exhausted, instance REQUEST_NEXT_TOKEN if one more call is required for an answer, or instance parameter reusableToken.
* @throws IOException
*/
- private Token produceNextToken(final Token reusableToken) throws IOException {
+ private AttributeSource produceNextToken(final AttributeSource reusableToken) throws IOException {
if (currentPermuationTokens != null) {
currentShingleLength++;
@@ -432,16 +396,18 @@
if (ignoringSinglePrefixOrSuffixShingle
&& currentShingleLength == 1
&& ((currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || (currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) {
- return getNextToken(reusableToken);
+ if (!this.incrementToken()) return null;
+ this.copyTo(reusableToken);
+ return reusableToken;
}
int termLength = 0;
- List<Token> shingle = new ArrayList<Token>(currentShingleLength);
+ List<AttributeSource> shingle = new ArrayList<AttributeSource>(currentShingleLength);
for (int i = 0; i < currentShingleLength; i++) {
- Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
- termLength += shingleToken.length();
+ AttributeSource shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
+ termLength += shingleToken.getAttribute(CharTermAttribute.class).length();
shingle.add(shingleToken);
}
if (spacerCharacter != null) {
@@ -450,18 +416,19 @@
// only produce shingles that not already has been created
if (!shinglesSeen.add(shingle)) {
- return request_next_token;
+ return REQUEST_NEXT_TOKEN;
}
// shingle token factory
StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
- for (Token shingleToken : shingle) {
+ for (AttributeSource shingleToken : shingle) {
if (spacerCharacter != null && sb.length() > 0) {
sb.append(spacerCharacter);
}
- sb.append(shingleToken.buffer(), 0, shingleToken.length());
+ CharTermAttribute cta = shingleToken.getAttribute(CharTermAttribute.class);
+ sb.append(cta);
}
- reusableToken.setEmpty().append(sb);
+ reusableToken.getAttribute(CharTermAttribute.class).setEmpty().append(sb);
updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
return reusableToken;
@@ -474,7 +441,7 @@
// reset shingle size and move one step to the right in the current tokens permutation
currentPermutationTokensStartOffset++;
currentShingleLength = minimumShingleSize - 1;
- return request_next_token;
+ return REQUEST_NEXT_TOKEN;
}
@@ -498,16 +465,16 @@
Matrix.Column deletedColumn = matrix.columns.remove(0);
// remove all shingles seen that include any of the tokens from the deleted column.
- List<Token> deletedColumnTokens = new ArrayList<Token>();
+ List<AttributeSource> deletedColumnTokens = new ArrayList<AttributeSource>();
for (Matrix.Column.Row row : deletedColumn.getRows()) {
- for (Token token : row.getTokens()) {
+ for (AttributeSource token : row.getTokens()) {
deletedColumnTokens.add(token);
}
}
- for (Iterator<List<Token>> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) {
- List<Token> shingle = shinglesSeenIterator.next();
- for (Token deletedColumnToken : deletedColumnTokens) {
+ for (Iterator<List<AttributeSource>> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) {
+ List<AttributeSource> shingle = shinglesSeenIterator.next();
+ for (AttributeSource deletedColumnToken : deletedColumnTokens) {
if (shingle.contains(deletedColumnToken)) {
shinglesSeenIterator.remove();
break;
@@ -526,7 +493,7 @@
}
nextTokensPermutation();
- return request_next_token;
+ return REQUEST_NEXT_TOKEN;
}
}
@@ -541,7 +508,7 @@
nextTokensPermutation();
- return request_next_token;
+ return REQUEST_NEXT_TOKEN;
}
/**
@@ -553,9 +520,9 @@
private void nextTokensPermutation() {
Matrix.Column.Row[] rowsPermutation = permutations.next();
List<Matrix.Column.Row> currentPermutationRows = new ArrayList<Matrix.Column.Row>();
- List<Token> currentPermuationTokens = new ArrayList<Token>();
+ List<AttributeSource> currentPermuationTokens = new ArrayList<AttributeSource>();
for (Matrix.Column.Row row : rowsPermutation) {
- for (Token token : row.getTokens()) {
+ for (AttributeSource token : row.getTokens()) {
currentPermuationTokens.add(token);
currentPermutationRows.add(row);
}
@@ -579,12 +546,12 @@
* @param currentPermutationRows index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens
* @param currentPermuationTokens tokens of the current permutation of rows in the matrix.
*/
- public void updateToken(Token token, List<Token> shingle, int currentPermutationStartOffset, List<Matrix.Column.Row> currentPermutationRows, List<Token> currentPermuationTokens) {
- token.setType(ShingleMatrixFilter.class.getName());
- token.setFlags(0);
- token.setPositionIncrement(1);
- token.setStartOffset(shingle.get(0).startOffset());
- token.setEndOffset(shingle.get(shingle.size() - 1).endOffset());
+ public void updateToken(AttributeSource token, List<AttributeSource> shingle, int currentPermutationStartOffset, List<Matrix.Column.Row> currentPermutationRows, List<AttributeSource> currentPermuationTokens) {
+ token.getAttribute(TypeAttribute.class).setType(ShingleMatrixFilter.class.getName());
+ token.getAttribute(FlagsAttribute.class).setFlags(0);
+ token.getAttribute(PositionIncrementAttribute.class).setPositionIncrement(1);
+ token.getAttribute(OffsetAttribute.class).setOffset(shingle.get(0).getAttribute(OffsetAttribute.class).startOffset(),
+ shingle.get(shingle.size() - 1).getAttribute(OffsetAttribute.class).endOffset());
settingsCodec.setWeight(token, calculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens));
}
@@ -604,7 +571,7 @@
* @param currentPermuationTokens all tokens in the current row permutation of the matrix. A sub list (parameter offset, parameter shingle.size) equals parameter shingle.
* @return weight to be set for parameter shingleToken
*/
- public float calculateShingleWeight(Token shingleToken, List<Token> shingle, int currentPermutationStartOffset, List<Matrix.Column.Row> currentPermutationRows, List<Token> currentPermuationTokens) {
+ public float calculateShingleWeight(AttributeSource shingleToken, List<AttributeSource> shingle, int currentPermutationStartOffset, List<Matrix.Column.Row> currentPermutationRows, List<AttributeSource> currentPermuationTokens) {
double[] weights = new double[shingle.size()];
double total = 0f;
@@ -632,7 +599,7 @@
}
- private Token readColumnBuf;
+ private AttributeSource readColumnBuf;
/**
* Loads one column from the token stream.
@@ -644,12 +611,12 @@
*/
private boolean readColumn() throws IOException {
- Token token;
+ AttributeSource token;
if (readColumnBuf != null) {
token = readColumnBuf;
readColumnBuf = null;
} else {
- token = getNextInputToken(new Token());
+ token = getNextInputToken();
}
if (token == null) {
@@ -661,7 +628,7 @@
currentReaderRow.getTokens().add(token);
TokenPositioner tokenPositioner;
- while ((readColumnBuf = getNextInputToken(new Token())) != null
+ while ((readColumnBuf = getNextInputToken()) != null
&& (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) {
if (tokenPositioner == TokenPositioner.sameRow) {
@@ -675,7 +642,7 @@
}
if (readColumnBuf == null) {
- readColumnBuf = getNextInputToken(new Token());
+ readColumnBuf = getNextInputToken();
if (readColumnBuf == null) {
currentReaderColumn.setLast(true);
}
@@ -720,7 +687,7 @@
return Matrix.this;
}
- public Column(Token token) {
+ public Column(AttributeSource token) {
this();
Row row = new Row();
row.getTokens().add(token);
@@ -778,7 +745,7 @@
return Column.this;
}
- private List<Token> tokens = new LinkedList<Token>();
+ private List<AttributeSource> tokens = new LinkedList<AttributeSource>();
public Row() {
Column.this.rows.add(this);
@@ -788,11 +755,11 @@
return Column.this.rows.indexOf(this);
}
- public List<Token> getTokens() {
+ public List<AttributeSource> getTokens() {
return tokens;
}
- public void setTokens(List<Token> tokens) {
+ public void setTokens(List<AttributeSource> tokens) {
this.tokens = tokens;
}
@@ -928,21 +895,21 @@
public static class OneDimensionalNonWeightedTokenSettingsCodec extends TokenSettingsCodec {
@Override
- public TokenPositioner getTokenPositioner(Token token) throws IOException {
+ public TokenPositioner getTokenPositioner(AttributeSource token) throws IOException {
return TokenPositioner.newColumn;
}
@Override
- public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
+ public void setTokenPositioner(AttributeSource token, TokenPositioner tokenPositioner) {
}
@Override
- public float getWeight(Token token) {
+ public float getWeight(AttributeSource token) {
return 1f;
}
@Override
- public void setWeight(Token token, float weight) {
+ public void setWeight(AttributeSource token, float weight) {
}
}
@@ -956,8 +923,8 @@
public static class TwoDimensionalNonWeightedSynonymTokenSettingsCodec extends TokenSettingsCodec {
@Override
- public TokenPositioner getTokenPositioner(Token token) throws IOException {
- if (token.getPositionIncrement() == 0) {
+ public TokenPositioner getTokenPositioner(AttributeSource token) throws IOException {
+ if (token.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() == 0) {
return TokenPositioner.newRow;
} else {
return TokenPositioner.newColumn;
@@ -965,17 +932,17 @@
}
@Override
- public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
+ public void setTokenPositioner(AttributeSource token, TokenPositioner tokenPositioner) {
throw new UnsupportedOperationException();
}
@Override
- public float getWeight(Token token) {
+ public float getWeight(AttributeSource token) {
return 1f;
}
@Override
- public void setWeight(Token token, float weight) {
+ public void setWeight(AttributeSource token, float weight) {
}
}
@@ -997,8 +964,8 @@
* @throws IOException
*/
@Override
- public TokenPositioner getTokenPositioner(Token token) throws IOException {
- switch (token.getFlags()) {
+ public TokenPositioner getTokenPositioner(AttributeSource token) throws IOException {
+ switch (token.addAttribute(FlagsAttribute.class).getFlags()) {
case 0:
return TokenPositioner.newColumn;
case 1:
@@ -1016,8 +983,8 @@
* @param tokenPositioner
*/
@Override
- public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
- token.setFlags(tokenPositioner.getIndex());
+ public void setTokenPositioner(AttributeSource token, TokenPositioner tokenPositioner) {
+ token.addAttribute(FlagsAttribute.class).setFlags(tokenPositioner.getIndex());
}
/**
@@ -1027,11 +994,12 @@
* @return 32 bit float
*/
@Override
- public float getWeight(Token token) {
- if (token.getPayload() == null || token.getPayload().getData() == null) {
+ public float getWeight(AttributeSource token) {
+ final PayloadAttribute payloadAtt = token.getAttribute(PayloadAttribute.class);
+ if (payloadAtt.getPayload() == null || payloadAtt.getPayload().getData() == null) {
return 1f;
} else {
- return PayloadHelper.decodeFloat(token.getPayload().getData());
+ return PayloadHelper.decodeFloat(payloadAtt.getPayload().getData());
}
}
@@ -1041,11 +1009,11 @@
* @param weight
*/
@Override
- public void setWeight(Token token, float weight) {
+ public void setWeight(AttributeSource token, float weight) {
if (weight == 1f) {
- token.setPayload(null);
+ token.addAttribute(PayloadAttribute.class).setPayload(null);
} else {
- token.setPayload(new Payload(PayloadHelper.encodeFloat(weight)));
+ token.addAttribute(PayloadAttribute.class).setPayload(new Payload(PayloadHelper.encodeFloat(weight)));
}
}
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (revision 1067173)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (working copy)
@@ -20,6 +20,7 @@
import java.io.IOException;
import java.io.StringReader;
import java.util.Collection;
+import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
@@ -27,7 +28,6 @@
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
-import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
@@ -37,6 +37,8 @@
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeSource;
public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
@@ -66,11 +68,11 @@
assertFalse(ts.incrementToken());
TokenListStream tls;
- LinkedList<Token> tokens;
+ LinkedList<AttributeSource> tokens;
// test a plain old token stream with synonyms translated to rows.
- tokens = new LinkedList<Token>();
+ tokens = new LinkedList<AttributeSource>();
tokens.add(createToken("please", 0, 6));
tokens.add(createToken("divide", 7, 13));
tokens.add(createToken("this", 14, 18));
@@ -102,11 +104,11 @@
TokenStream ts;
TokenStream tls;
- LinkedList<Token> tokens;
+ LinkedList<AttributeSource> tokens;
// test a plain old token stream with synonyms tranlated to rows.
- tokens = new LinkedList<Token>();
+ tokens = new LinkedList<AttributeSource>();
tokens.add(tokenFactory("hello", 1, 0, 4));
tokens.add(tokenFactory("greetings", 0, 0, 4));
tokens.add(tokenFactory("world", 1, 5, 10));
@@ -146,7 +148,7 @@
ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
- tokens = new LinkedList<Token>();
+ tokens = new LinkedList<AttributeSource>();
tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
tokens.add(tokenFactory("greetings", 0, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn));
@@ -155,14 +157,15 @@
tls = new TokenListStream(tokens);
- ts = new PrefixAndSuffixAwareTokenFilter(new SingleTokenTokenStream(tokenFactory("^", 1, 100f, 0, 0)), tls, new SingleTokenTokenStream(tokenFactory("$", 1, 50f, 0, 0)));
+ ts = new PrefixAndSuffixAwareTokenFilter(new TokenListStream(Collections.singleton(tokenFactory("^", 1, 100f, 0, 0))),
+ tls, new TokenListStream(Collections.singleton(tokenFactory("$", 1, 50f, 0, 0))));
tls = new CachingTokenFilter(ts);
// bi-grams, position incrememnt, weight, start offset, end offset
ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false);
//
-// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
@@ -185,7 +188,7 @@
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, new Character('_'), false);
//
-// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
@@ -233,7 +236,7 @@
tls.reset();
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, new Character('_'), true);
-// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
@@ -280,14 +283,14 @@
// multi-token synonyms
//
- // Token[][][] {
+ // AttributeSource[][][] {
// {{hello}, {greetings, and, salutations},
// {{world}, {earth}, {tellus}}
// }
//
- tokens = new LinkedList<Token>();
+ tokens = new LinkedList<AttributeSource>();
tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
tokens.add(tokenFactory("greetings", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
tokens.add(tokenFactory("and", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow));
@@ -302,7 +305,7 @@
ts = new ShingleMatrixFilter(tls, 2, 3, new Character('_'), false);
-// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
@@ -365,7 +368,7 @@
TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, new Character('_'), true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec());
-// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
@@ -414,30 +417,33 @@
}
- private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
- Token token = new Token(startOffset, endOffset);
- token.setEmpty().append(text);
- token.setPositionIncrement(posIncr);
+ private AttributeSource tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
+ AttributeSource token = new AttributeSource();
+ token.addAttribute(CharTermAttribute.class).append(text);
+ token.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(posIncr);
+ token.addAttribute(OffsetAttribute.class).setOffset(startOffset, endOffset);
return token;
}
- private Token tokenFactory(String text, int posIncr) {
+ private AttributeSource tokenFactory(String text, int posIncr) {
return tokenFactory(text, posIncr, 1f, 0, 0);
}
- private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
- Token token = new Token(startOffset, endOffset);
- token.setEmpty().append(text);
- token.setPositionIncrement(posIncr);
+ private AttributeSource tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
+ AttributeSource token = new AttributeSource();
+ token.addAttribute(CharTermAttribute.class).append(text);
+ token.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(posIncr);
+ token.addAttribute(OffsetAttribute.class).setOffset(startOffset, endOffset);
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
return token;
}
- private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
- Token token = new Token(startOffset, endOffset);
- token.setEmpty().append(text);
- token.setPositionIncrement(posIncr);
+ private AttributeSource tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
+ AttributeSource token = new AttributeSource();
+ token.addAttribute(CharTermAttribute.class).append(text);
+ token.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(posIncr);
+ token.addAttribute(OffsetAttribute.class).setOffset(startOffset, endOffset);
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
return token;
@@ -476,29 +482,30 @@
assertEquals(endOffset, offsetAtt.endOffset());
}
- private static Token createToken(String term, int start, int offset)
+ private static AttributeSource createToken(String term, int start, int offset)
{
- Token token = new Token(start, offset);
- token.setEmpty().append(term);
+ AttributeSource token = new AttributeSource();
+ token.addAttribute(CharTermAttribute.class).append(term);
+ token.addAttribute(OffsetAttribute.class).setOffset(start, offset);
return token;
}
public final static class TokenListStream extends TokenStream {
- private Collection<Token> tokens;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+ private Collection<AttributeSource> tokens;
- public TokenListStream(Collection<Token> tokens) {
+ public TokenListStream(Collection<AttributeSource> tokens) {
this.tokens = tokens;
- }
+ addAttribute(CharTermAttribute.class);
+ addAttribute(PositionIncrementAttribute.class);
+ addAttribute(PayloadAttribute.class);
+ addAttribute(OffsetAttribute.class);
+ addAttribute(TypeAttribute.class);
+ addAttribute(FlagsAttribute.class);
+ }
- private Iterator<Token> iterator;
+ private Iterator<AttributeSource> iterator;
@Override
public boolean incrementToken() throws IOException {
@@ -508,14 +515,12 @@
if (!iterator.hasNext()) {
return false;
}
- Token prototype = iterator.next();
+ AttributeSource prototype = iterator.next();
clearAttributes();
- termAtt.copyBuffer(prototype.buffer(), 0, prototype.length());
- posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
- flagsAtt.setFlags(prototype.getFlags());
- offsetAtt.setOffset(prototype.startOffset(), prototype.endOffset());
- typeAtt.setType(prototype.type());
- payloadAtt.setPayload(prototype.getPayload());
+ Iterator<Class<? extends Attribute>> atts = prototype.getAttributeClassesIterator();
+ while (atts.hasNext()) // make sure all att impls in the token exist here
+ addAttribute(atts.next());
+ prototype.copyTo(this);
return true;
}