Index: modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (revision 1067173) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (working copy) @@ -26,7 +26,6 @@ import java.util.NoSuchElementException; import java.util.Set; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream; import org.apache.lucene.analysis.payloads.PayloadHelper; @@ -38,6 +37,8 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; /** @@ -110,13 +111,16 @@ * *

The filter also has basic support for calculating weights for the shingles * based on the weights of the tokens from the input stream, output shingle size, etc. - * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}. + * See {@link #calculateShingleWeight(AttributeSource, java.util.List, int, java.util.List, java.util.List)}. *

* NOTE: This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than * the ones located in org.apache.lucene.analysis.tokenattributes. */ public final class ShingleMatrixFilter extends TokenStream { + // final and unchangeable, used only as "marker state" + private static final AttributeSource REQUEST_NEXT_TOKEN = new AttributeSource(); + public static Character defaultSpacerCharacter = Character.valueOf('_'); public static TokenSettingsCodec defaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec(); public static boolean ignoringSinglePrefixOrSuffixShingleByDefault = false; @@ -128,41 +132,41 @@ public static abstract class TokenSettingsCodec { /** - * Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. + * Retrieves information on how a {@link AttributeSource} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. * @param token * @return {@link ShingleMatrixFilter.TokenPositioner} * @throws IOException */ - public abstract TokenPositioner getTokenPositioner(Token token) throws IOException; + public abstract TokenPositioner getTokenPositioner(AttributeSource token) throws IOException; /** - * Sets information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. + * Sets information on how a {@link AttributeSource} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. * * @param token * @param tokenPositioner */ - public abstract void setTokenPositioner(Token token, ShingleMatrixFilter.TokenPositioner tokenPositioner); + public abstract void setTokenPositioner(AttributeSource token, ShingleMatrixFilter.TokenPositioner tokenPositioner); /** * Have this method return 1f in order to 'disable' weights. * @param token * @return the weight of parameter token */ - public abstract float getWeight(Token token); + public abstract float getWeight(AttributeSource token); /** * Have this method do nothing in order to 'disable' weights. * @param token * @param weight */ - public abstract void setWeight(Token token, float weight); + public abstract void setWeight(AttributeSource token, float weight); } /** - * Used to describe how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. - * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(org.apache.lucene.analysis.Token) - * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(org.apache.lucene.analysis.Token,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner) + * Used to describe how a {@link AttributeSource} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. + * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(AttributeSource) + * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(AttributeSource,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner) */ public static class TokenPositioner { public static final TokenPositioner newColumn = new TokenPositioner(0); @@ -191,21 +195,11 @@ private Character spacerCharacter = defaultSpacerCharacter; - private TokenStream input; + private final TokenStream input; - private CharTermAttribute termAtt; - private PositionIncrementAttribute posIncrAtt; - private PayloadAttribute payloadAtt; - private OffsetAttribute offsetAtt; - private TypeAttribute typeAtt; - private FlagsAttribute flagsAtt; + private Matrix matrix; - private CharTermAttribute in_termAtt; - private PositionIncrementAttribute in_posIncrAtt; - private PayloadAttribute in_payloadAtt; - private OffsetAttribute in_offsetAtt; - private TypeAttribute in_typeAtt; - private FlagsAttribute in_flagsAtt; + private AttributeSource reusableToken; /** @@ -222,6 +216,7 @@ * @param settingsCodec codec used to read input token weight and matrix positioning. */ public ShingleMatrixFilter(Matrix matrix, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) { + super(); this.matrix = matrix; this.minimumShingleSize = minimumShingleSize; this.maximumShingleSize = maximumShingleSize; @@ -229,22 +224,18 @@ this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle; this.settingsCodec = settingsCodec; - termAtt = addAttribute(CharTermAttribute.class); - posIncrAtt = addAttribute(PositionIncrementAttribute.class); - payloadAtt = addAttribute(PayloadAttribute.class); - offsetAtt = addAttribute(OffsetAttribute.class); - typeAtt = addAttribute(TypeAttribute.class); - flagsAtt = addAttribute(FlagsAttribute.class); - // set the input to be an empty token stream, we already have the data. this.input = new EmptyTokenStream(); - in_termAtt = input.addAttribute(CharTermAttribute.class); - in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class); - in_payloadAtt = input.addAttribute(PayloadAttribute.class); - in_offsetAtt = input.addAttribute(OffsetAttribute.class); - in_typeAtt = input.addAttribute(TypeAttribute.class); - in_flagsAtt = input.addAttribute(FlagsAttribute.class); + addAttribute(CharTermAttribute.class); + addAttribute(PositionIncrementAttribute.class); + addAttribute(PayloadAttribute.class); + addAttribute(OffsetAttribute.class); + addAttribute(TypeAttribute.class); + addAttribute(FlagsAttribute.class); + + // lazy init token clones + reusableToken = cloneAttributes(); } /** @@ -305,25 +296,26 @@ * @param settingsCodec codec used to read input token weight and matrix positioning. */ public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) { + super(input.getAttributeFactory()); this.input = input; this.minimumShingleSize = minimumShingleSize; this.maximumShingleSize = maximumShingleSize; this.spacerCharacter = spacerCharacter; this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle; this.settingsCodec = settingsCodec; - termAtt = addAttribute(CharTermAttribute.class); - posIncrAtt = addAttribute(PositionIncrementAttribute.class); - payloadAtt = addAttribute(PayloadAttribute.class); - offsetAtt = addAttribute(OffsetAttribute.class); - typeAtt = addAttribute(TypeAttribute.class); - flagsAtt = addAttribute(FlagsAttribute.class); - in_termAtt = input.addAttribute(CharTermAttribute.class); - in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class); - in_payloadAtt = input.addAttribute(PayloadAttribute.class); - in_offsetAtt = input.addAttribute(OffsetAttribute.class); - in_typeAtt = input.addAttribute(TypeAttribute.class); - in_flagsAtt = input.addAttribute(FlagsAttribute.class); + input.addAttribute(CharTermAttribute.class); + input.addAttribute(PositionIncrementAttribute.class); + input.addAttribute(PayloadAttribute.class); + input.addAttribute(OffsetAttribute.class); + input.addAttribute(TypeAttribute.class); + input.addAttribute(FlagsAttribute.class); + + Iterator> atts = input.getAttributeClassesIterator(); + while (atts.hasNext()) // make sure all att impls in the token exist here + addAttribute(atts.next()); + // lazy init token clones + reusableToken = cloneAttributes(); } // internal filter instance variables @@ -332,7 +324,7 @@ private Iterator permutations; /** the current permutation of tokens used to produce shingles */ - private List currentPermuationTokens; + private List currentPermuationTokens; /** index to what row a token in currentShingleTokens represents*/ private List currentPermutationRows; @@ -343,7 +335,7 @@ * a set containing shingles that has been the result of a call to {@link #incrementToken()}, * used to avoid producing the same shingle more than once. */ - private Set> shinglesSeen = new HashSet>(); + private Set> shinglesSeen = new HashSet>(); @Override @@ -353,10 +345,6 @@ input.reset(); } - private Matrix matrix; - - private Token reusableToken = new Token(); - @Override public final boolean incrementToken() throws IOException { if (matrix == null) { @@ -370,56 +358,32 @@ // this loop exists in order to avoid recursive calls to the next method // as the complexity of a large matrix // then would require a multi gigabyte sized stack. - Token token; + AttributeSource token; do { token = produceNextToken(reusableToken); - } while (token == request_next_token); + } while (token == REQUEST_NEXT_TOKEN); if (token == null) return false; clearAttributes(); - termAtt.copyBuffer(token.buffer(), 0, token.length()); - posIncrAtt.setPositionIncrement(token.getPositionIncrement()); - flagsAtt.setFlags(token.getFlags()); - offsetAtt.setOffset(token.startOffset(), token.endOffset()); - typeAtt.setType(token.type()); - payloadAtt.setPayload(token.getPayload()); + token.copyTo(this); return true; } - private Token getNextInputToken(Token token) throws IOException { + private AttributeSource getNextInputToken() throws IOException { if (!input.incrementToken()) return null; - token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length()); - token.setPositionIncrement(in_posIncrAtt.getPositionIncrement()); - token.setFlags(in_flagsAtt.getFlags()); - token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset()); - token.setType(in_typeAtt.type()); - token.setPayload(in_payloadAtt.getPayload()); - return token; + return input.cloneAttributes(); } - private Token getNextToken(Token token) throws IOException { - if (!this.incrementToken()) return null; - token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); - token.setPositionIncrement(posIncrAtt.getPositionIncrement()); - token.setFlags(flagsAtt.getFlags()); - token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); - token.setType(typeAtt.type()); - token.setPayload(payloadAtt.getPayload()); - return token; - } - - private static final Token request_next_token = new Token(); - /** * This method exists in order to avoid recursive calls to the method * as the complexity of a fairly small matrix then easily would require * a gigabyte sized stack per thread. * * @param reusableToken - * @return null if exhausted, instance request_next_token if one more call is required for an answer, or instance parameter resuableToken. + * @return null if exhausted, instance REQUEST_NEXT_TOKEN if one more call is required for an answer, or instance parameter resuableToken. * @throws IOException */ - private Token produceNextToken(final Token reusableToken) throws IOException { + private AttributeSource produceNextToken(final AttributeSource reusableToken) throws IOException { if (currentPermuationTokens != null) { currentShingleLength++; @@ -432,16 +396,18 @@ if (ignoringSinglePrefixOrSuffixShingle && currentShingleLength == 1 && ((currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || (currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) { - return getNextToken(reusableToken); + if (!this.incrementToken()) return null; + this.copyTo(reusableToken); + return reusableToken; } int termLength = 0; - List shingle = new ArrayList(currentShingleLength); + List shingle = new ArrayList(currentShingleLength); for (int i = 0; i < currentShingleLength; i++) { - Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset); - termLength += shingleToken.length(); + AttributeSource shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset); + termLength += shingleToken.getAttribute(CharTermAttribute.class).length(); shingle.add(shingleToken); } if (spacerCharacter != null) { @@ -450,18 +416,19 @@ // only produce shingles that not already has been created if (!shinglesSeen.add(shingle)) { - return request_next_token; + return REQUEST_NEXT_TOKEN; } // shingle token factory StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. - for (Token shingleToken : shingle) { + for (AttributeSource shingleToken : shingle) { if (spacerCharacter != null && sb.length() > 0) { sb.append(spacerCharacter); } - sb.append(shingleToken.buffer(), 0, shingleToken.length()); + CharTermAttribute cta = shingleToken.getAttribute(CharTermAttribute.class); + sb.append(cta); } - reusableToken.setEmpty().append(sb); + reusableToken.getAttribute(CharTermAttribute.class).setEmpty().append(sb); updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens); return reusableToken; @@ -474,7 +441,7 @@ // reset shingle size and move one step to the right in the current tokens permutation currentPermutationTokensStartOffset++; currentShingleLength = minimumShingleSize - 1; - return request_next_token; + return REQUEST_NEXT_TOKEN; } @@ -498,16 +465,16 @@ Matrix.Column deletedColumn = matrix.columns.remove(0); // remove all shingles seen that include any of the tokens from the deleted column. - List deletedColumnTokens = new ArrayList(); + List deletedColumnTokens = new ArrayList(); for (Matrix.Column.Row row : deletedColumn.getRows()) { - for (Token token : row.getTokens()) { + for (AttributeSource token : row.getTokens()) { deletedColumnTokens.add(token); } } - for (Iterator> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { - List shingle = shinglesSeenIterator.next(); - for (Token deletedColumnToken : deletedColumnTokens) { + for (Iterator> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { + List shingle = shinglesSeenIterator.next(); + for (AttributeSource deletedColumnToken : deletedColumnTokens) { if (shingle.contains(deletedColumnToken)) { shinglesSeenIterator.remove(); break; @@ -526,7 +493,7 @@ } nextTokensPermutation(); - return request_next_token; + return REQUEST_NEXT_TOKEN; } } @@ -541,7 +508,7 @@ nextTokensPermutation(); - return request_next_token; + return REQUEST_NEXT_TOKEN; } /** @@ -553,9 +520,9 @@ private void nextTokensPermutation() { Matrix.Column.Row[] rowsPermutation = permutations.next(); List currentPermutationRows = new ArrayList(); - List currentPermuationTokens = new ArrayList(); + List currentPermuationTokens = new ArrayList(); for (Matrix.Column.Row row : rowsPermutation) { - for (Token token : row.getTokens()) { + for (AttributeSource token : row.getTokens()) { currentPermuationTokens.add(token); currentPermutationRows.add(row); } @@ -579,12 +546,12 @@ * @param currentPermutationRows index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens * @param currentPermuationTokens tokens of the current permutation of rows in the matrix. */ - public void updateToken(Token token, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { - token.setType(ShingleMatrixFilter.class.getName()); - token.setFlags(0); - token.setPositionIncrement(1); - token.setStartOffset(shingle.get(0).startOffset()); - token.setEndOffset(shingle.get(shingle.size() - 1).endOffset()); + public void updateToken(AttributeSource token, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { + token.getAttribute(TypeAttribute.class).setType(ShingleMatrixFilter.class.getName()); + token.getAttribute(FlagsAttribute.class).setFlags(0); + token.getAttribute(PositionIncrementAttribute.class).setPositionIncrement(1); + token.getAttribute(OffsetAttribute.class).setOffset(shingle.get(0).getAttribute(OffsetAttribute.class).startOffset(), + shingle.get(shingle.size() - 1).getAttribute(OffsetAttribute.class).endOffset()); settingsCodec.setWeight(token, calculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens)); } @@ -604,7 +571,7 @@ * @param currentPermuationTokens all tokens in the current row permutation of the matrix. A sub list (parameter offset, parameter shingle.size) equals parameter shingle. * @return weight to be set for parameter shingleToken */ - public float calculateShingleWeight(Token shingleToken, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { + public float calculateShingleWeight(AttributeSource shingleToken, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { double[] weights = new double[shingle.size()]; double total = 0f; @@ -632,7 +599,7 @@ } - private Token readColumnBuf; + private AttributeSource readColumnBuf; /** * Loads one column from the token stream. @@ -644,12 +611,12 @@ */ private boolean readColumn() throws IOException { - Token token; + AttributeSource token; if (readColumnBuf != null) { token = readColumnBuf; readColumnBuf = null; } else { - token = getNextInputToken(new Token()); + token = getNextInputToken(); } if (token == null) { @@ -661,7 +628,7 @@ currentReaderRow.getTokens().add(token); TokenPositioner tokenPositioner; - while ((readColumnBuf = getNextInputToken(new Token())) != null + while ((readColumnBuf = getNextInputToken()) != null && (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) { if (tokenPositioner == TokenPositioner.sameRow) { @@ -675,7 +642,7 @@ } if (readColumnBuf == null) { - readColumnBuf = getNextInputToken(new Token()); + readColumnBuf = getNextInputToken(); if (readColumnBuf == null) { currentReaderColumn.setLast(true); } @@ -720,7 +687,7 @@ return Matrix.this; } - public Column(Token token) { + public Column(AttributeSource token) { this(); Row row = new Row(); row.getTokens().add(token); @@ -778,7 +745,7 @@ return Column.this; } - private List tokens = new LinkedList(); + private List tokens = new LinkedList(); public Row() { Column.this.rows.add(this); @@ -788,11 +755,11 @@ return Column.this.rows.indexOf(this); } - public List getTokens() { + public List getTokens() { return tokens; } - public void setTokens(List tokens) { + public void setTokens(List tokens) { this.tokens = tokens; } @@ -928,21 +895,21 @@ public static class OneDimensionalNonWeightedTokenSettingsCodec extends TokenSettingsCodec { @Override - public TokenPositioner getTokenPositioner(Token token) throws IOException { + public TokenPositioner getTokenPositioner(AttributeSource token) throws IOException { return TokenPositioner.newColumn; } @Override - public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) { + public void setTokenPositioner(AttributeSource token, TokenPositioner tokenPositioner) { } @Override - public float getWeight(Token token) { + public float getWeight(AttributeSource token) { return 1f; } @Override - public void setWeight(Token token, float weight) { + public void setWeight(AttributeSource token, float weight) { } } @@ -956,8 +923,8 @@ public static class TwoDimensionalNonWeightedSynonymTokenSettingsCodec extends TokenSettingsCodec { @Override - public TokenPositioner getTokenPositioner(Token token) throws IOException { - if (token.getPositionIncrement() == 0) { + public TokenPositioner getTokenPositioner(AttributeSource token) throws IOException { + if (token.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() == 0) { return TokenPositioner.newRow; } else { return TokenPositioner.newColumn; @@ -965,17 +932,17 @@ } @Override - public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) { + public void setTokenPositioner(AttributeSource token, TokenPositioner tokenPositioner) { throw new UnsupportedOperationException(); } @Override - public float getWeight(Token token) { + public float getWeight(AttributeSource token) { return 1f; } @Override - public void setWeight(Token token, float weight) { + public void setWeight(AttributeSource token, float weight) { } } @@ -997,8 +964,8 @@ * @throws IOException */ @Override - public TokenPositioner getTokenPositioner(Token token) throws IOException { - switch (token.getFlags()) { + public TokenPositioner getTokenPositioner(AttributeSource token) throws IOException { + switch (token.addAttribute(FlagsAttribute.class).getFlags()) { case 0: return TokenPositioner.newColumn; case 1: @@ -1016,8 +983,8 @@ * @param tokenPositioner */ @Override - public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) { - token.setFlags(tokenPositioner.getIndex()); + public void setTokenPositioner(AttributeSource token, TokenPositioner tokenPositioner) { + token.addAttribute(FlagsAttribute.class).setFlags(tokenPositioner.getIndex()); } /** @@ -1027,11 +994,12 @@ * @return 32 bit float */ @Override - public float getWeight(Token token) { - if (token.getPayload() == null || token.getPayload().getData() == null) { + public float getWeight(AttributeSource token) { + final PayloadAttribute payloadAtt = token.getAttribute(PayloadAttribute.class); + if (payloadAtt.getPayload() == null || payloadAtt.getPayload().getData() == null) { return 1f; } else { - return PayloadHelper.decodeFloat(token.getPayload().getData()); + return PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()); } } @@ -1041,11 +1009,11 @@ * @param weight */ @Override - public void setWeight(Token token, float weight) { + public void setWeight(AttributeSource token, float weight) { if (weight == 1f) { - token.setPayload(null); + token.addAttribute(PayloadAttribute.class).setPayload(null); } else { - token.setPayload(new Payload(PayloadHelper.encodeFloat(weight))); + token.addAttribute(PayloadAttribute.class).setPayload(new Payload(PayloadHelper.encodeFloat(weight))); } } Index: modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (revision 1067173) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (working copy) @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.StringReader; import java.util.Collection; +import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; @@ -27,7 +28,6 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream; import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter; -import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream; import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix; import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column; @@ -37,6 +37,8 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; public class TestShingleMatrixFilter extends BaseTokenStreamTestCase { @@ -66,11 +68,11 @@ assertFalse(ts.incrementToken()); TokenListStream tls; - LinkedList tokens; + LinkedList tokens; // test a plain old token stream with synonyms translated to rows. - tokens = new LinkedList(); + tokens = new LinkedList(); tokens.add(createToken("please", 0, 6)); tokens.add(createToken("divide", 7, 13)); tokens.add(createToken("this", 14, 18)); @@ -102,11 +104,11 @@ TokenStream ts; TokenStream tls; - LinkedList tokens; + LinkedList tokens; // test a plain old token stream with synonyms tranlated to rows. - tokens = new LinkedList(); + tokens = new LinkedList(); tokens.add(tokenFactory("hello", 1, 0, 4)); tokens.add(tokenFactory("greetings", 0, 0, 4)); tokens.add(tokenFactory("world", 1, 5, 10)); @@ -146,7 +148,7 @@ ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec(); - tokens = new LinkedList(); + tokens = new LinkedList(); tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn)); tokens.add(tokenFactory("greetings", 0, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow)); tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn)); @@ -155,14 +157,15 @@ tls = new TokenListStream(tokens); - ts = new PrefixAndSuffixAwareTokenFilter(new SingleTokenTokenStream(tokenFactory("^", 1, 100f, 0, 0)), tls, new SingleTokenTokenStream(tokenFactory("$", 1, 50f, 0, 0))); + ts = new PrefixAndSuffixAwareTokenFilter(new TokenListStream(Collections.singleton(tokenFactory("^", 1, 100f, 0, 0))), + tls, new TokenListStream(Collections.singleton(tokenFactory("$", 1, 50f, 0, 0)))); tls = new CachingTokenFilter(ts); // bi-grams, position incrememnt, weight, start offset, end offset ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false); // -// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } @@ -185,7 +188,7 @@ ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, new Character('_'), false); // -// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } @@ -233,7 +236,7 @@ tls.reset(); ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, new Character('_'), true); -// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } @@ -280,14 +283,14 @@ // multi-token synonyms // - // Token[][][] { + // AttributeSource[][][] { // {{hello}, {greetings, and, salutations}, // {{world}, {earth}, {tellus}} // } // - tokens = new LinkedList(); + tokens = new LinkedList(); tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn)); tokens.add(tokenFactory("greetings", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow)); tokens.add(tokenFactory("and", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow)); @@ -302,7 +305,7 @@ ts = new ShingleMatrixFilter(tls, 2, 3, new Character('_'), false); -// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } @@ -365,7 +368,7 @@ TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, new Character('_'), true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec()); -// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// for (AttributeSource token = ts.next(new AttributeSource()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } @@ -414,30 +417,33 @@ } - private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) { - Token token = new Token(startOffset, endOffset); - token.setEmpty().append(text); - token.setPositionIncrement(posIncr); + private AttributeSource tokenFactory(String text, int posIncr, int startOffset, int endOffset) { + AttributeSource token = new AttributeSource(); + token.addAttribute(CharTermAttribute.class).append(text); + token.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(posIncr); + token.addAttribute(OffsetAttribute.class).setOffset(startOffset, endOffset); return token; } - private Token tokenFactory(String text, int posIncr) { + private AttributeSource tokenFactory(String text, int posIncr) { return tokenFactory(text, posIncr, 1f, 0, 0); } - private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) { - Token token = new Token(startOffset, endOffset); - token.setEmpty().append(text); - token.setPositionIncrement(posIncr); + private AttributeSource tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) { + AttributeSource token = new AttributeSource(); + token.addAttribute(CharTermAttribute.class).append(text); + token.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(posIncr); + token.addAttribute(OffsetAttribute.class).setOffset(startOffset, endOffset); ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight); return token; } - private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) { - Token token = new Token(startOffset, endOffset); - token.setEmpty().append(text); - token.setPositionIncrement(posIncr); + private AttributeSource tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) { + AttributeSource token = new AttributeSource(); + token.addAttribute(CharTermAttribute.class).append(text); + token.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(posIncr); + token.addAttribute(OffsetAttribute.class).setOffset(startOffset, endOffset); ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight); ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner); return token; @@ -476,29 +482,30 @@ assertEquals(endOffset, offsetAtt.endOffset()); } - private static Token createToken(String term, int start, int offset) + private static AttributeSource createToken(String term, int start, int offset) { - Token token = new Token(start, offset); - token.setEmpty().append(term); + AttributeSource token = new AttributeSource(); + token.addAttribute(CharTermAttribute.class).append(term); + token.addAttribute(OffsetAttribute.class).setOffset(start, offset); return token; } public final static class TokenListStream extends TokenStream { - private Collection tokens; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class); + private Collection tokens; - public TokenListStream(Collection tokens) { + public TokenListStream(Collection tokens) { this.tokens = tokens; - } + addAttribute(CharTermAttribute.class); + addAttribute(PositionIncrementAttribute.class); + addAttribute(PayloadAttribute.class); + addAttribute(OffsetAttribute.class); + addAttribute(TypeAttribute.class); + addAttribute(FlagsAttribute.class); + } - private Iterator iterator; + private Iterator iterator; @Override public boolean incrementToken() throws IOException { @@ -508,14 +515,12 @@ if (!iterator.hasNext()) { return false; } - Token prototype = iterator.next(); + AttributeSource prototype = iterator.next(); clearAttributes(); - termAtt.copyBuffer(prototype.buffer(), 0, prototype.length()); - posIncrAtt.setPositionIncrement(prototype.getPositionIncrement()); - flagsAtt.setFlags(prototype.getFlags()); - offsetAtt.setOffset(prototype.startOffset(), prototype.endOffset()); - typeAtt.setType(prototype.type()); - payloadAtt.setPayload(prototype.getPayload()); + Iterator> atts = prototype.getAttributeClassesIterator(); + while (atts.hasNext()) // make sure all att impls in the token exist here + addAttribute(atts.next()); + prototype.copyTo(this); return true; }