From 8acf1310cdac21654e06449ae7cc1e777c113ba3 Mon Sep 17 00:00:00 2001 From: Scott Smerchek Date: Fri, 25 Jan 2013 14:54:47 -0600 Subject: [PATCH] Added the PayloadAttribute to the generated tokens of the WordDelimiterFilter. This appears to be a regression from the fix in SOLR-532, which was a bug in 1.4. --- .../miscellaneous/WordDelimiterFilter.java | 7 +++++ .../miscellaneous/TestWordDelimiterFilter.java | 30 ++++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index 38dba6e..bba18a2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -23,8 +23,10 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RamUsageEstimator; import java.io.IOException; @@ -144,6 +146,7 @@ public final class WordDelimiterFilter extends TokenFilter { private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); + private final PayloadAttribute payloadAttribute = addAttribute(PayloadAttribute.class); // used for iterating word delimiter 
breaks private final WordDelimiterIterator iterator; @@ -162,6 +165,7 @@ public final class WordDelimiterFilter extends TokenFilter { private char savedBuffer[] = new char[1024]; private int savedStartOffset; private int savedEndOffset; + private BytesRef savedPayload; private String savedType; private boolean hasSavedState = false; // if length by start + end offsets doesn't match the term text then assume @@ -337,6 +341,7 @@ public final class WordDelimiterFilter extends TokenFilter { // if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets. hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length()); savedType = typeAttribute.type(); + savedPayload = payloadAttribute.getPayload(); if (savedBuffer.length < termAttribute.length()) { savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)]; @@ -422,6 +427,7 @@ public final class WordDelimiterFilter extends TokenFilter { } posIncAttribute.setPositionIncrement(position(false)); typeAttribute.setType(savedType); + payloadAttribute.setPayload(savedPayload); } /** @@ -545,6 +551,7 @@ public final class WordDelimiterFilter extends TokenFilter { } posIncAttribute.setPositionIncrement(position(true)); typeAttribute.setType(savedType); + payloadAttribute.setPayload(savedPayload); accumPosInc = 0; } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java index bb8d86d..259c260 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java @@ -18,14 +18,15 @@ package org.apache.lucene.analysis.miscellaneous; import org.apache.lucene.analysis.*; -import 
org.apache.lucene.analysis.Analyzer.TokenStreamComponents; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.cz.CzechStemFilter; +import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.BytesRef; import org.junit.Test; import java.io.IOException; @@ -128,6 +129,31 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { null, null, null, null, false); } + @Test + public void testPayloadKept() throws Exception { + int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; + Token tok = new Token("(foo,bar)", 7, 16); + tok.setPayload(new BytesRef(PayloadHelper.encodeInt(1))); + WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(tok), DEFAULT_WORD_DELIM_TABLE, flags, null); + wdf.addAttribute(PayloadAttribute.class); + + while (wdf.incrementToken()) { + assertEquals(tok.getPayload(), wdf.getAttribute(PayloadAttribute.class).getPayload()); + } + } + + @Test + public void testNullPayloadKept() throws Exception { + int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; + Token tok = new Token("(foo,bar)", 7, 16); + WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(tok), DEFAULT_WORD_DELIM_TABLE, flags, null); + wdf.addAttribute(PayloadAttribute.class); + + while (wdf.incrementToken()) { + assertEquals("generated token " + wdf.getAttribute(CharTermAttribute.class).toString(), 
tok.getPayload(), wdf.getAttribute(PayloadAttribute.class).getPayload()); + } + } + public void doSplit(final String input, String... output) throws Exception { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer( -- 1.7.11.msysgit.1