Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (revision 806985) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (working copy) @@ -19,14 +19,10 @@ import java.io.IOException; +import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** * A {@link TokenStream} containing a single token. @@ -34,45 +30,36 @@ public class SingleTokenTokenStream extends TokenStream { private boolean exhausted = false; + // The token needs to be immutable, so work with clones! private Token singleToken; + private final AttributeImpl tokenAtt; - private TermAttribute termAtt; - private OffsetAttribute offsetAtt; - private FlagsAttribute flagsAtt; - private PositionIncrementAttribute posIncAtt; - private TypeAttribute typeAtt; - private PayloadAttribute payloadAtt; + private static final AttributeFactory TOKEN_ATTRIBUTE_FACTORY = new AttributeFactory() { + public AttributeImpl createAttributeInstance(Class attClass) { + return attClass.isAssignableFrom(Token.class) + ? new Token() : DEFAULT_ATTRIBUTE_FACTORY.createAttributeInstance(attClass); + } + }; public SingleTokenTokenStream(Token token) { + super(TOKEN_ATTRIBUTE_FACTORY); + assert token != null; this.singleToken = (Token) token.clone(); - - termAtt = (TermAttribute) addAttribute(TermAttribute.class); - offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); - flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); - posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); - typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); - payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + + tokenAtt = (AttributeImpl) addAttribute(TermAttribute.class); + assert (tokenAtt instanceof Token || tokenAtt.getClass().getName().equals("org.apache.lucene.analysis.TokenWrapper")); } - public final boolean incrementToken() throws IOException { if (exhausted) { return false; + } else { + singleToken.copyTo(tokenAtt); + exhausted = true; + return true; } - - Token clone = (Token) singleToken.clone(); - - clearAttributes(); - termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength()); - offsetAtt.setOffset(clone.startOffset(), clone.endOffset()); - flagsAtt.setFlags(clone.getFlags()); - typeAtt.setType(clone.type()); - posIncAtt.setPositionIncrement(clone.getPositionIncrement()); - payloadAtt.setPayload(clone.getPayload()); - exhausted = true; - return true; } /** @deprecated Will be removed in Lucene 3.0. 
This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. */ Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (revision 806985) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (working copy) @@ -21,15 +21,13 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import junit.framework.TestCase; - /** * Test the Arabic Analyzer * */ -public class TestArabicAnalyzer extends TestCase { +public class TestArabicAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the * stopwords file is missing in classpath */ @@ -74,33 +72,4 @@ assertAnalyzesTo(new ArabicAnalyzer(), "English text.", new String[] { "english", "text" }); } - - private void assertAnalyzesTo(Analyzer a, String input, String[] output) - throws Exception { - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts - .getAttribute(TermAttribute.class); - - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(output[i], termAtt.term()); - } - - assertFalse(ts.incrementToken()); - ts.close(); - } - - private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) - throws Exception { - TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts - .getAttribute(TermAttribute.class); - - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(output[i], termAtt.term()); - } - - assertFalse(ts.incrementToken()); - } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (revision 806985) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (working copy) @@ -20,15 +20,14 @@ import java.io.IOException; import java.io.StringReader; -import junit.framework.TestCase; - +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Arabic Normalization Filter * */ -public class TestArabicNormalizationFilter extends TestCase { +public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase { public void testAlifMadda() throws IOException { check("آجن", "اجن"); @@ -89,11 +88,7 @@ private void check(final String input, final String expected) throws IOException { ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream); - TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); - - assertTrue(filter.incrementToken()); - assertEquals(expected, termAtt.term()); - filter.close(); + assertTokenStreamContents(filter, new String[]{expected}); } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java =================================================================== --- 
contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (revision 806985) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (working copy) @@ -20,15 +20,14 @@ import java.io.IOException; import java.io.StringReader; -import junit.framework.TestCase; - +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Arabic Normalization Filter * */ -public class TestArabicStemFilter extends TestCase { +public class TestArabicStemFilter extends BaseTokenStreamTestCase { public void testAlPrefix() throws IOException { check("الحسن", "حسن"); @@ -117,11 +116,7 @@ private void check(final String input, final String expected) throws IOException { ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); ArabicStemFilter filter = new ArabicStemFilter(tokenStream); - TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); - - assertTrue(filter.incrementToken()); - assertEquals(expected, termAtt.term()); - filter.close(); + assertTokenStreamContents(filter, new String[]{expected}); } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (revision 806985) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (working copy) @@ -17,11 +17,7 @@ * limitations under the License. */ -import java.io.IOException; -import java.io.StringReader; - -import junit.framework.TestCase; - +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; @@ -32,9 +28,9 @@ * It is very similar to the snowball portuguese algorithm but not exactly the same. 
* */ -public class TestBrazilianStemmer extends TestCase { +public class TestBrazilianStemmer extends BaseTokenStreamTestCase { - public void testWithSnowballExamples() throws IOException { + public void testWithSnowballExamples() throws Exception { check("boa", "boa"); check("boainain", "boainain"); check("boas", "boas"); @@ -150,23 +146,13 @@ a.setStemExclusionTable(new String[] { "quintessência" }); checkReuse(a, "quintessência", "quintessência"); } - - private void check(final String input, final String expected) throws IOException { - Analyzer analyzer = new BrazilianAnalyzer(); - TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input)); - TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class); - assertTrue(stream.incrementToken()); - assertEquals(expected, text.term()); - assertFalse(stream.incrementToken()); - stream.close(); + + private void check(final String input, final String expected) throws Exception { + checkOneTerm(new BrazilianAnalyzer(), input, expected); } - private void checkReuse(Analyzer analyzer, final String input, final String expected) throws IOException { - TokenStream stream = analyzer.reusableTokenStream("dummy", new StringReader(input)); - TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class); - assertTrue(stream.incrementToken()); - assertEquals(expected, text.term()); - assertFalse(stream.incrementToken()); + private void checkReuse(Analyzer a, String input, String expected) throws Exception { + checkOneTermReuse(a, input, expected); } } \ No newline at end of file Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (revision 806985) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (working copy) @@ -20,8 +20,7 @@ import java.io.IOException; import java.io.StringReader; -import junit.framework.TestCase; - +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -29,7 +28,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -public class TestCJKTokenizer extends TestCase{ +public class TestCJKTokenizer extends BaseTokenStreamTestCase { class TestToken { String termText; Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (revision 806985) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (working copy) @@ -21,17 +21,15 @@ import java.io.Reader; import java.io.StringReader; -import junit.framework.TestCase; - +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -public class TestChineseTokenizer extends TestCase +public class TestChineseTokenizer extends BaseTokenStreamTestCase { public void 
testOtherLetterOffset() throws IOException { @@ -116,34 +114,5 @@ assertAnalyzesTo(justFilter, "This is a Test. b c d", new String[] { "This", "Test." }); } - - private void assertAnalyzesTo(Analyzer a, String input, String[] output) - throws Exception { - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts - .getAttribute(TermAttribute.class); - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(output[i], termAtt.term()); - } - - assertFalse(ts.incrementToken()); - ts.close(); - } - - private void assertAnalyzesToReuse(Analyzer a, String input, String[] output, - int startOffsets[], int endOffsets[]) - throws Exception { - TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts - .getAttribute(TermAttribute.class); - - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(output[i], termAtt.term()); - } - - assertFalse(ts.incrementToken()); - } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (revision 806985) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (working copy) @@ -31,8 +31,7 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import junit.framework.TestCase; - +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer; @@ -41,7 +40,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -public class TestCompoundWordTokenFilter extends TestCase { +public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { private static String[] locations = { "http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", "http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", @@ -76,7 +75,7 @@ dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); - assertFiltersTo(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind", + assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind", "fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere", "abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] { 29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0, @@ -101,7 +100,7 @@ "Rindfleischüberwachungsgesetz")), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true); - assertFiltersTo(tf, new String[] { "Rindfleischüberwachungsgesetz", + assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] { 0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0, 0, 0, 0 }); @@ -118,7 +117,7 @@ "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")), dict); - 
assertFiltersTo(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor", + assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor", "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr", "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr", "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas", @@ -147,7 +146,7 @@ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true); - assertFiltersTo(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas", + assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas", "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8, 14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0, 0, 0 }); @@ -185,22 +184,6 @@ assertEquals("Rindfleischüberwachungsgesetz", termAtt.term()); } - private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset, - int[] endOffset, int[] posIncr) throws Exception { - TermAttribute termAtt = (TermAttribute) tf.getAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) tf.getAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tf.getAttribute(PositionIncrementAttribute.class); - - for (int i = 0; i < s.length; ++i) { - assertTrue(tf.incrementToken()); - assertEquals(s[i], termAtt.term()); - assertEquals(startOffset[i], offsetAtt.startOffset()); - assertEquals(endOffset[i], offsetAtt.endOffset()); - assertEquals(posIncr[i], posIncAtt.getPositionIncrement()); - } - assertFalse(tf.incrementToken()); - } - private void getHyphenationPatternFileContents() { if (patternsFileContent == null) { try { Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (revision 806985) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (working copy) @@ -21,13 +21,10 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.StringReader; -import junit.framework.TestCase; - +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the CzechAnalyzer @@ -35,7 +32,7 @@ * CzechAnalyzer is like a StandardAnalyzer with a custom stopword list. 
* */ -public class TestCzechAnalyzer extends TestCase { +public class TestCzechAnalyzer extends BaseTokenStreamTestCase { File dataDir = new File(System.getProperty("dataDir", "./bin")); File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt"); @@ -85,24 +82,4 @@ assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" }); } - private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class); - for (int i=0; i", "", "", "", "", "" }); } */ - - public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[]) - throws Exception { - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class); - TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class); - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(termAtt.term(), output[i]); - if (startOffsets != null) - assertEquals(offsetAtt.startOffset(), startOffsets[i]); - if (endOffsets != null) - assertEquals(offsetAtt.endOffset(), endOffsets[i]); - if (types != null) - assertEquals(typeAtt.type(), types[i]); - } - assertFalse(ts.incrementToken()); - ts.close(); - } - - public void assertAnalyzesToReuse(Analyzer a, String input, String[] output) - throws Exception { - - TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts - .addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) ts - .addAttribute(OffsetAttribute.class); - TypeAttribute typeAtt = (TypeAttribute) ts - .addAttribute(TypeAttribute.class); - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(termAtt.term(), output[i]); - } - assertFalse(ts.incrementToken()); - } - - public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { - assertAnalyzesTo(a, input, output, null, null, null); - } - - public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception { - assertAnalyzesTo(a, input, output, null, null, types); - } - - public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception { - assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null); - } - public void testAnalyzer() throws Exception { ThaiAnalyzer analyzer = new ThaiAnalyzer(); Index: contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java =================================================================== --- contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (revision 806985) +++ contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (working copy) @@ -20,20 +20,13 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.Reader; -import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.util.Date; -import junit.framework.TestCase; - +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; -import 
org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -public class TestSmartChineseAnalyzer extends TestCase { +public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase { public void testChineseStopWordsDefault() throws Exception { Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */ @@ -77,20 +70,11 @@ assertAnalyzesTo(ca, sentence, result); } - public void testChineseAnalyzer() throws IOException { - Token nt = new Token(); + public void testChineseAnalyzer() throws Exception { Analyzer ca = new SmartChineseAnalyzer(true); - Reader sentence = new StringReader("我购买了道具和服装。"); + String sentence = "我购买了道具和服装。"; String[] result = { "我", "购买", "了", "道具", "和", "服装" }; - TokenStream ts = ca.tokenStream("sentence", sentence); - int i = 0; - nt = ts.next(nt); - while (nt != null) { - assertEquals(result[i], nt.term()); - i++; - nt = ts.next(nt); - } - ts.close(); + assertAnalyzesTo(ca, sentence, result); } /* @@ -165,90 +149,4 @@ new int[] { 0, 1, 3, 4, 6, 7 }, new int[] { 1, 3, 4, 6, 7, 9 }); } - - public void assertAnalyzesToReuse(Analyzer a, String input, String[] output, - int startOffsets[], int endOffsets[]) throws Exception { - - TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(termAtt.term(), output[i]); - assertEquals(offsetAtt.startOffset(), startOffsets[i]); - assertEquals(offsetAtt.endOffset(), endOffsets[i]); - } - assertFalse(ts.incrementToken()); - } - - public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[]) - throws Exception { - - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); - TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(termAtt.term(), output[i]); - if (startOffsets != null) - assertEquals(offsetAtt.startOffset(), startOffsets[i]); - if (endOffsets != null) - assertEquals(offsetAtt.endOffset(), endOffsets[i]); - if (types != null) - assertEquals(typeAtt.type(), types[i]); - } - assertFalse(ts.incrementToken()); - ts.close(); - } - -public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { - assertAnalyzesTo(a, input, output, null, null, null); } - -public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception { - assertAnalyzesTo(a, input, output, null, null, types); -} - -public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception { - assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null); -} - - - /** - * @param args - * @throws IOException - */ - public static void main(String[] args) throws IOException { - new TestSmartChineseAnalyzer().sampleMethod(); - } - - /** - * @throws UnsupportedEncodingException - * 
@throws FileNotFoundException - * @throws IOException - */ - private void sampleMethod() throws UnsupportedEncodingException, - FileNotFoundException, IOException { - Token nt = new Token(); - Analyzer ca = new SmartChineseAnalyzer(true); - Reader sentence = new StringReader( - "我从小就不由自主地认为自己长大以后一定得成为一个象我父亲一样的画家, 可能是父母潜移默化的影响。其实我根本不知道作为画家意味着什么,我是否喜欢,最重要的是否适合我,我是否有这个才华。其实人到中年的我还是不确定我最喜欢什么,最想做的是什么?我相信很多人和我一样有同样的烦恼。毕竟不是每个人都能成为作文里的宇航员,科学家和大教授。知道自己适合做什么,喜欢做什么,能做好什么其实是个非常困难的问题。" - + "幸运的是,我想我的孩子不会为这个太过烦恼。通过老大,我慢慢发现美国高中的一个重要功能就是帮助学生分析他们的专长和兴趣,从而帮助他们选择大学的专业和未来的职业。我觉得帮助一个未成形的孩子找到她未来成长的方向是个非常重要的过程。" - + "美国高中都有专门的职业顾问,通过接触不同的课程,和各种心理,个性,兴趣很多方面的问答来帮助每个学生找到最感兴趣的专业。这样的教育一般是要到高年级才开始, 可老大因为今年上计算机的课程就是研究一个职业走向的软件项目,所以她提前做了这些考试和面试。看来以后这样的教育会慢慢由电脑来测试了。老大带回家了一些试卷,我挑出一些给大家看看。这门课她花了2个多月才做完,这里只是很小的一部分。" - + "在测试里有这样的一些问题:" - + "你是个喜欢动手的人吗? 你喜欢修东西吗?你喜欢体育运动吗?你喜欢在室外工作吗?你是个喜欢思考的人吗?你喜欢数学和科学课吗?你喜欢一个人工作吗?你对自己的智力自信吗?你的创造能力很强吗?你喜欢艺术,音乐和戏剧吗? 你喜欢自由自在的工作环境吗?你喜欢尝试新的东西吗? 你喜欢帮助别人吗?你喜欢教别人吗?你喜欢和机器和工具打交道吗?你喜欢当领导吗?你喜欢组织活动吗?你什么和数字打交道吗?"); - TokenStream ts = ca.tokenStream("sentence", sentence); - - System.out.println("start: " + (new Date())); - long before = System.currentTimeMillis(); - nt = ts.next(nt); - while (nt != null) { - System.out.println(nt.term()); - nt = ts.next(nt); - } - ts.close(); - long now = System.currentTimeMillis(); - System.out.println("time: " + (now - before) / 1000.0 + " s"); - } -} Index: contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java =================================================================== --- contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java (revision 806985) +++ contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java (working copy) @@ -28,9 +28,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.BaseTokenStreamTestCase; public class TestSynonymTokenFilter extends BaseTokenStreamTestCase { @@ -117,44 +114,4 @@ } } - public void assertAnalyzesTo(Analyzer a, String input, String[] output, - int startOffsets[], int endOffsets[], int posIncs[]) throws Exception { - - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts - .getAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) ts - .getAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts - .getAttribute(PositionIncrementAttribute.class); - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(termAtt.term(), output[i]); - assertEquals(offsetAtt.startOffset(), startOffsets[i]); - assertEquals(offsetAtt.endOffset(), endOffsets[i]); - assertEquals(posIncAtt.getPositionIncrement(), posIncs[i]); - } - assertFalse(ts.incrementToken()); - ts.close(); - } - - public void assertAnalyzesToReuse(Analyzer a, String input, String[] output, - int startOffsets[], int endOffsets[], int posIncs[]) throws Exception { - - TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts - .getAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) ts - 
.getAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts - .getAttribute(PositionIncrementAttribute.class); - for (int i = 0; i < output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(termAtt.term(), output[i]); - assertEquals(offsetAtt.startOffset(), startOffsets[i]); - assertEquals(offsetAtt.endOffset(), endOffsets[i]); - assertEquals(posIncAtt.getPositionIncrement(), posIncs[i]); - } - assertFalse(ts.incrementToken()); - } } Index: src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java =================================================================== --- src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (revision 806985) +++ src/test/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (working copy) @@ -18,7 +18,10 @@ */ import java.util.Set; +import java.io.StringReader; +import java.io.IOException; +import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.util.LuceneTestCase; /** @@ -59,12 +62,6 @@ } // @Override - protected void tearDown() throws Exception { - TokenStream.setOnlyUseNewAPI(false); - super.tearDown(); - } - - // @Override public void runBare() throws Throwable { // Do the test with onlyUseNewAPI=false (default) try { @@ -86,5 +83,127 @@ } } } + + // some helpers to test Analyzers and TokenStreams: + + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException { + assertNotNull(output); + assertTrue("has TermAttribute", ts.hasAttribute(TermAttribute.class)); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + + OffsetAttribute offsetAtt = null; + if (startOffsets != null || endOffsets != null) { + assertTrue("has OffsetAttribute", ts.hasAttribute(OffsetAttribute.class)); + offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); + } + + TypeAttribute typeAtt = null; + if (types != null) { + assertTrue("has TypeAttribute", ts.hasAttribute(TypeAttribute.class)); + typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); + } + + PositionIncrementAttribute posIncrAtt = null; + if (posIncrements != null) { + assertTrue("has PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class)); + posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class); + } + + ts.reset(); + for (int i = 0; i < output.length; i++) { + assertTrue("token "+i+" exists", ts.incrementToken()); + assertEquals("term "+i, output[i], termAtt.term()); + if (startOffsets != null) + assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset()); + if (endOffsets != null) + assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset()); + if (types != null) + assertEquals("type "+i, types[i], typeAtt.type()); + if (posIncrements != null) + assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement()); + } + assertFalse("end of stream", ts.incrementToken()); + ts.close(); + } + + public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException { + assertTokenStreamContents(ts, output, null, null, null, null); + } + + public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException { + assertTokenStreamContents(ts, output, null, null, types, null); + } + + public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws 
IOException { + assertTokenStreamContents(ts, output, null, null, null, posIncrements); + } + + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException { + assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null); + } + + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException { + assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements); + } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException { + assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements); + } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException { + assertAnalyzesTo(a, input, output, null, null, null, null); + } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException { + assertAnalyzesTo(a, input, output, null, null, types, null); + } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException { + assertAnalyzesTo(a, input, output, null, null, null, posIncrements); + } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException { + assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null); + } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException { + assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements); + } + + + public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException { + assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements); + } + + public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException { + assertAnalyzesToReuse(a, input, output, null, null, null, null); + } + + public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, String[] types) throws IOException { + assertAnalyzesToReuse(a, input, output, null, null, types, null); + } + + public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException { + assertAnalyzesToReuse(a, input, output, null, null, null, posIncrements); + } + + public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException { + assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, null); + } + + public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException { + assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, posIncrements); + } + + // simple utility method for testing stemmers + + public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException { + assertAnalyzesTo(a, input, new String[]{expected}); + } + + 
public static void checkOneTermReuse(Analyzer a, final String input, final String expected) throws IOException { + assertAnalyzesToReuse(a, input, new String[]{expected}); + } + } Index: src/test/org/apache/lucene/analysis/BaseTokenTestCase.java =================================================================== --- src/test/org/apache/lucene/analysis/BaseTokenTestCase.java (revision 806985) +++ src/test/org/apache/lucene/analysis/BaseTokenTestCase.java (working copy) @@ -1,157 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.analysis; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; - -public abstract class BaseTokenTestCase extends BaseTokenStreamTestCase { - - public static String tsToString(TokenStream in) throws IOException { - final TermAttribute termAtt = (TermAttribute) in.addAttribute(TermAttribute.class); - final StringBuffer out = new StringBuffer(); - in.reset(); - while (in.incrementToken()) { - if (out.length()>0) out.append(' '); - out.append(termAtt.term()); - } - in.close(); - return out.toString(); - } - - public void assertTokEqual(List/*<Token>*/ a, List/*<Token>*/ b) { - assertTokEq(a,b,false); - assertTokEq(b,a,false); - } - - public void assertTokEqualOff(List/*<Token>*/ a, List/*<Token>*/ b) { - assertTokEq(a,b,true); - assertTokEq(b,a,true); - } - - private void assertTokEq(List/*<Token>*/ a, List/*<Token>*/ b, boolean checkOff) { - int pos=0; - for (Iterator iter = a.iterator(); iter.hasNext();) { - Token tok = (Token)iter.next(); - pos += tok.getPositionIncrement(); - if (!tokAt(b, tok.term(), pos - , checkOff ? tok.startOffset() : -1 - , checkOff ?
tok.endOffset() : -1 - )) - { - fail(a + "!=" + b); - } - } - } - public boolean tokAt(List/*<Token>*/ lst, String val, int tokPos, int startOff, int endOff) { - int pos=0; - for (Iterator iter = lst.iterator(); iter.hasNext();) { - Token tok = (Token)iter.next(); - pos += tok.getPositionIncrement(); - if (pos==tokPos && tok.term().equals(val) - && (startOff==-1 || tok.startOffset()==startOff) - && (endOff ==-1 || tok.endOffset() ==endOff ) - ) - { - return true; - } - } - return false; - } - - - /*** - * Return a list of tokens according to a test string format: - * a b c => returns List [a,b,c] - * a/b => tokens a and b share the same spot (b.positionIncrement=0) - * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0) - * a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11 - */ - public List/*<Token>*/ tokens(String str) { - String[] arr = str.split(" "); - List/*<Token>*/ result = new ArrayList/*<Token>*/(); - for (int i=0; i<arr.length; i++) { - String[] toks = arr[i].split("/"); - String[] params = toks[0].split(","); - - int posInc; - int start; - int end; - - if (params.length > 1) { - posInc = Integer.parseInt(params[1]); - } else { - posInc = 1; - } - - if (params.length > 2) { - start = Integer.parseInt(params[2]); - } else { - start = 0; - } - - if (params.length > 3) { - end = Integer.parseInt(params[3]); - } else { - end = start + params[0].length(); - } - - Token t = new Token(params[0],start,end,"TEST"); - t.setPositionIncrement(posInc); - - result.add(t); - for (int j=1; j<toks.length; j++) { - t = new Token(toks[j],start,end,"TEST"); - t.setPositionIncrement(0); - result.add(t); - } - } - return result; - } - - public List/*<Token>*/ getTokens(TokenStream tstream) throws IOException { - List/*<Token>*/ tokens = new ArrayList/*<Token>*/(); - tstream.reset(); - while (tstream.incrementToken()) { - final Token t = new Token(); - for (Iterator it = tstream.getAttributeImplsIterator(); it.hasNext();) { - final AttributeImpl att = (AttributeImpl) it.next(); - try { - att.copyTo(t); - } catch (ClassCastException ce) { - // ignore Attributes unsupported by Token - } - } - tokens.add(t); - } - tstream.close(); - - return tokens; - } - -} Index: src/test/org/apache/lucene/analysis/TestAnalyzers.java =================================================================== --- src/test/org/apache/lucene/analysis/TestAnalyzers.java (revision 806985) +++ src/test/org/apache/lucene/analysis/TestAnalyzers.java (working copy) @@ -33,19 +33,6 @@ super(name); } - public void assertAnalyzesTo(Analyzer a, - String input, - String[] output) throws Exception { - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); - for (int i=0; i<output.length; i++) { - assertTrue(ts.incrementToken()); - assertEquals(termAtt.term(), output[i]); - } - assertFalse(ts.incrementToken()); - ts.close(); - }
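
A note on the conversion pattern, since every test file above follows it: drop the locally duplicated assertAnalyzesTo/assertAnalyzesToReuse helpers, extend BaseTokenStreamTestCase, and call the static helpers this patch adds. A minimal sketch of a converted test; the class name and the whitespace inputs are illustrative, not taken from the patch:

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class TestHelperUsage extends BaseTokenStreamTestCase {

  public void testTermsAndOffsets() throws Exception {
    // terms only
    assertAnalyzesTo(new WhitespaceAnalyzer(), "Foo Bar",
        new String[] { "Foo", "Bar" });
    // terms plus start/end offsets; passing null for an array skips that check
    assertAnalyzesTo(new WhitespaceAnalyzer(), "Foo Bar",
        new String[] { "Foo", "Bar" },
        new int[] { 0, 4 }, new int[] { 3, 7 });
  }

  public void testReuse() throws Exception {
    // same assertions, driven through Analyzer.reusableTokenStream()
    assertAnalyzesToReuse(new WhitespaceAnalyzer(), "Foo Bar",
        new String[] { "Foo", "Bar" });
  }
}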
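
The per-filter check() methods (Arabic normalization, Arabic stemming, compound words) collapse the same way onto assertTokenStreamContents, which also asserts end-of-stream and closes the stream. A sketch against a simple tokenizer/filter chain; the chain and input are illustrative:

import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class TestFilterContents extends BaseTokenStreamTestCase {

  public void testLowerCase() throws Exception {
    TokenStream ts = new LowerCaseFilter(
        new WhitespaceTokenizer(new StringReader("FOO Bar")));
    // one call replaces the hand-rolled incrementToken()/term() loop,
    // the assertFalse() at end of stream, and the explicit close()
    assertTokenStreamContents(ts, new String[] { "foo", "bar" });
  }
}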
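
For single-term stemmer tests, checkOneTerm and checkOneTermReuse reduce each check()/checkReuse() pair to one line apiece. Inside any BaseTokenStreamTestCase subclass, the Brazilian cases above become:

// via Analyzer.tokenStream()
checkOneTerm(new BrazilianAnalyzer(), "boa", "boa");

// via Analyzer.reusableTokenStream(), reusing one analyzer instance
BrazilianAnalyzer a = new BrazilianAnalyzer();
a.setStemExclusionTable(new String[] { "quintessência" });
checkOneTermReuse(a, "quintessência", "quintessência");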
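
Background on the SingleTokenTokenStream rewrite: the custom AttributeFactory returns a Token for every attribute interface that Token itself implements, so one AttributeImpl instance backs term, offset, flags, position increment, type, and payload, and incrementToken() restores all of them with a single copyTo() instead of six setter calls. A consumer-side sketch (class name and token values are hypothetical):

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class SingleTokenDemo {
  public static void main(String[] args) throws Exception {
    SingleTokenTokenStream ts =
        new SingleTokenTokenStream(new Token("only", 0, 4));
    // both casts resolve to the same backing attribute instance
    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offset = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term() + " [" + offset.startOffset()
          + "-" + offset.endOffset() + "]");
    }
    ts.close();
  }
}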