Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java	(revision 1311156)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java	(working copy)
@@ -76,7 +76,7 @@
    * MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false}).
    */
   public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
-    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false);
+    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
   }
 
   /**
@@ -93,7 +93,8 @@
   public TokenStreamComponents createComponents(String fieldName, Reader reader) {
     MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
     tokenizer.setEnableChecks(enableChecks);
-    TokenFilter filt = new MockTokenFilter(tokenizer, filter, enablePositionIncrements);
+    MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
+    filt.setEnablePositionIncrements(enablePositionIncrements);
     return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
   }
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java	(revision 1311156)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java	(working copy)
@@ -34,7 +34,9 @@
     // TODO: instead of fixed remainder... maybe a fixed
     // random seed?
     this.remainder = remainder;
-    assert remainder >= 0 && remainder < 10 : "invalid parameter";
+    if (remainder < 0 || remainder >= 10) {
+      throw new IllegalArgumentException("invalid remainder parameter (must be 0..9): " + remainder);
+    }
   }
 
   // for testing only, uses a remainder of 0
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java	(revision 1311156)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java	(working copy)
@@ -34,6 +34,9 @@
 
   public MockFixedLengthPayloadFilter(Random random, TokenStream in, int length) {
     super(in);
+    if (length < 0) {
+      throw new IllegalArgumentException("length must be >= 0");
+    }
     this.random = random;
     this.bytes = new byte[length];
     this.payload = new Payload(bytes);
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java	(revision 1311156)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java	(working copy)
@@ -55,7 +55,7 @@
       makeString("with"))));
 
   private final CharacterRunAutomaton filter;
-  private boolean enablePositionIncrements = false;
+  private boolean enablePositionIncrements = true;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -67,14 +67,16 @@
    * @param filter DFA representing the terms that should be removed.
    * @param enablePositionIncrements true if the removal should accumulate position increments.
    */
-  public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
+  public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) {
     super(input);
     this.filter = filter;
-    this.enablePositionIncrements = enablePositionIncrements;
   }
 
   @Override
   public boolean incrementToken() throws IOException {
+    // TODO: fix me when posInc=false, to work like FilteringTokenFilter in that case and not return
+    // an initial token with posInc=0 ever
+
     // return the first non-stop word found
     int skippedPositions = 0;
     while (input.incrementToken()) {
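Since the boolean moved off the MockTokenFilter constructor, callers now opt out of position increments via the setter. A minimal sketch of the new pattern (test-framework types as used in the patch above):

    // Sketch only: constructor no longer takes the boolean; the default is now true.
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
    filt.setEnablePositionIncrements(false); // opt out of the new default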
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java	(working copy)
@@ -43,16 +43,25 @@
 
   public KeywordTokenizer(Reader input, int bufferSize) {
     super(input);
+    if (bufferSize <= 0) {
+      throw new IllegalArgumentException("bufferSize must be > 0");
+    }
     termAtt.resizeBuffer(bufferSize);
   }
 
   public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
     super(source, input);
+    if (bufferSize <= 0) {
+      throw new IllegalArgumentException("bufferSize must be > 0");
+    }
     termAtt.resizeBuffer(bufferSize);
   }
 
   public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
     super(factory, input);
+    if (bufferSize <= 0) {
+      throw new IllegalArgumentException("bufferSize must be > 0");
+    }
     termAtt.resizeBuffer(bufferSize);
   }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java	(working copy)
@@ -65,6 +65,12 @@
   public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
     super(input);
+    if (bufferSize < 0) {
+      throw new IllegalArgumentException("bufferSize cannot be negative");
+    }
+    if (skip < 0) {
+      throw new IllegalArgumentException("skip cannot be negative");
+    }
     termAtt.resizeBuffer(bufferSize);
 
     this.delimiter = delimiter;
@@ -85,10 +91,11 @@
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
   private int startPosition = 0;
-  private int finalOffset = 0;
   private int skipped = 0;
   private boolean endDelimiter = false;
   private StringBuilder resultToken;
+
+  private int charsRead = 0;
 
   @Override
@@ -112,12 +119,13 @@
     while (true) {
       int c = input.read();
-      if( c < 0 ){
+      if (c >= 0) {
+        charsRead++;
+      } else {
         if( skipped > skip ) {
           length += resultToken.length();
           termAtt.setLength(length);
-          finalOffset = correctOffset(startPosition + length);
-          offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
+          offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
           if( added ){
             resultToken.setLength(0);
             resultToken.append(termAtt.buffer(), 0, length);
@@ -125,7 +133,6 @@
           return added;
         }
         else{
-          finalOffset = correctOffset(startPosition + length);
           return false;
         }
       }
@@ -168,8 +175,7 @@
     }
     length += resultToken.length();
     termAtt.setLength(length);
-    finalOffset = correctOffset(startPosition + length);
-    offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
+    offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
     resultToken.setLength(0);
     resultToken.append(termAtt.buffer(), 0, length);
     return true;
@@ -178,14 +184,15 @@
   @Override
   public final void end() {
     // set final offset
+    int finalOffset = correctOffset(charsRead);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void reset() throws IOException {
+    super.reset();
     resultToken.setLength(0);
-    finalOffset = 0;
+    charsRead = 0;
     endDelimiter = false;
     skipped = 0;
   }
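The charsRead counter changes what end() reports: the final offset now reflects every character actually consumed, not just the end of the last token. A sketch of the consumer loop where this is observed (standard TokenStream contract; the single-Reader convenience constructor is assumed):

    // Sketch: the OffsetAttribute value seen after end() is what charsRead now feeds.
    TokenStream ts = new PathHierarchyTokenizer(new StringReader("/a/b/c"));
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // consume tokens
    }
    ts.end();   // offsetAtt.endOffset() is now correctOffset(charsRead)
    ts.close();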
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java	(working copy)
@@ -77,6 +77,13 @@
   public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
     super(input);
+    if (bufferSize < 0) {
+      throw new IllegalArgumentException("bufferSize cannot be negative");
+    }
+    if (skip < 0) {
+      // nocommit: not quite right here: see line 84... if skip > numTokensFound we always get an IndexOutOfBoundsException? needs fixing!
+      throw new IllegalArgumentException("skip cannot be negative");
+    }
     termAtt.resizeBuffer(bufferSize);
     this.delimiter = delimiter;
     this.replacement = replacement;
@@ -137,7 +144,11 @@
       }
       resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
       resultToken.setLength(0);
-      endPosition = delimiterPositions.get(delimitersCount-1 - skip);
+      int idx = delimitersCount-1 - skip;
+      if (idx >= 0) {
+        // otherwise it's ok, because we will skip and return false
+        endPosition = delimiterPositions.get(idx);
+      }
       finalOffset = correctOffset(length);
       posAtt.setPositionIncrement(1);
     }
@@ -163,10 +174,11 @@
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void reset() throws IOException {
+    super.reset();
     resultToken.setLength(0);
     finalOffset = 0;
+    endPosition = 0;
     skipped = 0;
     delimitersCount = -1;
     delimiterPositions.clear();
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java	(working copy)
@@ -71,6 +71,10 @@
     this.group = group;
     fillBuffer(str, input);
     matcher = pattern.matcher(str);
+    // confusingly, the group count depends ENTIRELY on the pattern but is only accessible via the matcher
+    if (group >= 0 && group > matcher.groupCount()) {
+      throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
+    }
     index = 0;
   }
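To illustrate the new PatternTokenizer check (a sketch, not part of the patch): the capturing-group count is a property of the compiled pattern, so an out-of-range group now fails fast at construction instead of misbehaving during tokenization:

    // Pattern.compile("'([^']*)'") has groupCount() == 1
    Pattern p = Pattern.compile("'([^']*)'");
    new PatternTokenizer(new StringReader("'quoted'"), p, 1); // ok: emits group 1
    new PatternTokenizer(new StringReader("'quoted'"), p, 2); // now throws IllegalArgumentException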
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java	(working copy)
@@ -57,6 +57,9 @@
    */
   public PositionFilter(final TokenStream input, final int positionIncrement) {
     super(input);
+    if (positionIncrement < 0) {
+      throw new IllegalArgumentException("positionIncrement may not be negative");
+    }
     this.positionIncrement = positionIncrement;
   }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java	(working copy)
@@ -67,7 +67,7 @@
         Class.forName("org.tartarus.snowball.ext." + name + "Stemmer").asSubclass(SnowballProgram.class);
       stemmer = stemClass.newInstance();
     } catch (Exception e) {
-      throw new RuntimeException(e.toString());
+      throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
    }
  }
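With the SnowballFilter change, a bogus language name surfaces as an IllegalArgumentException that keeps the original cause, instead of a RuntimeException built from e.toString(), which dropped the stack trace. Hypothetical usage:

    TokenStream good = new SnowballFilter(input, "English");  // resolves org.tartarus.snowball.ext.EnglishStemmer
    TokenStream bad  = new SnowballFilter(input, "Klingon");  // throws IllegalArgumentException with cause attached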
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java	(revision 1311156)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java	(working copy)
@@ -18,17 +18,26 @@
  */
 
 import java.io.File;
+import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Modifier;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Random;
+import java.util.Set;
+import java.util.Map;
+import java.util.IdentityHashMap;
+import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -36,67 +45,113 @@
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.EmptyTokenizer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.analysis.hunspell.HunspellDictionary;
+import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.payloads.IdentityEncoder;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.snowball.TestSnowball;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
+import org.tartarus.snowball.SnowballProgram;
+import org.xml.sax.InputSource;
 
 /** tests random analysis chains */
 public class TestRandomChains extends BaseTokenStreamTestCase {
-  static List<Class<? extends Tokenizer>> tokenizers;
-  static List<Class<? extends TokenFilter>> tokenfilters;
-  static List<Class<? extends CharStream>> charfilters;
+  static List<Constructor<? extends Tokenizer>> tokenizers;
+  static List<Constructor<? extends TokenFilter>> tokenfilters;
+  static List<Constructor<? extends CharStream>> charfilters;
 
   @BeforeClass
   public static void beforeClass() throws Exception {
     List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
     getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
-    tokenizers = new ArrayList<Class<? extends Tokenizer>>();
-    tokenfilters = new ArrayList<Class<? extends TokenFilter>>();
-    charfilters = new ArrayList<Class<? extends CharStream>>();
-    for (Class<?> c : analysisClasses) {
-      // don't waste time with abstract classes or deprecated known-buggy ones
+    tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
+    tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
+    charfilters = new ArrayList<Constructor<? extends CharStream>>();
+    for (final Class<?> c : analysisClasses) {
       final int modifiers = c.getModifiers();
-      if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
-          || c.getAnnotation(Deprecated.class) != null
-          || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
-          // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
-          || c.equals(EmptyTokenizer.class)
-          // doesn't actual reset itself!
-          || c.equals(CachingTokenFilter.class)
-          // broken!
-          || c.equals(NGramTokenizer.class)
-          // broken!
-          || c.equals(NGramTokenFilter.class)
-          // broken!
-          || c.equals(EdgeNGramTokenizer.class)
-          // broken!
-          || c.equals(EdgeNGramTokenFilter.class)) {
+      if (
+        // don't waste time with abstract classes or deprecated known-buggy ones
+        Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
+        || c.isAnnotationPresent(Deprecated.class)
+        || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
+        || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c))
+        // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
+        || c == EmptyTokenizer.class
+        // doesn't actually reset itself!
+        || c == CachingTokenFilter.class
+        // doesn't consume the whole stream!
+        || c == LimitTokenCountFilter.class
+        // broken!
+        || c == NGramTokenizer.class
+        // broken!
+        || c == NGramTokenFilter.class
+        // broken!
+        || c == EdgeNGramTokenizer.class
+        // broken!
+        || c == EdgeNGramTokenFilter.class
+      ) {
         continue;
       }
-      if (Tokenizer.class.isAssignableFrom(c)) {
-        tokenizers.add(c.asSubclass(Tokenizer.class));
-      } else if (TokenFilter.class.isAssignableFrom(c)) {
-        tokenfilters.add(c.asSubclass(TokenFilter.class));
-      } else if (CharStream.class.isAssignableFrom(c)) {
-        charfilters.add(c.asSubclass(CharStream.class));
+      for (final Constructor<?> ctor : c.getConstructors()) {
+        // don't test deprecated ctors, they likely have known bugs:
+        if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
+          continue;
+        }
+        if (Tokenizer.class.isAssignableFrom(c)) {
+          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
+              allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+          tokenizers.add(castConstructor(Tokenizer.class, ctor));
+        } else if (TokenFilter.class.isAssignableFrom(c)) {
+          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
+              allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+          tokenfilters.add(castConstructor(TokenFilter.class, ctor));
+        } else if (CharStream.class.isAssignableFrom(c)) {
+          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
+              allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+          charfilters.add(castConstructor(CharStream.class, ctor));
+        } else {
+          fail("Cannot get here");
+        }
+      }
     }
-    final Comparator<Class<?>> classComp = new Comparator<Class<?>>() {
+
+    final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
       @Override
-      public int compare(Class<?> arg0, Class<?> arg1) {
-        return arg0.getName().compareTo(arg1.getName());
+      public int compare(Constructor<?> arg0, Constructor<?> arg1) {
+        return arg0.toGenericString().compareTo(arg1.toGenericString());
       }
     };
-    Collections.sort(tokenizers, classComp);
-    Collections.sort(tokenfilters, classComp);
-    Collections.sort(charfilters, classComp);
+    Collections.sort(tokenizers, ctorComp);
+    Collections.sort(tokenfilters, ctorComp);
+    Collections.sort(charfilters, ctorComp);
+
     if (VERBOSE) {
       System.out.println("tokenizers = " + tokenizers);
       System.out.println("tokenfilters = " + tokenfilters);
@@ -111,6 +166,304 @@
     charfilters = null;
   }
 
+  /** Hack to work around the stupidity of Oracle's strict Java backwards compatibility.
+   * {@code Class#getConstructors()} should return unmodifiable {@code List<Constructor<?>>} not array! */
+  @SuppressWarnings("unchecked")
+  private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
+    return (Constructor<T>) ctor;
+  }
+
+  private static interface ArgProducer {
+    Object create(Random random);
+  }
+
+  private static final Map<Class<?>,ArgProducer> argProducers = new IdentityHashMap<Class<?>,ArgProducer>() {{
+    put(int.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // TODO: could cause huge ram usage to use full int range for some filters
+        // (e.g. allocate enormous arrays)
+        // return Integer.valueOf(random.nextInt());
+        return Integer.valueOf(_TestUtil.nextInt(random, -100, 100));
+      }
+    });
+    put(char.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        return Character.valueOf((char)random.nextInt(65536));
+      }
+    });
+    put(float.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        return Float.valueOf(random.nextFloat());
+      }
+    });
+    put(boolean.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        return Boolean.valueOf(random.nextBoolean());
+      }
+    });
+    put(byte.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        byte bytes[] = new byte[1];
+        random.nextBytes(bytes);
+        return Byte.valueOf(bytes[0]);
+      }
+    });
+    put(byte[].class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        byte bytes[] = new byte[random.nextInt(256)];
+        random.nextBytes(bytes);
+        return bytes;
+      }
+    });
+    put(Random.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        return new Random(random.nextLong());
+      }
+    });
+    put(Version.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // we expect bugs in emulating old versions
+        return TEST_VERSION_CURRENT;
+      }
+    });
+    put(Set.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // TypeTokenFilter
+        Set<String> set = new HashSet<String>();
+        int num = random.nextInt(5);
+        for (int i = 0; i < num; i++) {
+          set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
+        }
+        return set;
+      }
+    });
+    put(Collection.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // CapitalizationFilter
+        Collection<char[]> col = new ArrayList<char[]>();
+        int num = random.nextInt(5);
+        for (int i = 0; i < num; i++) {
+          col.add(_TestUtil.randomSimpleString(random).toCharArray());
+        }
+        return col;
+      }
+    });
+    put(CharArraySet.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        int num = random.nextInt(10);
+        CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, num, random.nextBoolean());
+        for (int i = 0; i < num; i++) {
+          // TODO: make nastier
+          set.add(_TestUtil.randomSimpleString(random));
+        }
+        return set;
+      }
+    });
+    put(Pattern.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // TODO: don't want to make the exponentially slow ones Dawid documents
+        // in TestPatternReplaceFilter, so don't use truly random patterns (for now)
+        return Pattern.compile("a");
+      }
+    });
+    put(PayloadEncoder.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        return new IdentityEncoder(); // the other encoders will throw exceptions if tokens aren't numbers?
+      }
+    });
+    put(HunspellDictionary.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // TODO: make nastier
+        InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
+        InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
+        try {
+          return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
+        } catch (Exception ex) {
+          throw new RuntimeException(ex);
+        }
+      }
+    });
+    put(HyphenationTree.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // TODO: make nastier
+        try {
+          InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
+          HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+          return hyphenator;
+        } catch (Exception ex) {
+          Rethrow.rethrow(ex);
+          return null; // unreachable code
+        }
+      }
+    });
+    put(SnowballProgram.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        try {
+          String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
+          Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
+          return clazz.newInstance();
+        } catch (Exception ex) {
+          Rethrow.rethrow(ex);
+          return null; // unreachable code
+        }
+      }
+    });
+    put(String.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // TODO: make nastier
+        if (random.nextBoolean()) {
+          // a token type
+          return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
+        } else {
+          return _TestUtil.randomSimpleString(random);
+        }
+      }
+    });
+    put(NormalizeCharMap.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        NormalizeCharMap map = new NormalizeCharMap();
+        // we can't add duplicate keys, or NormalizeCharMap gets angry
+        Set<String> keys = new HashSet<String>();
+        int num = random.nextInt(5);
+        for (int i = 0; i < num; i++) {
+          String key = _TestUtil.randomSimpleString(random);
+          if (!keys.contains(key)) {
+            map.add(key, _TestUtil.randomSimpleString(random));
+            keys.add(key);
+          }
+        }
+        return map;
+      }
+    });
+    put(CharacterRunAutomaton.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        // TODO: could probably use a purely random automaton
+        switch(random.nextInt(5)) {
+          case 0: return MockTokenizer.KEYWORD;
+          case 1: return MockTokenizer.SIMPLE;
+          case 2: return MockTokenizer.WHITESPACE;
+          case 3: return MockTokenFilter.EMPTY_STOPSET;
+          default: return MockTokenFilter.ENGLISH_STOPSET;
+        }
+      }
+    });
+    put(CharArrayMap.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        int num = random.nextInt(10);
+        CharArrayMap<String> map = new CharArrayMap<String>(TEST_VERSION_CURRENT, num, random.nextBoolean());
+        for (int i = 0; i < num; i++) {
+          // TODO: make nastier
+          map.put(_TestUtil.randomSimpleString(random), _TestUtil.randomSimpleString(random));
+        }
+        return map;
+      }
+    });
+    put(SynonymMap.class, new ArgProducer() {
+      @Override public Object create(Random random) {
+        SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
+        final int numEntries = atLeast(10);
+        for (int j = 0; j < numEntries; j++) {
+          addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
+        }
+        try {
+          return b.build();
+        } catch (Exception e) {
+          throw new RuntimeException(e);
+        }
+      }
+
+      private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
+        b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
+              new CharsRef(output.replaceAll(" +", "\u0000")),
+              keepOrig);
+      }
+
+      private String randomNonEmptyString(Random random) {
+        while(true) {
+          final String s = _TestUtil.randomUnicodeString(random).trim();
+          if (s.length() != 0 && s.indexOf('\u0000') == -1) {
+            return s;
+          }
+        }
+      }
+    });
+  }};
+
+  static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
+  static {
+    allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+    allowedTokenizerArgs.addAll(argProducers.keySet());
+    allowedTokenizerArgs.add(Reader.class);
+    allowedTokenizerArgs.add(AttributeFactory.class);
+    allowedTokenizerArgs.add(AttributeSource.class);
+
+    allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+    allowedTokenFilterArgs.addAll(argProducers.keySet());
+    allowedTokenFilterArgs.add(TokenStream.class);
+    allowedTokenFilterArgs.add(CommonGramsFilter.class);
+
+    allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+    allowedCharFilterArgs.addAll(argProducers.keySet());
+    allowedCharFilterArgs.add(Reader.class);
+    allowedCharFilterArgs.add(CharStream.class);
+  }
+
+  @SuppressWarnings("unchecked")
+  static <T> T createRandomArg(Random random, Class<T> paramType) {
+    final ArgProducer producer = argProducers.get(paramType);
+    assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
+    return (T) producer.create(random);
+  }
+
+  static Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
+    Object[] args = new Object[paramTypes.length];
+    for (int i = 0; i < args.length; i++) {
+      Class<?> paramType = paramTypes[i];
+      if (paramType == Reader.class) {
+        args[i] = reader;
+      } else if (paramType == AttributeFactory.class) {
+        // TODO: maybe the collator one...???
+        args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
+      } else if (paramType == AttributeSource.class) {
+        args[i] = null; // this always gives IAE: fine
+      } else {
+        args[i] = createRandomArg(random, paramType);
+      }
+    }
+    return args;
+  }
+
+  static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
+    Object[] args = new Object[paramTypes.length];
+    for (int i = 0; i < args.length; i++) {
+      Class<?> paramType = paramTypes[i];
+      if (paramType == Reader.class) {
+        args[i] = reader;
+      } else if (paramType == CharStream.class) {
+        args[i] = CharReader.get(reader);
+      } else {
+        args[i] = createRandomArg(random, paramType);
+      }
+    }
+    return args;
+  }
+
+  static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
+    Object[] args = new Object[paramTypes.length];
+    for (int i = 0; i < args.length; i++) {
+      Class<?> paramType = paramTypes[i];
+      if (paramType == TokenStream.class) {
+        args[i] = stream;
+      } else if (paramType == CommonGramsFilter.class) {
+        // CommonGramsQueryFilter takes this one explicitly
+        args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, createRandomArg(random, CharArraySet.class));
+      } else {
+        args[i] = createRandomArg(random, paramType);
+      }
+    }
+    return args;
+  }
+
   static class MockRandomAnalyzer extends Analyzer {
     final long seed;
 
@@ -123,6 +476,8 @@
       Random random = new Random(seed);
       TokenizerSpec tokenizerspec = newTokenizer(random, reader);
       TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer);
+      //System.out.println("seed=" + seed + ",tokenizerSpec=" + tokenizerspec.toString);
+      //System.out.println("seed=" + seed + ",tokenfilterSpec=" + filterspec.toString);
       return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream);
     }
 
@@ -130,6 +485,7 @@
     protected Reader initReader(Reader reader) {
       Random random = new Random(seed);
      CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
+      //System.out.println("seed=" + seed + ",charFilterSpec=" + charfilterspec.toString);
       return charfilterspec.reader;
     }
 
@@ -159,20 +515,27 @@
       boolean success = false;
       while (!success) {
         try {
-          // TODO: check Reader+Version,Version+Reader too
-          // also look for other variants and handle them special
-          int idx = random.nextInt(tokenizers.size());
-          try {
-            Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Version.class, Reader.class);
-            spec.tokenizer = c.newInstance(TEST_VERSION_CURRENT, reader);
-          } catch (NoSuchMethodException e) {
-            Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Reader.class);
-            spec.tokenizer = c.newInstance(reader);
-          }
-          spec.toString = tokenizers.get(idx).toString();
-          success = true;
-        } catch (Exception e) {
-          // ignore
+          final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
+          final Object args[] = newTokenizerArgs(random, reader, ctor.getParameterTypes());
+          spec.tokenizer = ctor.newInstance(args);
+          spec.toString = ctor.getDeclaringClass().getName() + "(" + Arrays.toString(args) + ")";
+          success = true;
+        } catch (InvocationTargetException ite) {
+          final Throwable cause = ite.getCause();
+          if (cause instanceof IllegalArgumentException ||
+              cause instanceof UnsupportedOperationException) {
+            // that's ok, ignore
+            if (VERBOSE) {
+              System.err.println("Ignoring IAE/UOE from ctor:");
+              cause.printStackTrace(System.err);
+            }
+          } else {
+            Rethrow.rethrow(cause);
+          }
+        } catch (IllegalAccessException iae) {
+          Rethrow.rethrow(iae);
+        } catch (InstantiationException ie) {
+          Rethrow.rethrow(ie);
         }
       }
       return spec;
     }
 
@@ -187,23 +550,32 @@
       boolean success = false;
       while (!success) {
         try {
-          // TODO: also look for other variants and handle them special
-          int idx = random.nextInt(charfilters.size());
-          try {
-            Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(Reader.class);
-            spec.reader = c.newInstance(spec.reader);
-          } catch (NoSuchMethodException e) {
-            Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(CharStream.class);
-            spec.reader = c.newInstance(CharReader.get(spec.reader));
-          }
+          final Constructor<? extends CharStream> ctor = charfilters.get(random.nextInt(charfilters.size()));
+          final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
+          spec.reader = ctor.newInstance(args);
           if (descr.length() > 0) {
             descr.append(",");
           }
-          descr.append(charfilters.get(idx).toString());
+          descr.append(ctor.getDeclaringClass().getName());
+          descr.append("(" + Arrays.toString(args) + ")");
           success = true;
-        } catch (Exception e) {
-          // ignore
+        } catch (InvocationTargetException ite) {
+          final Throwable cause = ite.getCause();
+          if (cause instanceof IllegalArgumentException ||
+              cause instanceof UnsupportedOperationException) {
+            // that's ok, ignore
+            if (VERBOSE) {
+              System.err.println("Ignoring IAE/UOE from ctor:");
+              cause.printStackTrace(System.err);
+            }
+          } else {
+            Rethrow.rethrow(cause);
+          }
+        } catch (IllegalAccessException iae) {
+          Rethrow.rethrow(iae);
+        } catch (InstantiationException ie) {
+          Rethrow.rethrow(ie);
         }
       }
     }
@@ -220,22 +592,31 @@
       boolean success = false;
       while (!success) {
         try {
-          // TODO: also look for other variants and handle them special
-          int idx = random.nextInt(tokenfilters.size());
-          try {
-            Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(Version.class, TokenStream.class);
-            spec.stream = c.newInstance(TEST_VERSION_CURRENT, spec.stream);
-          } catch (NoSuchMethodException e) {
-            Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(TokenStream.class);
-            spec.stream = c.newInstance(spec.stream);
-          }
+          final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
+          final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
+          spec.stream = ctor.newInstance(args);
           if (descr.length() > 0) {
             descr.append(",");
           }
-          descr.append(tokenfilters.get(idx).toString());
+          descr.append(ctor.getDeclaringClass().getName());
+          descr.append("(" + Arrays.toString(args) + ")");
           success = true;
-        } catch (Exception e) {
-          // ignore
+        } catch (InvocationTargetException ite) {
+          final Throwable cause = ite.getCause();
+          if (cause instanceof IllegalArgumentException ||
+              cause instanceof UnsupportedOperationException) {
+            // that's ok, ignore
+            if (VERBOSE) {
+              System.err.println("Ignoring IAE/UOE from ctor:");
+              cause.printStackTrace(System.err);
+            }
+          } else {
+            Rethrow.rethrow(cause);
+          }
+        } catch (IllegalAccessException iae) {
+          Rethrow.rethrow(iae);
+        } catch (InstantiationException ie) {
+          Rethrow.rethrow(ie);
        }
      }
    }
@@ -263,7 +644,7 @@
     int numIterations = atLeast(20);
     for (int i = 0; i < numIterations; i++) {
       MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
-      if (VERBOSE) {
+      if (true || VERBOSE) {
         System.out.println("Creating random analyzer:" + a);
       }
       try {
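The retry loops above are the heart of the rewrite. Reduced to a sketch (same names as the patch, simplified control flow): pick a random constructor, synthesize arguments for its parameter types, and treat an IAE/UOE thrown by the constructor as "bogus random arguments, try again" now that the components validate their inputs:

    // Simplified sketch of the pattern used by newTokenizer/newCharFilterChain/newFilterChain.
    Tokenizer tokenizer = null;
    while (tokenizer == null) {
      Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
      Object[] args = newTokenizerArgs(random, reader, ctor.getParameterTypes());
      try {
        tokenizer = ctor.newInstance(args);
      } catch (InvocationTargetException ite) {
        Throwable cause = ite.getCause();
        if (!(cause instanceof IllegalArgumentException
            || cause instanceof UnsupportedOperationException)) {
          Rethrow.rethrow(cause); // anything other than argument validation is a real failure
        }
      }
    }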
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java	(revision 1311156)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java	(working copy)
@@ -142,14 +142,16 @@
     }
   }
 
+  /** for testing purposes ONLY */
+  public static String SNOWBALL_LANGS[] = {
+    "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
+    "Finnish", "French", "German2", "German", "Hungarian", "Irish",
+    "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
+    "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
+  };
+
   public void testEmptyTerm() throws IOException {
-    String langs[] = {
-      "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
-      "Finnish", "French", "German2", "German", "Hungarian", "Irish",
-      "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
-      "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
-    };
-    for (final String lang : langs) {
+    for (final String lang : SNOWBALL_LANGS) {
       Analyzer a = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
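SNOWBALL_LANGS is made public so the SnowballProgram ArgProducer in TestRandomChains can pick a random stemmer. A hypothetical guard (not in the patch) that would keep the list in sync with the shipped stemmer classes:

    // Hypothetical check: every entry must resolve to org.tartarus.snowball.ext.<Name>Stemmer.
    for (String lang : TestSnowball.SNOWBALL_LANGS) {
      Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
    }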