Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java	(revision 1311156)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java	(working copy)
@@ -76,7 +76,7 @@
    * MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false}).
    */
   public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
-    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false);
+    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
   }
 
   /**
@@ -93,7 +93,8 @@
   public TokenStreamComponents createComponents(String fieldName, Reader reader) {
     MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
     tokenizer.setEnableChecks(enableChecks);
-    TokenFilter filt = new MockTokenFilter(tokenizer, filter, enablePositionIncrements);
+    MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
+    filt.setEnablePositionIncrements(enablePositionIncrements);
     return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
   }
 
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java	(revision 1311156)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java	(working copy)
@@ -34,7 +34,9 @@
     // TODO: instead of fixed remainder... maybe a fixed
     // random seed?
     this.remainder = remainder;
-    assert remainder >= 0 && remainder < 10 : "invalid parameter";
+    if (remainder < 0 || remainder >= 10) {
+      throw new IllegalArgumentException("invalid remainder parameter (must be 0..9): " + remainder);
+    }
   }
 
   // for testing only, uses a remainder of 0
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java	(revision 1311156)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java	(working copy)
@@ -34,6 +34,9 @@
 
   public MockFixedLengthPayloadFilter(Random random, TokenStream in, int length) {
     super(in);
+    if (length < 0) {
+      throw new IllegalArgumentException("length must be >= 0");
+    }
     this.random = random;
     this.bytes = new byte[length];
     this.payload = new Payload(bytes);
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java	(revision 1311156)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java	(working copy)
@@ -55,7 +55,7 @@
       makeString("with"))));
 
   private final CharacterRunAutomaton filter;
-  private boolean enablePositionIncrements = false;
+  private boolean enablePositionIncrements = true;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
@@ -67,14 +67,16 @@
    * @param filter DFA representing the terms that should be removed.
    * @param enablePositionIncrements true if the removal should accumulate position increments.
    */
-  public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
+  public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) {
     super(input);
     this.filter = filter;
-    this.enablePositionIncrements = enablePositionIncrements;
   }
 
   @Override
   public boolean incrementToken() throws IOException {
+    // TODO: fix me when posInc=false, to work like FilteringTokenFilter in that case and not return
+    // initial token with posInc=0 ever
+
     // return the first non-stop word found
     int skippedPositions = 0;
     while (input.incrementToken()) {
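
For callers, the position-increment flag moves from the MockTokenFilter constructor to a setter, and the default flips to true. A minimal usage sketch (names taken from the diff above; the tokenizer setup is illustrative):

    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
    filt.setEnablePositionIncrements(false); // only needed to get the old default back
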
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java	(working copy)
@@ -43,16 +43,25 @@
 
   public KeywordTokenizer(Reader input, int bufferSize) {
     super(input);
+    if (bufferSize <= 0) {
+      throw new IllegalArgumentException("bufferSize must be > 0");
+    }
     termAtt.resizeBuffer(bufferSize);
   }
 
   public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
     super(source, input);
+    if (bufferSize <= 0) {
+      throw new IllegalArgumentException("bufferSize must be > 0");
+    }
     termAtt.resizeBuffer(bufferSize);
   }
 
   public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
     super(factory, input);
+    if (bufferSize <= 0) {
+      throw new IllegalArgumentException("bufferSize must be > 0");
+    }
     termAtt.resizeBuffer(bufferSize);
   }
 
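
All three KeywordTokenizer constructors get the same fail-fast check; a sketch of the new behavior (values are illustrative):

    Tokenizer ok  = new KeywordTokenizer(reader, 256); // fine
    Tokenizer bad = new KeywordTokenizer(reader, 0);   // now throws IllegalArgumentException: bufferSize must be > 0
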
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java	(working copy)
@@ -65,6 +65,12 @@
 
   public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
     super(input);
+    if (bufferSize < 0) {
+      throw new IllegalArgumentException("bufferSize cannot be negative");
+    }
+    if (skip < 0) {
+      throw new IllegalArgumentException("skip cannot be negative");
+    }
     termAtt.resizeBuffer(bufferSize);
 
     this.delimiter = delimiter;
@@ -85,10 +91,11 @@
 
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
   private int startPosition = 0;
-  private int finalOffset = 0;
   private int skipped = 0;
   private boolean endDelimiter = false;
   private StringBuilder resultToken;
+
+  private int charsRead = 0;
 
   @Override
@@ -112,12 +119,13 @@
 
     while (true) {
       int c = input.read();
-      if( c < 0 ){
+      if (c >= 0) {
+        charsRead++;
+      } else {
         if( skipped > skip ) {
           length += resultToken.length();
           termAtt.setLength(length);
-          finalOffset = correctOffset(startPosition + length);
-          offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
+          offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
           if( added ){
             resultToken.setLength(0);
             resultToken.append(termAtt.buffer(), 0, length);
@@ -125,7 +133,6 @@
           return added;
         }
         else{
-          finalOffset = correctOffset(startPosition + length);
           return false;
         }
       }
@@ -168,8 +175,7 @@
     }
     length += resultToken.length();
     termAtt.setLength(length);
-    finalOffset = correctOffset(startPosition + length);
-    offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
+    offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition+length));
     resultToken.setLength(0);
     resultToken.append(termAtt.buffer(), 0, length);
     return true;
   }
@@ -178,14 +184,15 @@
   @Override
   public final void end() {
     // set final offset
+    int finalOffset = correctOffset(charsRead);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void reset() throws IOException {
+    super.reset();
     resultToken.setLength(0);
-    finalOffset = 0;
+    charsRead = 0;
     endDelimiter = false;
     skipped = 0;
   }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java	(working copy)
@@ -77,6 +77,13 @@
 
   public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
     super(input);
+    if (bufferSize < 0) {
+      throw new IllegalArgumentException("bufferSize cannot be negative");
+    }
+    if (skip < 0) {
+      // nocommit: not quite right here: see line 84... if skip > numTokensFound we always get an ArrayIndexOutOfBoundsException? needs fixing!
+      throw new IllegalArgumentException("skip cannot be negative");
+    }
     termAtt.resizeBuffer(bufferSize);
     this.delimiter = delimiter;
     this.replacement = replacement;
@@ -137,7 +144,11 @@
       }
       resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
       resultToken.setLength(0);
-      endPosition = delimiterPositions.get(delimitersCount-1 - skip);
+      int idx = delimitersCount-1 - skip;
+      if (idx >= 0) {
+        // otherwise it's ok, because we will skip and return false
+        endPosition = delimiterPositions.get(idx);
+      }
       finalOffset = correctOffset(length);
       posAtt.setPositionIncrement(1);
     }
@@ -163,10 +174,11 @@
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void reset() throws IOException {
+    super.reset();
     resultToken.setLength(0);
     finalOffset = 0;
+    endPosition = 0;
     skipped = 0;
     delimitersCount = -1;
     delimiterPositions.clear();
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java	(working copy)
@@ -71,6 +71,10 @@
     this.group = group;
     fillBuffer(str, input);
     matcher = pattern.matcher(str);
+    // confusingly, the group count depends ENTIRELY on the pattern but is only accessible via the matcher
+    if (group >= 0 && group > matcher.groupCount()) {
+      throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
+    }
     index = 0;
   }
 
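
Sketch of the new PatternTokenizer validation (the pattern is illustrative): a pattern with one capturing group accepts group 0 (the whole match) and group 1, but group 2 is now rejected at construction time instead of failing later during tokenization:

    Pattern p = Pattern.compile("'([^']+)'");           // exactly one capturing group
    Tokenizer ok  = new PatternTokenizer(reader, p, 1); // fine
    Tokenizer bad = new PatternTokenizer(reader, p, 2); // IllegalArgumentException: invalid group specified
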
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java	(working copy)
@@ -57,6 +57,9 @@
    */
   public PositionFilter(final TokenStream input, final int positionIncrement) {
     super(input);
+    if (positionIncrement < 0) {
+      throw new IllegalArgumentException("positionIncrement may not be negative");
+    }
     this.positionIncrement = positionIncrement;
   }
 
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java	(revision 1311156)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java	(working copy)
@@ -67,7 +67,7 @@
           Class.forName("org.tartarus.snowball.ext." + name + "Stemmer").asSubclass(SnowballProgram.class);
       stemmer = stemClass.newInstance();
     } catch (Exception e) {
-      throw new RuntimeException(e.toString());
+      throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
     }
   }
 
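
With the cause wrapped, an unknown stemmer name now surfaces as an IllegalArgumentException naming the bad input rather than a bare RuntimeException holding only the message; a sketch ("Klingon" is a hypothetical invalid name):

    TokenStream good = new SnowballFilter(input, "English");
    TokenStream bad  = new SnowballFilter(input, "Klingon"); // IllegalArgumentException: Invalid stemmer class specified: Klingon
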
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java	(revision 1311156)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java	(working copy)
@@ -18,17 +18,24 @@
  */
 
 import java.io.File;
+import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Modifier;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Random;
+import java.util.Set;
+import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -36,16 +43,41 @@
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.EmptyTokenizer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.analysis.hunspell.HunspellDictionary;
+import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.payloads.IdentityEncoder;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.snowball.TestSnowball;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
+import org.tartarus.snowball.SnowballProgram;
+import org.xml.sax.InputSource;
 
 /** tests random analysis chains */
 public class TestRandomChains extends BaseTokenStreamTestCase {
@@ -64,12 +96,14 @@
       // don't waste time with abstract classes or deprecated known-buggy ones
       final int modifiers = c.getModifiers();
       if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
-        || c.getAnnotation(Deprecated.class) != null
+        || c.isAnnotationPresent(Deprecated.class)
        || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
         // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
         || c.equals(EmptyTokenizer.class)
         // doesn't actually reset itself!
         || c.equals(CachingTokenFilter.class)
+        // doesn't consume whole stream!
+        || c.equals(LimitTokenCountFilter.class)
         // broken!
         || c.equals(NGramTokenizer.class)
@@ -123,6 +157,8 @@
       Random random = new Random(seed);
       TokenizerSpec tokenizerspec = newTokenizer(random, reader);
       TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer);
+      //System.out.println("seed=" + seed + ",tokenizerSpec=" + tokenizerspec.toString);
+      //System.out.println("seed=" + seed + ",tokenfilterSpec=" + filterspec.toString);
       return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream);
     }
 
@@ -130,6 +166,7 @@
     protected Reader initReader(Reader reader) {
       Random random = new Random(seed);
       CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
+      //System.out.println("seed=" + seed + ",charFilterSpec=" + charfilterspec.toString);
       return charfilterspec.reader;
     }
 
@@ -156,23 +193,44 @@
     // create a new random tokenizer from classpath
     private TokenizerSpec newTokenizer(Random random, Reader reader) {
       TokenizerSpec spec = new TokenizerSpec();
+      Class<?> clazz = null; // out of the loop for debugging
       boolean success = false;
       while (!success) {
         try {
-          // TODO: check Reader+Version,Version+Reader too
-          // also look for other variants and handle them special
           int idx = random.nextInt(tokenizers.size());
-          try {
-            Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Version.class, Reader.class);
-            spec.tokenizer = c.newInstance(TEST_VERSION_CURRENT, reader);
-          } catch (NoSuchMethodException e) {
-            Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Reader.class);
-            spec.tokenizer = c.newInstance(reader);
+          clazz = tokenizers.get(idx);
+          Constructor<?>[] ctors = clazz.getConstructors();
+          Arrays.sort(ctors, ctorComp);
+          @SuppressWarnings("unchecked")
+          Constructor<? extends Tokenizer> ctor = (Constructor<? extends Tokenizer>) ctors[random.nextInt(ctors.length)];
+          if (ctor.isAnnotationPresent(Deprecated.class)) {
+            continue; // don't test deprecated ctors, they likely have known bugs
           }
-          spec.toString = tokenizers.get(idx).toString();
+          Object args[] = newTokenizerArgs(random, reader, ctor.getParameterTypes());
+          spec.tokenizer = ctor.newInstance(args);
+          spec.toString = clazz.getName() + ("(" + Arrays.toString(args) + ")");
           success = true;
-        } catch (Exception e) {
-          // ignore
+        } catch (InvocationTargetException ite) {
+          final Throwable cause = ite.getCause();
+          if (cause instanceof IllegalArgumentException ||
+              cause instanceof UnsupportedOperationException) {
+            // that's ok, ignore
+            if (VERBOSE) {
+              System.err.println("Ignoring IAE/UOE from ctor:");
+              cause.printStackTrace(System.err);
+            }
+          } else {
+            Rethrow.rethrow(cause);
+          }
+        } catch (UnsupportedOperationException uoe) {
+          // FIXME: this ex can only come from the random parameter generator
+          if (VERBOSE) {
+            System.err.println(uoe.getMessage());
+          }
+        } catch (IllegalAccessException iae) {
+          Rethrow.rethrow(iae);
+        } catch (InstantiationException ie) {
+          Rethrow.rethrow(ie);
         }
       }
       return spec;
     }
 
@@ -183,27 +241,54 @@
       spec.reader = reader;
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(3);
+      Class<?> clazz = null; // out of the loop for debugging
       for (int i = 0; i < numFilters; i++) {
         boolean success = false;
         while (!success) {
           try {
             // TODO: also look for other variants and handle them special
             int idx = random.nextInt(charfilters.size());
-            try {
-              Constructor<? extends Reader> c = charfilters.get(idx).getConstructor(Reader.class);
-              spec.reader = c.newInstance(spec.reader);
-            } catch (NoSuchMethodException e) {
-              Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(CharStream.class);
-              spec.reader = c.newInstance(CharReader.get(spec.reader));
+            clazz = charfilters.get(idx);
+            Constructor<?>[] ctors = clazz.getConstructors();
+            if (ctors.length == 0) {
+              continue; // CharReader: has only a static get....
             }
+            Arrays.sort(ctors, ctorComp);
+            @SuppressWarnings("unchecked")
+            Constructor<? extends Reader> ctor = (Constructor<? extends Reader>) ctors[random.nextInt(ctors.length)];
+            if (ctor.isAnnotationPresent(Deprecated.class)) {
+              continue; // don't test deprecated ctors, they likely have known bugs
+            }
+            Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
+            spec.reader = ctor.newInstance(args);
             if (descr.length() > 0) {
               descr.append(",");
             }
-            descr.append(charfilters.get(idx).toString());
+            descr.append(clazz.getName());
+            descr.append("(" + Arrays.toString(args) + ")");
             success = true;
-          } catch (Exception e) {
-            // ignore
+          } catch (InvocationTargetException ite) {
+            final Throwable cause = ite.getCause();
+            if (cause instanceof IllegalArgumentException ||
+                cause instanceof UnsupportedOperationException) {
+              // that's ok, ignore
+              if (VERBOSE) {
+                System.err.println("Ignoring IAE/UOE from ctor:");
+                cause.printStackTrace(System.err);
+              }
+            } else {
+              Rethrow.rethrow(cause);
+            }
+          } catch (UnsupportedOperationException uoe) {
+            // FIXME: this ex can only come from the random parameter generator
+            if (VERBOSE) {
+              System.err.println(uoe.getMessage());
+            }
+          } catch (IllegalAccessException iae) {
+            Rethrow.rethrow(iae);
+          } catch (InstantiationException ie) {
+            Rethrow.rethrow(ie);
           }
         }
       }
       spec.toString = descr.toString();
       return spec;
     }
 
@@ -216,32 +301,276 @@
       spec.stream = tokenizer;
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(5);
+      Class<?> clazz = null; // out of the loop for debugging
       for (int i = 0; i < numFilters; i++) {
         boolean success = false;
         while (!success) {
           try {
-            // TODO: also look for other variants and handle them special
             int idx = random.nextInt(tokenfilters.size());
-            try {
-              Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(Version.class, TokenStream.class);
-              spec.stream = c.newInstance(TEST_VERSION_CURRENT, spec.stream);
-            } catch (NoSuchMethodException e) {
-              Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(TokenStream.class);
-              spec.stream = c.newInstance(spec.stream);
+            clazz = tokenfilters.get(idx);
+            Constructor<?>[] ctors = clazz.getConstructors();
+            Arrays.sort(ctors, ctorComp);
+            @SuppressWarnings("unchecked")
+            Constructor<? extends TokenFilter> ctor = (Constructor<? extends TokenFilter>) ctors[random.nextInt(ctors.length)];
+            if (ctor.isAnnotationPresent(Deprecated.class)) {
+              continue; // don't test deprecated ctors, they likely have known bugs
             }
+            Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
+            spec.stream = ctor.newInstance(args);
             if (descr.length() > 0) {
               descr.append(",");
             }
-            descr.append(tokenfilters.get(idx).toString());
+            descr.append(clazz.getName());
+            descr.append("(" + Arrays.toString(args) + ")");
             success = true;
-          } catch (Exception e) {
-            // ignore
+          } catch (InvocationTargetException ite) {
+            final Throwable cause = ite.getCause();
+            if (cause instanceof IllegalArgumentException ||
+                cause instanceof UnsupportedOperationException) {
+              // that's ok, ignore
+              if (VERBOSE) {
+                System.err.println("Ignoring IAE/UOE from ctor:");
+                cause.printStackTrace(System.err);
+              }
+            } else {
+              Rethrow.rethrow(cause);
+            }
+          } catch (UnsupportedOperationException uoe) {
+            // FIXME: this ex can only come from the random parameter generator
+            if (VERBOSE) {
+              System.err.println(uoe.getMessage());
+            }
+          } catch (IllegalAccessException iae) {
+            Rethrow.rethrow(iae);
+          } catch (InstantiationException ie) {
+            Rethrow.rethrow(ie);
           }
         }
       }
       spec.toString = descr.toString();
       return spec;
     }
+
+    private Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
+      Object[] args = new Object[paramTypes.length];
+      for (int i = 0; i < args.length; i++) {
+        Class<?> paramType = paramTypes[i];
+        if (paramType.equals(Reader.class)) {
+          args[i] = reader;
+        } else if (paramType.equals(AttributeFactory.class)) {
+          // TODO: maybe the collator one...???
+          args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
+        } else if (paramType.equals(AttributeSource.class)) {
+          args[i] = null; // this always gives IAE: fine
+        } else {
+          args[i] = randomArg(random, paramType);
+        }
+      }
+      return args;
+    }
+
+    private Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
+      Object[] args = new Object[paramTypes.length];
+      for (int i = 0; i < args.length; i++) {
+        Class<?> paramType = paramTypes[i];
+        if (paramType.equals(Reader.class)) {
+          args[i] = reader;
+        } else if (paramType.equals(CharStream.class)) {
+          args[i] = CharReader.get(reader);
+        } else {
+          args[i] = randomArg(random, paramType);
+        }
+      }
+      return args;
+    }
+
+    private Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
+      Object[] args = new Object[paramTypes.length];
+      for (int i = 0; i < args.length; i++) {
+        Class<?> paramType = paramTypes[i];
+        if (paramType.equals(TokenStream.class)) {
+          args[i] = stream;
+        } else if (paramType.equals(CommonGramsFilter.class)) {
+          // CommonGramsQueryFilter takes this one explicitly
+          args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, randomCharArraySet(random));
+        } else {
+          args[i] = randomArg(random, paramType);
+        }
+      }
+      return args;
+    }
+
+    // TODO: we should sometimes pass null for some params?
+    private Object randomArg(Random random, Class<?> paramType) {
+      if (paramType.equals(int.class)) {
+        // TODO: could cause huge ram usage to use full int range for some filters
+        // (e.g. allocate enormous arrays)
+        // return Integer.valueOf(random.nextInt());
+        return Integer.valueOf(_TestUtil.nextInt(random, -100, 100));
+      } else if (paramType.equals(char.class)) {
+        return Character.valueOf((char)random.nextInt(65536));
+      } else if (paramType.equals(float.class)) {
+        return Float.valueOf(random.nextFloat());
+      } else if (paramType.equals(boolean.class)) {
+        return Boolean.valueOf(random.nextBoolean());
+      } else if (paramType.equals(byte.class)) {
+        byte bytes[] = new byte[1];
+        random.nextBytes(bytes);
+        return Byte.valueOf(bytes[0]);
+      } else if (paramType.equals(byte[].class)) {
+        byte bytes[] = new byte[random.nextInt(256)];
+        random.nextBytes(bytes);
+        return bytes;
+      } else if (paramType.equals(Random.class)) {
+        return new Random(random.nextLong());
+      } else if (paramType.equals(Version.class)) {
+        // we expect bugs in emulating old versions
+        return TEST_VERSION_CURRENT;
+      } else if (paramType.equals(Set.class)) {
+        // TypeTokenFilter
+        Set<String> set = new HashSet<String>();
+        int num = random.nextInt(5);
+        for (int i = 0; i < num; i++) {
+          set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
+        }
+        return set;
+      } else if (paramType.equals(Collection.class)) {
+        // CapitalizationFilter
+        Collection<char[]> col = new ArrayList<char[]>();
+        int num = random.nextInt(5);
+        for (int i = 0; i < num; i++) {
+          col.add(_TestUtil.randomSimpleString(random).toCharArray());
+        }
+        return col;
+      } else if (paramType.equals(CharArraySet.class)) {
+        return randomCharArraySet(random);
+      } else if (paramType.equals(Pattern.class)) {
+        // TODO: don't want to make the exponentially slow ones Dawid documents
+        // in TestPatternReplaceFilter, so don't use truly random patterns (for now)
+        return Pattern.compile("a");
+      } else if (paramType.equals(PayloadEncoder.class)) {
+        return new IdentityEncoder(); // the other encoders will throw exceptions if tokens aren't numbers?
+      } else if (paramType.equals(HunspellDictionary.class)) {
+        // TODO: make nastier
+        InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
+        InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
+        try {
+          return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
+        } catch (Exception ex) {
+          throw new RuntimeException(ex);
+        }
+      } else if (paramType.equals(HyphenationTree.class)) {
+        // TODO: make nastier
+        try {
+          InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
+          HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+          return hyphenator;
+        } catch (Exception ex) {
+          throw new RuntimeException(ex);
+        }
+      } else if (paramType.equals(SnowballProgram.class)) {
+        try {
+          String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
+          Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
+          return clazz.newInstance();
+        } catch (Exception ex) {
+          throw new RuntimeException(ex);
+        }
+      } else if (paramType.equals(String.class)) {
+        // TODO: make nastier
+        if (random.nextBoolean()) {
+          // a token type
+          return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
+        } else {
+          return _TestUtil.randomSimpleString(random);
+        }
+      } else if (paramType.equals(NormalizeCharMap.class)) {
+        NormalizeCharMap map = new NormalizeCharMap();
+        // we can't add duplicate keys, or NormalizeCharMap gets angry
+        Set<String> keys = new HashSet<String>();
+        int num = random.nextInt(5);
+        for (int i = 0; i < num; i++) {
+          String key = _TestUtil.randomSimpleString(random);
+          if (!keys.contains(key)) {
+            map.add(key, _TestUtil.randomSimpleString(random));
+            keys.add(key);
+          }
+        }
+        return map;
+      } else if (paramType.equals(CharacterRunAutomaton.class)) {
+        // TODO: could probably use a purely random automaton
+        switch(random.nextInt(5)) {
+          case 0: return MockTokenizer.KEYWORD;
+          case 1: return MockTokenizer.SIMPLE;
+          case 2: return MockTokenizer.WHITESPACE;
+          case 3: return MockTokenFilter.EMPTY_STOPSET;
+          default: return MockTokenFilter.ENGLISH_STOPSET;
+        }
+      } else if (paramType.equals(CharArrayMap.class)) {
+        return randomCharArrayMap(random);
+      } else if (paramType.equals(SynonymMap.class)) {
+        return randomSynonymMap(random);
+      } else {
+        throw new UnsupportedOperationException("Don't know how to make a random " + paramType);
+      }
+    }
+
+    private CharArraySet randomCharArraySet(Random random) {
+      int num = random.nextInt(10);
+      CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, num, random.nextBoolean());
+      for (int i = 0; i < num; i++) {
+        // TODO: make nastier
+        set.add(_TestUtil.randomSimpleString(random));
+      }
+      return set;
+    }
+
+    private CharArrayMap<String> randomCharArrayMap(Random random) {
+      int num = random.nextInt(10);
+      CharArrayMap<String> map = new CharArrayMap<String>(TEST_VERSION_CURRENT, num, random.nextBoolean());
+      for (int i = 0; i < num; i++) {
+        // TODO: make nastier
+        map.put(_TestUtil.randomSimpleString(random), _TestUtil.randomSimpleString(random));
+      }
+      return map;
+    }
+
+    // unapologetically stolen from TestSynonymMapFilter
+    private SynonymMap randomSynonymMap(Random random) {
+      SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
+      final int numEntries = atLeast(10);
+      for (int j = 0; j < numEntries; j++) {
+        addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
+      }
+      try {
+        return b.build();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
+      b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
+            new CharsRef(output.replaceAll(" +", "\u0000")),
+            keepOrig);
+    }
+
+    private String randomNonEmptyString(Random random) {
+      while(true) {
+        final String s = _TestUtil.randomUnicodeString(random).trim();
+        if (s.length() != 0 && s.indexOf('\u0000') == -1) {
+          return s;
+        }
+      }
+    }
+
+    // TODO: better comparator?
+    final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
+      @Override
+      public int compare(Constructor<?> arg0, Constructor<?> arg1) {
+        return arg0.toGenericString().compareTo(arg1.toGenericString());
+      }
+    };
   }
 
   static class TokenizerSpec {
@@ -263,7 +592,7 @@
     int numIterations = atLeast(20);
     for (int i = 0; i < numIterations; i++) {
       MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
-      if (VERBOSE) {
+      if (true || VERBOSE) {
        System.out.println("Creating random analyzer:" + a);
      }
      try {
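
The commented-out println calls above pair each random spec with its seed, so a failing chain can be rebuilt by hand while debugging. A hypothetical reproduction sketch (the seed, spec strings, and arguments are illustrative, not real output):

    // seed=42,tokenizerSpec=org.apache.lucene.analysis.standard.StandardTokenizer([...])
    // seed=42,tokenfilterSpec=org.apache.lucene.analysis.commongrams.CommonGramsFilter([...])
    Tokenizer tok = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader("the quick brown fox"));
    TokenStream chain = new CommonGramsFilter(TEST_VERSION_CURRENT, tok,
        new CharArraySet(TEST_VERSION_CURRENT, Collections.singleton("the"), true));
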
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java	(revision 1311156)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java	(working copy)
@@ -142,14 +142,16 @@
     }
   }
 
+  /** for testing purposes ONLY */
+  public static String SNOWBALL_LANGS[] = {
+    "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
+    "Finnish", "French", "German2", "German", "Hungarian", "Irish",
+    "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
+    "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
+  };
+
   public void testEmptyTerm() throws IOException {
-    String langs[] = {
-      "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
-      "Finnish", "French", "German2", "German", "Hungarian", "Irish",
-      "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
-      "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
-    };
-    for (final String lang : langs) {
+    for (final String lang : SNOWBALL_LANGS) {
       Analyzer a = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName, Reader reader) {