diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java index c316e1f..16b759e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java @@ -34,6 +34,9 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.util._TestUtil; import org.junit.Ignore; +import com.carrotsearch.randomizedtesting.RandomizedTest; +import com.carrotsearch.randomizedtesting.generators.RandomInts; + /** * Tests {@link PatternReplaceCharFilter} */ @@ -300,12 +303,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { - int numPatterns = atLeast(100); - long start = System.currentTimeMillis(); - long maxTime = 1000 * 2; + int numPatterns = 10 + random().nextInt(20); Random random = new Random(random().nextLong()); - for (int i = 0; i < numPatterns && start + maxTime > System.currentTimeMillis(); i++) { + for (int i = 0; i < numPatterns; i++) { final Pattern p = _TestUtil.randomPattern(random()); + final String replacement = _TestUtil.randomSimpleString(random); Analyzer a = new Analyzer() { @Override @@ -319,9 +321,13 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { return new PatternReplaceCharFilter(p, replacement, CharReader.get(reader)); } }; - checkRandomData(random, a, 1000 * RANDOM_MULTIPLIER, - /* max input length. don't make it longer -- exponential processing - * time for certain patterns. */ 40, true); // only ascii + + /* max input length. don't make it longer -- exponential processing + * time for certain patterns. */ + final int maxInputLength = 30; + /* ASCII only input?: */ + final boolean asciiOnly = true; + checkRandomData(random, a, 250 * RANDOM_MULTIPLIER, maxInputLength, asciiOnly); } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index adb3595..5dc5e6e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -77,6 +77,10 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.junit.Assert; +import com.carrotsearch.randomizedtesting.generators.RandomInts; +import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import com.carrotsearch.randomizedtesting.generators.RandomStrings; + /** * General utility methods for Lucene unit tests. */ @@ -293,37 +297,49 @@ public class _TestUtil { public static String randomRegexpishString(Random r) { return randomRegexpishString(r, 20); } - + + /** + * Maximum recursion bound for '+' and '*' replacements in + * {@link #randomRegexpishString(Random, int)}. + */ + private final static int maxRecursionBound = 5; + + /** + * Operators for {@link #randomRegexpishString(Random, int)}. + */ + private final static List ops = Arrays.asList( + ".", "?", + "{0," + maxRecursionBound + "}", // bounded replacement for '*' + "{1," + maxRecursionBound + "}", // bounded replacement for '+' + "(", + ")", + "-", + "[", + "]", + "|" + ); + /** * Returns a String thats "regexpish" (contains lots of operators typically found in regular expressions) * If you call this enough times, you might get a valid regex! + * + *

Note: to avoid practically endless backtracking patterns we replace asterisk and plus + * operators with bounded repetitions. See LUCENE-4111 for more info. + * + * @param maxLength A hint about maximum length of the regexpish string. It may be exceeded by a few characters. */ public static String randomRegexpishString(Random r, int maxLength) { - final int end = nextInt(r, 0, maxLength); - if (end == 0) { - // allow 0 length - return ""; - } - final char[] buffer = new char[end]; - for (int i = 0; i < end; i++) { - int t = r.nextInt(11); - if (t == 0) { - buffer[i] = (char) _TestUtil.nextInt(r, 97, 102); + final StringBuilder regexp = new StringBuilder(maxLength); + for (int i = nextInt(r, 0, maxLength); i > 0; i--) { + if (r.nextBoolean()) { + regexp.append((char) RandomInts.randomIntBetween(r, 'a', 'z')); + } else { + regexp.append(RandomPicks.randomFrom(r, ops)); } - else if (1 == t) buffer[i] = '.'; - else if (2 == t) buffer[i] = '?'; - else if (3 == t) buffer[i] = '*'; - else if (4 == t) buffer[i] = '+'; - else if (5 == t) buffer[i] = '('; - else if (6 == t) buffer[i] = ')'; - else if (7 == t) buffer[i] = '-'; - else if (8 == t) buffer[i] = '['; - else if (9 == t) buffer[i] = ']'; - else if (10 == t) buffer[i] = '|'; } - return new String(buffer, 0, end); + return regexp.toString(); } - + private static final String[] HTML_CHAR_ENTITIES = { "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde", "Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH", @@ -933,8 +949,9 @@ public class _TestUtil { Pattern p = Pattern.compile(_TestUtil.randomRegexpishString(random)); // Make sure the result of applying the pattern to a string with extended // unicode characters is a valid utf16 string. See LUCENE-4078 for discussion. - if (UnicodeUtil.validUTF16String(p.matcher(nonBmpString).replaceAll("_"))) + if (UnicodeUtil.validUTF16String(p.matcher(nonBmpString).replaceAll("_"))) { return p; + } } catch (PatternSyntaxException ignored) { // Loop trying until we hit something that compiles. }