Index: lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java (revision 979453)
+++ lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java (working copy)
@@ -77,12 +77,44 @@
     return new String(buffer, 0, end);
   }
 
-  // picks a random int code point that this transition
-  // accepts, avoiding the surrogates range since they are
-  // "defined" in UTF32. Don't call this on a transition
-  // that only accepts UTF16 surrogate values!!
+  // picks a random int code point in t.min..t.max, skipping the
+  // surrogate block UNI_SUR_HIGH_START..UNI_SUR_LOW_END; throws
+  // IllegalArgumentException if t only accepts surrogates
   private static int getRandomCodePoint(final Random r, final Transition t) {
-    return t.min+r.nextInt(t.max-t.min+1);
+    final int code;
+    if (t.max < UnicodeUtil.UNI_SUR_HIGH_START ||
+        t.min > UnicodeUtil.UNI_SUR_LOW_END) {
+      // easy: entire range is before or after the surrogate block
+      code = t.min+r.nextInt(t.max-t.min+1);
+    } else if (t.min >= UnicodeUtil.UNI_SUR_HIGH_START) {
+      if (t.max > UnicodeUtil.UNI_SUR_LOW_END) {
+        // after surrogates: LOW_END+1..t.max (nextInt bound is exclusive)
+        code = 1+UnicodeUtil.UNI_SUR_LOW_END+r.nextInt(t.max-UnicodeUtil.UNI_SUR_LOW_END);
+      } else {
+        throw new IllegalArgumentException("transition accepts only surrogates: " + t);
+      }
+    } else if (t.max <= UnicodeUtil.UNI_SUR_LOW_END) {
+      if (t.min < UnicodeUtil.UNI_SUR_HIGH_START) {
+        // before surrogates: t.min..HIGH_START-1
+        code = t.min + r.nextInt(UnicodeUtil.UNI_SUR_HIGH_START - t.min);
+      } else {
+        throw new IllegalArgumentException("transition accepts only surrogates: " + t);
+      }
+    } else {
+      // range includes all surrogates: choose from the gap below or above them
+      int gap1 = UnicodeUtil.UNI_SUR_HIGH_START - t.min;
+      int gap2 = t.max - UnicodeUtil.UNI_SUR_LOW_END;
+      int c = r.nextInt(gap1+gap2);
+      if (c < gap1) {
+        code = t.min + c;
+      } else {
+        code = UnicodeUtil.UNI_SUR_LOW_END + c - gap1 + 1;
+      }
+    }
+
+    assert code >= t.min && code <= t.max && (code < UnicodeUtil.UNI_SUR_HIGH_START || code > UnicodeUtil.UNI_SUR_LOW_END):
+      "code=" + code + " min=" + t.min + " max=" + t.max;
+    return code;
   }
 
   public static class RandomAcceptedStrings {
@@ -206,7 +238,6 @@
         } else {
           t = s.transitionsArray[r.nextInt(s.numTransitions)];
         }
-
         soFar.add(getRandomCodePoint(r, t));
         s = t.to;
       }
Index: lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java (revision 979453)
+++ lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java (working copy)
@@ -100,6 +100,9 @@
         int[] acc = null;
         try {
           acc = rx.getRandomAcceptedString(r);
+          if (acc == null) {
+            continue;
+          }
           final String s = UnicodeUtil.newString(acc, 0, acc.length);
           assertTrue(BasicOperations.run(a, s));
         } catch (Throwable t) {
Index: lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java (revision 979453)
+++ lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java (working copy)
@@ -222,6 +222,9 @@
         } else {
           // will be accepted
           int[] codepoints = ras.getRandomAcceptedString(random);
+          if (codepoints == null) {
+            continue;
+          }
           try {
             string = UnicodeUtil.newString(codepoints, 0, codepoints.length);
           } catch (Exception e) {
Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 979453)
+++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy)
@@ -701,4 +701,14 @@
     }
     return sb.toString();
   }
+
+  /** Returns true if code lies in UNI_SUR_HIGH_START..UNI_SUR_HIGH_END (inclusive). */
+  public static boolean isHighSurrogate(int code) {
+    return code >= UNI_SUR_HIGH_START && code <= UNI_SUR_HIGH_END;
+  }
+
+  /** Returns true if code lies in UNI_SUR_LOW_START..UNI_SUR_LOW_END (inclusive). */
+  public static boolean isLowSurrogate(int code) {
+    return code >= UNI_SUR_LOW_START && code <= UNI_SUR_LOW_END;
+  }
 }