Index: /luni/src/main/java/java/lang/Character.java =================================================================== --- /luni/src/main/java/java/lang/Character.java (revision 383745) +++ /luni/src/main/java/java/lang/Character.java (working copy) @@ -15,7 +15,6 @@ package java.lang; - import java.io.Serializable; import org.apache.harmony.luni.util.BinarySearch; @@ -20,17 +19,44 @@ import org.apache.harmony.luni.util.BinarySearch; - /** - * Characters are objects (i.e. non-base types) which represent char values. It - * also provides a number of methods for the lexicographic categorization of - * char values. + *

+ * Character is the wrapper for the primitive type char. This + * class also provides a number of utility methods for working with + * chars. + *

+ * + *

+ * Character data is based upon the Unicode Standard, 4.0. The Unicode + * specification, character tables and other information are available at http://www.unicode.org/. + *

+ * + *

+ * Unicode characters are referred to as code points. The range of valid + * code points is U+0000 to U+10FFFF. The Basic Multilingual Plane (BMP) + * is the code point range U+0000 to U+FFFF. Characters above the BMP are + * referred to as Supplementary Characters. On the Java platform, UTF-16 + * encoding and char pairs are used to represent code points in + * the supplementary range. A pair of char values that represents + * a supplementary character is made up of a high surrogate with a + * value range of 0xD800 to 0xDBFF and a low surrogate with a value + * range of 0xDC00 to 0xDFFF. + *

+ * + *

+ * On the Java platform a char value represents either a single + * BMP code point or a UTF-16 unit that's part of a surrogate pair. The + * int type is used to represent all Unicode code points. + *

+ * + * @since 1.0 */ public final class Character implements Serializable, Comparable { - + //TODO Add Comparable when support for generics is available. private static final long serialVersionUID = 3786198910865385080L; - final char value; + private final char value; /** * The minimum possible Character value. @@ -205,6 +231,7 @@ /** * Unicode category constant Pi. + * @since 1.4 */ public static final byte INITIAL_QUOTE_PUNCTUATION = 29; @@ -210,6 +237,7 @@ /** * Unicode category constant Pf. + * @since 1.4 */ public static final byte FINAL_QUOTE_PUNCTUATION = 30; @@ -215,6 +243,7 @@ /** * Unicode bidirectional constant. + * @since 1.4 */ public static final byte DIRECTIONALITY_UNDEFINED = -1; @@ -220,6 +249,7 @@ /** * Unicode bidirectional constant L. + * @since 1.4 */ public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; @@ -225,6 +255,7 @@ /** * Unicode bidirectional constant R. + * @since 1.4 */ public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; @@ -230,6 +261,7 @@ /** * Unicode bidirectional constant AL. + * @since 1.4 */ public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; @@ -235,6 +267,7 @@ /** * Unicode bidirectional constant EN. + * @since 1.4 */ public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; @@ -240,6 +273,7 @@ /** * Unicode bidirectional constant ES. + * @since 1.4 */ public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; @@ -245,6 +279,7 @@ /** * Unicode bidirectional constant ET. + * @since 1.4 */ public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; @@ -250,6 +285,7 @@ /** * Unicode bidirectional constant AN. + * @since 1.4 */ public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; @@ -255,6 +291,7 @@ /** * Unicode bidirectional constant CS. + * @since 1.4 */ public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; @@ -260,6 +297,7 @@ /** * Unicode bidirectional constant NSM. 
+ * @since 1.4 */ public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; @@ -265,6 +303,7 @@ /** * Unicode bidirectional constant BN. + * @since 1.4 */ public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; @@ -270,6 +309,7 @@ /** * Unicode bidirectional constant B. + * @since 1.4 */ public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; @@ -275,6 +315,7 @@ /** * Unicode bidirectional constant S. + * @since 1.4 */ public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; @@ -280,6 +321,7 @@ /** * Unicode bidirectional constant WS. + * @since 1.4 */ public static final byte DIRECTIONALITY_WHITESPACE = 12; @@ -285,6 +327,7 @@ /** * Unicode bidirectional constant ON. + * @since 1.4 */ public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; @@ -290,6 +333,7 @@ /** * Unicode bidirectional constant LRE. + * @since 1.4 */ public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; @@ -295,6 +339,7 @@ /** * Unicode bidirectional constant LRO. + * @since 1.4 */ public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; @@ -300,6 +345,7 @@ /** * Unicode bidirectional constant RLE. + * @since 1.4 */ public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; @@ -305,6 +351,7 @@ /** * Unicode bidirectional constant RLO. + * @since 1.4 */ public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; @@ -310,8 +357,103 @@ /** * Unicode bidirectional constant PDF. + * @since 1.4 */ public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; + + /** + *

+ * Minimum value of a high surrogate or leading surrogate unit in UTF-16 + * encoding - '\uD800'. + *

+ * + * @since 1.5 + */ + public static final char MIN_HIGH_SURROGATE = '\uD800'; + + /** + *

+ * Maximum value of a high surrogate or leading surrogate unit in UTF-16 + * encoding - '\uDBFF'. + *

+ * + * @since 1.5 + */ + public static final char MAX_HIGH_SURROGATE = '\uDBFF'; + + /** + *

+ * Minimum value of a low surrogate or trailing surrogate unit in UTF-16 + * encoding - '\uDC00'. + *

+ * + * @since 1.5 + */ + public static final char MIN_LOW_SURROGATE = '\uDC00'; + + /** + * Maximum value of a low surrogate or trailing surrogate unit in UTF-16 + * encoding - '\uDFFF'. + *

+ * + * @since 1.5 + */ + public static final char MAX_LOW_SURROGATE = '\uDFFF'; + + /** + *

+ * Minimum value of a surrogate unit in UTF-16 encoding - '\uD800'. + *

+ * + * @since 1.5 + */ + public static final char MIN_SURROGATE = '\uD800'; + + /** + *

+ * Maximum value of a surrogate unit in UTF-16 encoding - '\uDFFF'. + *

+ * + * @since 1.5 + */ + public static final char MAX_SURROGATE = '\uDFFF'; + + /** + *

+ * Minimum value of a supplementary code point - U+10000. + *

+ * + * @since 1.5 + */ + public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; + + /** + *

+ * Minimum code point value - U+0000. + *

+ * + * @since 1.5 + */ + public static final int MIN_CODE_POINT = 0x000000; + + /** + *

+ * Maximum code point value - U+10FFFF. + *

+ * + * @since 1.5 + */ + public static final int MAX_CODE_POINT = 0x10FFFF; + + /** + *

+ * Constant for the number of bits used to represent a char in + * unsigned binary form. + *

+ * + * @since 1.5 + */ + public static final int SIZE = 16; // Unicode 3.0.1 (same as Unicode 3.0.0) private static final String bidiKeys = "\u0000\t\f\u000e\u001c\u001f!#&+/1: + * Returns a Character instance for the char + * value passed. This method is preferred over the constructor, as this + * method may maintain a cache of instances. + *

+ * + * @param c The char value to wrap. + * @return A Character instance holding c; values that fall inside the + * cache index range are served from the shared cache. + * @since 1.5 + */ + public static Character valueOf(char c) { + // Valid cache indices are 0 .. CACHE.length - 1, so a value equal to + // CACHE.length must bypass the cache as well. + if (c >= CACHE.length) + return new Character(c); + synchronized (CACHE) { + Character ch = CACHE[c]; + if (ch == null) + CACHE[c] = ch = new Character(c); + return ch; + } + } + + /** + *

+ * A test for determining if the codePoint is a valid Unicode + * code point. + *

+ * + * @param codePoint The value to check. + * @return true if codePoint is no smaller than MIN_CODE_POINT and no + * larger than MAX_CODE_POINT, false otherwise. + * @since 1.5 + */ + public static boolean isValidCodePoint(int codePoint) { + if (codePoint < MIN_CODE_POINT) + return false; + return codePoint <= MAX_CODE_POINT; + } + + /** + *

+ * A test for determining if the codePoint is within the + * supplementary code point range. + *

+ * + * @param codePoint The value to check. + * @return true if codePoint is at least MIN_SUPPLEMENTARY_CODE_POINT + * and at most MAX_CODE_POINT, false otherwise. + * @since 1.5 + */ + public static boolean isSupplementaryCodePoint(int codePoint) { + if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) + return false; + return codePoint <= MAX_CODE_POINT; + } + + /** + *

+ * A test for determining if the char is a high + * surrogate/leading surrogate unit that's used for representing + * supplementary characters in UTF-16 encoding. + *

+ * + * @param ch The char unit to classify. + * @return true if ch lies between MIN_HIGH_SURROGATE and + * MAX_HIGH_SURROGATE inclusive, false otherwise. + * @since 1.5 + * @see #isLowSurrogate(char) + */ + public static boolean isHighSurrogate(char ch) { + if (ch < MIN_HIGH_SURROGATE) + return false; + return ch <= MAX_HIGH_SURROGATE; + } + + /** + *

+ * A test for determining if the char is a low + * surrogate/trailing surrogate unit that's used for representing + * supplementary characters in UTF-16 encoding. + *

+ * + * @param ch The char unit to classify. + * @return true if ch lies between MIN_LOW_SURROGATE and + * MAX_LOW_SURROGATE inclusive, false otherwise. + * @since 1.5 + * @see #isHighSurrogate(char) + */ + public static boolean isLowSurrogate(char ch) { + if (ch < MIN_LOW_SURROGATE) + return false; + return ch <= MAX_LOW_SURROGATE; + } + + /** + *

+ * A test for determining if the char pair is a valid + * surrogate pair. + *

+ * + * @param high The candidate high surrogate unit. + * @param low The candidate low surrogate unit. + * @return true only if high passes isHighSurrogate and low passes + * isLowSurrogate. + * @since 1.5 + * @see #isHighSurrogate(char) + * @see #isLowSurrogate(char) + */ + public static boolean isSurrogatePair(char high, char low) { + if (!isHighSurrogate(high)) + return false; + return isLowSurrogate(low); + } + + /** + *

+ * Calculates the number of char values required to represent + * the Unicode code point. This method only tests if the + * codePoint is greater than or equal to 0x10000, + * in which case 2 is returned, otherwise 1. + * To test if the code point is valid, use the + * {@link #isValidCodePoint(int)} method. + *

+ * + * @param codePoint The code point to measure; it is not validated. + * @return 2 when codePoint is 0x10000 or greater, otherwise 1. + * @since 1.5 + * @see #isValidCodePoint(int) + * @see #isSupplementaryCodePoint(int) + */ + public static int charCount(int codePoint) { + if (codePoint < 0x10000) + return 1; + return 2; + } + + /** + *

+ * Converts a surrogate pair into a Unicode code point. This method assumes + * that the pair are valid surrogates. If the pair are NOT valid surrogates, + * then the result is indeterminate. The + * {@link #isSurrogatePair(char, char)} method should be used prior to this + * method to validate the pair. + *

+ * + * @param high The high surrogate unit. + * @param low The low surrogate unit. + * @return The code point assembled from the low ten bits of each unit, + * offset by 0x10000. + * @since 1.5 + * @see #isSurrogatePair(char, char) + */ + public static int toCodePoint(char high, char low) { + // UTF-16 decoding as described in RFC 2781, Section 2.2 + // (http://www.faqs.org/rfcs/rfc2781.html): each unit carries ten + // payload bits. + int upperTen = high & 0x3FF; + int lowerTen = low & 0x3FF; + return 0x10000 + ((upperTen << 10) | lowerTen); + } + + /** + *

+ * Returns the code point at the index in the CharSequence. + * If char unit at the index is a high-surrogate unit, the + * next index is less than the length of the sequence and the + * char unit at the next index is a low surrogate unit, then + * the code point represented by the pair is returned; otherwise the + * char unit at the index is returned. + *

+ * + * @param seq The sequence of char units to read from. + * @param index The position in seq of the unit to + * decode. + * @return The code point found at index. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if index is negative + * or not less than seq.length(). + * @since 1.5 + */ + public static int codePointAt(CharSequence seq, int index) { + if (seq == null) + throw new NullPointerException(); + final int length = seq.length(); + if (index < 0 || index >= length) + throw new IndexOutOfBoundsException(); + + final char first = seq.charAt(index); + final int following = index + 1; + if (following < length) { + // Only a complete high/low pair decodes to a supplementary + // code point. + final char second = seq.charAt(following); + if (isSurrogatePair(first, second)) + return toCodePoint(first, second); + } + return first; + } + + /** + *

+ * Returns the code point at the index in the char[]. If + * char unit at the index is a high-surrogate unit, the next + * index is less than the length of the sequence and the char + * unit at the next index is a low surrogate unit, then the code point + * represented by the pair is returned; otherwise the char + * unit at the index is returned. + *

+ * + * @param seq The array of char units to read from. + * @param index The position in seq of the unit to + * decode. + * @return The code point found at index. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if index is negative + * or not less than seq.length. + * @since 1.5 + */ + public static int codePointAt(char[] seq, int index) { + if (seq == null) + throw new NullPointerException(); + final int length = seq.length; + if (index < 0 || index >= length) + throw new IndexOutOfBoundsException(); + + final char first = seq[index]; + final int following = index + 1; + if (following < length) { + // Only a complete high/low pair decodes to a supplementary + // code point. + final char second = seq[following]; + if (isSurrogatePair(first, second)) + return toCodePoint(first, second); + } + return first; + } + + /** + *

+ * Returns the code point at the index in the char[] that's + * within the limit. If char unit at the index is a + * high-surrogate unit, the next index is less than the limit + * and the char unit at the next index is a low surrogate + * unit, then the code point represented by the pair is returned; otherwise + * the char unit at the index is returned. + *

+ * + * @param seq The array of char units to read from. + * @param index The position in seq of the unit to + * decode. + * @param limit The exclusive upper bound in seq beyond + * which no unit is read. + * @return The code point found at index. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if index is + * negative or not less than limit, or if + * limit is negative or exceeds the length of + * seq. + * @since 1.5 + */ + public static int codePointAt(char[] seq, int index, int limit) { + if (seq == null) + throw new NullPointerException(); + final int length = seq.length; + if (index < 0 || index >= limit || limit < 0 || limit > length) + throw new IndexOutOfBoundsException(); + + final char first = seq[index]; + final int following = index + 1; + if (following < limit) { + // The unit after index only counts when it is below limit. + final char second = seq[following]; + if (isSurrogatePair(first, second)) + return toCodePoint(first, second); + } + return first; + } + + /** + *

+ * Returns the Unicode code point that precedes the index in + * the CharSequence. If the char unit at + * index - 1 is within the low surrogate range, the value + * index - 2 isn't negative and the char unit + * at index - 2 is within the high surrogate range, then the + * supplementary code point made up of the surrogate pair is returned; + * otherwise, the char value at index - 1 is + * returned. + *

+ * + * @param seq The CharSequence to read from. + * @param index The position in seq just after the code point wanted. + * @return The code point that ends immediately before index. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if index is less than 1 + * or greater than seq.length(). + * @since 1.5 + */ + public static int codePointBefore(CharSequence seq, int index) { + if (seq == null) + throw new NullPointerException(); + final int length = seq.length(); + if (index < 1 || index > length) + throw new IndexOutOfBoundsException(); + + final char last = seq.charAt(index - 1); + final int preceding = index - 2; + if (preceding >= 0) { + // Only a complete high/low pair decodes to a supplementary + // code point. + final char lead = seq.charAt(preceding); + if (isSurrogatePair(lead, last)) + return toCodePoint(lead, last); + } + return last; + } + + /** + *

+ * Returns the Unicode code point that precedes the index in + * the char[]. If the char unit at + * index - 1 is within the low surrogate range, the value + * index - 2 isn't negative and the char unit + * at index - 2 is within the high surrogate range, then the + * supplementary code point made up of the surrogate pair is returned; + * otherwise, the char value at index - 1 is + * returned. + *

+ * + * @param seq The char[] to read from. + * @param index The position in seq just after the code point wanted. + * @return The code point that ends immediately before index. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if index is less than 1 + * or greater than seq.length. + * @since 1.5 + */ + public static int codePointBefore(char[] seq, int index) { + if (seq == null) + throw new NullPointerException(); + final int length = seq.length; + if (index < 1 || index > length) + throw new IndexOutOfBoundsException(); + + final char last = seq[index - 1]; + final int preceding = index - 2; + if (preceding >= 0) { + // Only a complete high/low pair decodes to a supplementary + // code point. + final char lead = seq[preceding]; + if (isSurrogatePair(lead, last)) + return toCodePoint(lead, last); + } + return last; + } + + /** + *

+ * Returns the Unicode code point that precedes the index in + * the char[] and isn't less than start. If + * the char unit at index - 1 is within the + * low surrogate range, the value index - 2 isn't less than + * start and the char unit at + * index - 2 is within the high surrogate range, then the + * supplementary code point made up of the surrogate pair is returned; + * otherwise, the char value at index - 1 is + * returned. + *

+ * + * @param seq The char[] to search. + * @param index The index into the seq; the code point ending just + * before it is returned. + * @param start The inclusive lower bound in seq below which no unit + * is read. + * @return A Unicode code point. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if index is less than or + * equal to start, index is greater + * than seq.length, or start is + * negative. + * @since 1.5 + */ + public static int codePointBefore(char[] seq, int index, int start) { + if (seq == null) + throw new NullPointerException(); + // start >= seq.length needs no separate test: it cannot hold once + // start < index <= seq.length has been established. + if (index <= start || index > seq.length || start < 0) + throw new IndexOutOfBoundsException(); + + char low = seq[index - 1]; + int before = index - 2; + if (before < start) + return low; + char high = seq[before]; + if (isSurrogatePair(high, low)) + return toCodePoint(high, low); + return low; + } + + /** + *

+ * Converts the Unicode code point, codePoint, into a UTF-16 + * encoded sequence and copies the value(s) into the + * char[] dst, starting at the index + * dstIndex. + *

+ * + * @param codePoint The Unicode code point to encode. + * @param dst The char[] that receives the encoded unit(s). + * @param dstIndex The position in dst of the first unit written. + * @return The number of char units written into + * dst: 2 for a supplementary code point, otherwise 1. + * @throws IllegalArgumentException if codePoint is not a + * valid Unicode code point. + * @throws NullPointerException if dst is null. + * @throws IndexOutOfBoundsException if dstIndex is negative, + * greater than or equal to dst.length or equals + * dst.length - 1 when codePoint is a + * {@link #isSupplementaryCodePoint(int) supplementary code point}. + * @since 1.5 + */ + public static int toChars(int codePoint, char[] dst, int dstIndex) { + if (!isValidCodePoint(codePoint)) + throw new IllegalArgumentException(); + if (dst == null) + throw new NullPointerException(); + if (dstIndex < 0 || dstIndex >= dst.length) + throw new IndexOutOfBoundsException(); + + if (!isSupplementaryCodePoint(codePoint)) { + // BMP code points fit in a single unit. + dst[dstIndex] = (char) codePoint; + return 1; + } + + // A surrogate pair needs room for a second unit. + if (dstIndex == dst.length - 1) + throw new IndexOutOfBoundsException(); + // UTF-16 encoding as described in RFC 2781, Section 2.1 + // (http://www.faqs.org/rfcs/rfc2781.html). + int offset = codePoint - 0x10000; + dst[dstIndex] = (char) (0xD800 | ((offset >> 10) & 0x3FF)); + dst[dstIndex + 1] = (char) (0xDC00 | (offset & 0x3FF)); + return 2; + } + + /** + *

+ * Converts the Unicode code point, codePoint, into a UTF-16 + * encoded sequence that is returned as a char[]. + *

+ * + * @param codePoint The Unicode code point to encode. + * @return A newly allocated char array holding the UTF-16 form: two + * units when the code point is a + * {@link #isSupplementaryCodePoint(int) supplementary code point}, + * otherwise one unit. + * @throws IllegalArgumentException if codePoint is not a + * valid Unicode code point. + * @since 1.5 + */ + public static char[] toChars(int codePoint) { + if (!isValidCodePoint(codePoint)) + throw new IllegalArgumentException(); + + if (!isSupplementaryCodePoint(codePoint)) + return new char[] { (char) codePoint }; + + int offset = codePoint - 0x10000; + char lead = (char) (0xD800 | ((offset >> 10) & 0x3FF)); + char trail = (char) (0xDC00 | (offset & 0x3FF)); + return new char[] { lead, trail }; + } + + /** + *

+ * Counts the number of Unicode code points in the subsequence of the + * CharSequence, as delineated by the + * beginIndex and endIndex. Any surrogate + * values with missing pair values will be counted as 1 code point. + *

+ * + * @param seq The CharSequence to look through. + * @param beginIndex The inclusive index to begin counting at. + * @param endIndex The exclusive index to stop counting at. + * @return The number of Unicode code points. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if beginIndex is + * negative, endIndex is greater than seq.length() or + * beginIndex is greater than endIndex. + * @since 1.5 + */ + public static int codePointCount(CharSequence seq, int beginIndex, + int endIndex) { + if (seq == null) + throw new NullPointerException(); + int len = seq.length(); + if (beginIndex < 0 || endIndex > len || beginIndex > endIndex) + throw new IndexOutOfBoundsException(); + + int result = 0; + for (int i = beginIndex; i < endIndex; i++) { + result++; + // Only a high surrogate directly followed by a low surrogate + // forms a single code point. An unpaired high surrogate must + // not swallow the next unit, which may itself start a pair. + if (isHighSurrogate(seq.charAt(i)) && i + 1 < endIndex + && isLowSurrogate(seq.charAt(i + 1))) + i++; + } + return result; + } + + /** + *

+ * Counts the number of Unicode code points in the subsequence of the + * char[], as delineated by the offset and + * count. Any surrogate values with missing pair values will + * be counted as 1 code point. + *

+ * + * @param seq The char[] to look through. + * @param offset The inclusive index to begin counting at. + * @param count The number of char values to look through in + * seq. + * @return The number of Unicode code points. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if offset or + * count is negative or if offset + count is + * greater than seq.length. + * @since 1.5 + */ + public static int codePointCount(char[] seq, int offset, int count) { + if (seq == null) + throw new NullPointerException(); + int len = seq.length; + // Compare against len - offset so that a huge count cannot overflow + // offset + count and slip past the range check. + if (offset < 0 || count < 0 || count > len - offset) + throw new IndexOutOfBoundsException(); + int endIndex = offset + count; + + int result = 0; + for (int i = offset; i < endIndex; i++) { + result++; + // Only a high surrogate directly followed by a low surrogate + // forms a single code point. An unpaired high surrogate must + // not swallow the next unit, which may itself start a pair. + if (isHighSurrogate(seq[i]) && i + 1 < endIndex + && isLowSurrogate(seq[i + 1])) + i++; + } + return result; + } + + /** + *

+ * Determines the index into the CharSequence that is offset + * (measured in code points and specified by codePointOffset), + * from the index argument. + *

+ * + * @param seq The CharSequence to find the index within. + * @param index The index to begin from, within the + * CharSequence. + * @param codePointOffset The number of code points to look back or + * forwards; may be a negative or positive value. + * @return The calculated index that is codePointOffset code + * points from index. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if index is negative, + * greater than seq.length(), there aren't enough + * values in seq after index or before + * index if codePointOffset is + * negative. + * @since 1.5 + */ + public static int offsetByCodePoints(CharSequence seq, int index, + int codePointOffset) { + if (seq == null) + throw new NullPointerException(); + int len = seq.length(); + if (index < 0 || index > len) + throw new IndexOutOfBoundsException(); + + int i = index; + if (codePointOffset >= 0) { + for (int remaining = codePointOffset; remaining > 0; remaining--) { + if (i >= len) + throw new IndexOutOfBoundsException(); + // A high surrogate followed by a low surrogate is one code + // point that occupies two units. + if (isHighSurrogate(seq.charAt(i)) && i + 1 < len + && isLowSurrogate(seq.charAt(i + 1))) + i++; + i++; + } + return i; + } + + // Count up from the negative offset instead of negating it: + // -Integer.MIN_VALUE overflows back to Integer.MIN_VALUE, which would + // skip the loop and return index unchanged. + for (int remaining = codePointOffset; remaining < 0; remaining++) { + i--; + if (i < 0) + throw new IndexOutOfBoundsException(); + if (isLowSurrogate(seq.charAt(i)) && i - 1 >= 0 + && isHighSurrogate(seq.charAt(i - 1))) + i--; + } + return i; + } + + /** + *

+ * Determines the index into the char[] that is offset + * (measured in code points and specified by codePointOffset), + * from the index argument and is within the subsequence as + * delineated by start and count. + *

+ * + * @param seq The char[] to find the index within. + * @param start The inclusive index that marks the beginning of the + * subsequence. + * @param count The number of char values to include within + * the subsequence. + * @param index The index to begin from, within the char[]. + * @param codePointOffset The number of code points to look back or + * forwards; may be a negative or positive value. + * @return The calculated index that is codePointOffset code + * points from index. + * @throws NullPointerException if seq is null. + * @throws IndexOutOfBoundsException if start or + * count is negative, start + count is + * greater than seq.length, index is + * less than start, index is greater + * than start + count or there aren't enough values + * in seq after index or before + * index if codePointOffset is + * negative. + * @since 1.5 + */ + public static int offsetByCodePoints(char[] seq, int start, int count, + int index, int codePointOffset) { + if (seq == null) + throw new NullPointerException(); + int end = start + count; + if (start < 0 || count < 0 || end > seq.length || index < start + || index > end) + throw new IndexOutOfBoundsException(); + + int i = index; + if (codePointOffset >= 0) { + for (int remaining = codePointOffset; remaining > 0; remaining--) { + if (i >= end) + throw new IndexOutOfBoundsException(); + // A high surrogate followed by a low surrogate is one code + // point that occupies two units. + if (isHighSurrogate(seq[i]) && i + 1 < end + && isLowSurrogate(seq[i + 1])) + i++; + i++; + } + return i; + } + + // Count up from the negative offset instead of negating it: + // -Integer.MIN_VALUE overflows back to Integer.MIN_VALUE, which would + // skip the loop and return index unchanged. + for (int remaining = codePointOffset; remaining < 0; remaining++) { + i--; + if (i < start) + throw new IndexOutOfBoundsException(); + if (isLowSurrogate(seq[i]) && i - 1 >= start + && isHighSurrogate(seq[i - 1])) + i--; + } + return i; + } /** - * Convenience method to determine the value of character c - * in the supplied radix.
The value of radix is must be - * between MIN_RADIX and MAX_RADIX inclusive. - * - * @param c - * the character - * @param radix - * the radix - * @return if radix lies between {@link #MIN_RADIX} and - * {@link #MAX_RADIX} then the value of the character in the radix, - * otherwise -1. - */ + * Convenience method to determine the value of character c + * in the supplied radix. The value of radix is must be + * between MIN_RADIX and MAX_RADIX inclusive. + * + * @param c the character + * @param radix the radix + * @return if radix lies between {@link #MIN_RADIX} and + * {@link #MAX_RADIX} then the value of the character in the radix, + * otherwise -1. + */ public static int digit(char c, int radix) { if (radix >= MIN_RADIX && radix <= MAX_RADIX) { if (c < 128) {