Index: /luni/src/main/java/java/lang/Character.java =================================================================== --- /luni/src/main/java/java/lang/Character.java (revision 383745) +++ /luni/src/main/java/java/lang/Character.java (working copy) @@ -15,7 +15,6 @@ package java.lang; - import java.io.Serializable; import org.apache.harmony.luni.util.BinarySearch; @@ -20,17 +19,44 @@ import org.apache.harmony.luni.util.BinarySearch; - /** - * Characters are objects (i.e. non-base types) which represent char values. It - * also provides a number of methods for the lexicographic categorization of - * char values. + *
+ * Character is the wrapper for the primitive type char. This
+ * class also provides a number of utility methods for working with
+ * chars.
+ *
+ * Character data is based upon the Unicode Standard, 4.0. The Unicode + * specification, character tables and other information is available at http://www.unicode.org/. + *
+ * + *
+ * Unicode characters are referred to as code points. The range of valid
+ * code points is U+0000 to U+10FFFF. The Basic Multilingual Plane (BMP)
+ * is the code point range U+0000 to U+FFFF. Characters above the BMP are
+ * referred to as Supplementary Characters. On the Java platform, UTF-16
+ * encoding and char pairs are used to represent code points in
+ * the supplementary range. A pair of char values that represent
+ * a supplementary character are made up of a high surrogate with a
+ * value range of 0xD800 to 0xDBFF and a low surrogate with a value
+ * range of 0xDC00 to 0xDFFF.
+ *
+ * On the Java platform a char value represents either a single
+ * BMP code point or a UTF-16 unit that's part of a surrogate pair. The
+ * int type is used to represent all Unicode code points.
+ *
+ * Minimum value of a high surrogate or leading surrogate unit in UTF-16
+ * encoding - '\uD800'.
+ *
+ * Maximum value of a high surrogate or leading surrogate unit in UTF-16
+ * encoding - '\uDBFF'.
+ *
+ * Minimum value of a low surrogate or trailing surrogate unit in UTF-16
+ * encoding - '\uDC00'.
+ *
'\uDFFF'.
+ *
+ *
+ * @since 1.5
+ */
+ public static final char MAX_LOW_SURROGATE = '\uDFFF'; // upper bound of the low (trailing) surrogate range 0xDC00-0xDFFF
+
+ /**
+ *
+ * Minimum value of a surrogate unit in UTF-16 encoding - '\uD800'.
+ *
+ * Maximum value of a surrogate unit in UTF-16 encoding - '\uDFFF'.
+ *
+ * Minimum value of a supplementary code point - U+10000.
+ *
+ * Minimum code point value - U+0000.
+ *
+ * Maximum code point value - U+10FFFF.
+ *
+ * Constant for the number of bits to represent a char in
+ * two's complement form.
+ *
+ * Returns a Character instance for the char
+ * value passed. This method is preferred over the constructor, as this
+ * method may maintain a cache of instances.
+ *
+ *
+ * @param c The char value.
+ * @return A Character instance.
+ * @since 1.5
+ */
+ public static Character valueOf(char c) {
+     // Valid cache slots are 0 .. CACHE.length - 1, so a value equal to
+     // CACHE.length must bypass the cache as well. The previous '>' test
+     // let it through and CACHE[c] then threw
+     // ArrayIndexOutOfBoundsException.
+     if (c >= CACHE.length) {
+         return new Character(c);
+     }
+     // Slots are filled lazily; the lock on CACHE serializes the
+     // check-then-store so a slot is only ever populated once.
+     synchronized (CACHE) {
+         Character ch = CACHE[c];
+         if (ch == null) {
+             CACHE[c] = ch = new Character(c);
+         }
+         return ch;
+     }
+ }
+
+ /**
+ *
+ * A test for determining if the codePoint is a valid Unicode
+ * code point.
+ *
+ * A test for determining if the codePoint is within the
+ * supplementary code point range.
+ *
+ * A test for determining if the char is a high
+ * surrogate/leading surrogate unit that's used for representing
+ * supplementary characters in UTF-16 encoding.
+ *
+ * @param ch The char unit to test.
+ * @return A boolean value.
+ * @since 1.5
+ * @see #isLowSurrogate(char)
+ */
+ public static boolean isHighSurrogate(char ch) {
+     // High (leading) surrogates occupy the closed range
+     // [MIN_HIGH_SURROGATE, MAX_HIGH_SURROGATE].
+     if (ch < MIN_HIGH_SURROGATE) {
+         return false;
+     }
+     return ch <= MAX_HIGH_SURROGATE;
+ }
+
+ /**
+ *
+ * A test for determining if the char is a low
+ * surrogate/trailing surrogate unit that's used for representing
+ * supplementary characters in UTF-16 encoding.
+ *
+ * @param ch The char unit to test.
+ * @return A boolean value.
+ * @since 1.5
+ * @see #isHighSurrogate(char)
+ */
+ public static boolean isLowSurrogate(char ch) {
+     // Low (trailing) surrogates occupy the closed range
+     // [MIN_LOW_SURROGATE, MAX_LOW_SURROGATE].
+     if (ch < MIN_LOW_SURROGATE) {
+         return false;
+     }
+     return ch <= MAX_LOW_SURROGATE;
+ }
+
+ /**
+ *
+ * A test for determining if the char pair is a valid
+ * surrogate pair.
+ *
+ * Calculates the number of char values required to represent
+ * the Unicode code point. This method only tests if the
+ * codePoint is greater than or equal to 0x10000,
+ * in which case 2 is returned, otherwise 1.
+ * To test if the code point is valid, use the
+ * {@link #isValidCodePoint(int)} method.
+ *
+ * @param codePoint The code point to examine.
+ * @return An int value of 2 or 1.
+ * @since 1.5
+ * @see #isValidCodePoint(int)
+ * @see #isSupplementaryCodePoint(int)
+ */
+ public static int charCount(int codePoint) {
+     // Anything below 0x10000 (negative values included) is reported as a
+     // single char; no validity check is performed here.
+     if (codePoint < 0x10000) {
+         return 1;
+     }
+     return 2;
+ }
+
+ /**
+ * + * Converts a surrogate pair into a Unicode code point. This method assume + * that the pair are valid surrogates. If the pair are NOT valid surrogates, + * then the result is indeterminate. The + * {@link #isSurrogatePair(char, char)} method should be used prior to this + * method to validate the pair. + *
+ * + * @param high The high surrogate unit. + * @param low The low surrogate unit. + * @return The decoded code point. + * @since 1.5 + * @see #isSurrogatePair(char, char) + */ + public static int toCodePoint(char high, char low) { + // See RFC 2781, Section 2.2 + // http://www.faqs.org/rfcs/rfc2781.html + int h = (high & 0x3FF) << 10; + int l = low & 0x3FF; + return (h | l) + 0x10000; + } + + /** + *
+ * Returns the code point at the index in the CharSequence.
+ * If char unit at the index is a high-surrogate unit, the
+ * next index is less than the length of the sequence and the
+ * char unit at the next index is a low surrogate unit, then
+ * the code point represented by the pair is returned; otherwise the
+ * char unit at the index is returned.
+ *
+ * @param seq The sequence of char units.
+ * @param index The index into the seq to retrieve and
+ * convert.
+ * @return The Unicode code point.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if the index is negative
+ * or greater than or equal to seq.length().
+ * @since 1.5
+ */
+ public static int codePointAt(CharSequence seq, int index) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     final int length = seq.length();
+     if (index < 0 || index >= length) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     final char first = seq.charAt(index);
+     final int nextIndex = index + 1;
+     if (nextIndex < length) {
+         final char second = seq.charAt(nextIndex);
+         if (isSurrogatePair(first, second)) {
+             return toCodePoint(first, second);
+         }
+     }
+     // Last unit of the sequence, or not the start of a valid pair:
+     // hand the unit back unchanged.
+     return first;
+ }
+
+ /**
+ *
+ * Returns the code point at the index in the char[]. If
+ * char unit at the index is a high-surrogate unit, the next
+ * index is less than the length of the sequence and the char
+ * unit at the next index is a low surrogate unit, then the code point
+ * represented by the pair is returned; otherwise the char
+ * unit at the index is returned.
+ *
+ * @param seq The sequence of char units.
+ * @param index The index into the seq to retrieve and
+ * convert.
+ * @return The Unicode code point.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if the index is negative
+ * or greater than or equal to seq.length().
+ * @since 1.5
+ */
+ public static int codePointAt(char[] seq, int index) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     final int length = seq.length;
+     if (index < 0 || index >= length) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     final char first = seq[index];
+     final int nextIndex = index + 1;
+     if (nextIndex < length) {
+         final char second = seq[nextIndex];
+         if (isSurrogatePair(first, second)) {
+             return toCodePoint(first, second);
+         }
+     }
+     // Last unit of the array, or not the start of a valid pair:
+     // hand the unit back unchanged.
+     return first;
+ }
+
+ /**
+ *
+ * Returns the code point at the index in the char[] that's
+ * within the limit. If char unit at the index is a
+ * high-surrogate unit, the next index is less than the limit
+ * and the char unit at the next index is a low surrogate
+ * unit, then the code point represented by the pair is returned; otherwise
+ * the char unit at the index is returned.
+ *
+ * @param seq The sequence of char units.
+ * @param index The index into the seq to retrieve and
+ * convert.
+ * @param limit The exclusive index into the seq that marks
+ * the end of the units that can be used.
+ * @return The Unicode code point.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if the index is
+ * negative, greater than or equal to limit,
+ * limit is negative or limit is
+ * greater than the length of seq.
+ * @since 1.5
+ */
+ public static int codePointAt(char[] seq, int index, int limit) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     final boolean limitOk = limit >= 0 && limit <= seq.length;
+     final boolean indexOk = index >= 0 && index < limit;
+     if (!(limitOk && indexOk)) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     final char first = seq[index];
+     final int nextIndex = index + 1;
+     // Units at or beyond limit must not be consulted, even if the array
+     // itself is longer.
+     if (nextIndex < limit) {
+         final char second = seq[nextIndex];
+         if (isSurrogatePair(first, second)) {
+             return toCodePoint(first, second);
+         }
+     }
+     return first;
+ }
+
+ /**
+ *
+ * Returns the Unicode code point that precedes the index in
+ * the CharSequence. If the char unit at
+ * index - 1 is within the low surrogate range, the value
+ * index - 2 isn't negative and the char unit
+ * at index - 2 is within the high surrogate range, then the
+ * supplementary code point made up of the surrogate pair is returned;
+ * otherwise, the char value at index - 1 is
+ * returned.
+ *
+ * @param seq The CharSequence to search.
+ * @param index The index into the seq.
+ * @return A Unicode code point.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if index is less than 1
+ * or greater than seq.length().
+ * @since 1.5
+ */
+ public static int codePointBefore(CharSequence seq, int index) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     final int length = seq.length();
+     if (index < 1 || index > length) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     final char last = seq.charAt(index - 1);
+     final int prevIndex = index - 2;
+     if (prevIndex >= 0) {
+         final char before = seq.charAt(prevIndex);
+         if (isSurrogatePair(before, last)) {
+             return toCodePoint(before, last);
+         }
+     }
+     // First unit of the sequence, or not the tail of a valid pair:
+     // hand the unit back unchanged.
+     return last;
+ }
+
+ /**
+ *
+ * Returns the Unicode code point that precedes the index in
+ * the char[]. If the char unit at
+ * index - 1 is within the low surrogate range, the value
+ * index - 2 isn't negative and the char unit
+ * at index - 2 is within the high surrogate range, then the
+ * supplementary code point made up of the surrogate pair is returned;
+ * otherwise, the char value at index - 1 is
+ * returned.
+ *
+ * @param seq The char[] to search.
+ * @param index The index into the seq.
+ * @return A Unicode code point.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if index is less than 1
+ * or greater than seq.length.
+ * @since 1.5
+ */
+ public static int codePointBefore(char[] seq, int index) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     final int length = seq.length;
+     if (index < 1 || index > length) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     final char last = seq[index - 1];
+     final int prevIndex = index - 2;
+     if (prevIndex >= 0) {
+         final char before = seq[prevIndex];
+         if (isSurrogatePair(before, last)) {
+             return toCodePoint(before, last);
+         }
+     }
+     // First unit of the array, or not the tail of a valid pair:
+     // hand the unit back unchanged.
+     return last;
+ }
+
+ /**
+ *
+ * Returns the Unicode code point that precedes the index in
+ * the char[] and isn't less than start. If
+ * the char unit at index - 1 is within the
+ * low surrogate range, the value index - 2 isn't less than
+ * start and the char unit at
+ * index - 2 is within the high surrogate range, then the
+ * supplementary code point made up of the surrogate pair is returned;
+ * otherwise, the char value at index - 1 is
+ * returned.
+ *
+ * @param seq The char[] to search.
+ * @param index The index into the seq.
+ * @param start The lowest index into the seq that may be read.
+ * @return A Unicode code point.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if index is less than or
+ * equal to start, index is greater
+ * than seq.length, start is
+ * negative or start is greater than or equal to
+ * seq.length.
+ * @since 1.5
+ */
+ public static int codePointBefore(char[] seq, int index, int start) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     final int length = seq.length;
+     final boolean startOk = start >= 0 && start < length;
+     final boolean indexOk = index > start && index <= length;
+     if (!(startOk && indexOk)) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     final char last = seq[index - 1];
+     final int prevIndex = index - 2;
+     // Units below start must not be consulted, even if they exist.
+     if (prevIndex >= start) {
+         final char before = seq[prevIndex];
+         if (isSurrogatePair(before, last)) {
+             return toCodePoint(before, last);
+         }
+     }
+     return last;
+ }
+
+ /**
+ *
+ * Converts the Unicode code point, codePoint, into a UTF-16
+ * encoded sequence and copies the value(s) into the
+ * char[] dst, starting at the index
+ * dstIndex.
+ *
+ * @param codePoint The Unicode code point to encode.
+ * @param dst The char[] to copy the encoded value into.
+ * @param dstIndex The index to start copying into dst.
+ * @return The number of char value units copied into
+ * dst.
+ * @throws IllegalArgumentException if codePoint is not a
+ * valid Unicode code point.
+ * @throws NullPointerException if dst is null.
+ * @throws IndexOutOfBoundsException if dstIndex is negative,
+ * greater than or equal to dst.length or equals
+ * dst.length - 1 when codePoint is a
+ * {@link #isSupplementaryCodePoint(int) supplementary code point}.
+ * @since 1.5
+ */
+ public static int toChars(int codePoint, char[] dst, int dstIndex) {
+     if (!isValidCodePoint(codePoint)) {
+         throw new IllegalArgumentException();
+     }
+     if (dst == null) {
+         throw new NullPointerException();
+     }
+     final int lastIndex = dst.length - 1;
+     if (dstIndex < 0 || dstIndex > lastIndex) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     if (!isSupplementaryCodePoint(codePoint)) {
+         // BMP code point: a single unit holds it.
+         dst[dstIndex] = (char) codePoint;
+         return 1;
+     }
+
+     // A surrogate pair needs two slots, so starting at the last index
+     // leaves no room for the low unit.
+     if (dstIndex == lastIndex) {
+         throw new IndexOutOfBoundsException();
+     }
+     // See RFC 2781, Section 2.1
+     // http://www.faqs.org/rfcs/rfc2781.html
+     final int offset = codePoint - 0x10000;
+     dst[dstIndex] = (char) (0xD800 | ((offset >> 10) & 0x3FF));
+     dst[dstIndex + 1] = (char) (0xDC00 | (offset & 0x3FF));
+     return 2;
+ }
+
+ /**
+ *
+ * Converts the Unicode code point, codePoint, into a UTF-16
+ * encoded sequence that is returned as a char[].
+ *
+ * @param codePoint The Unicode code point to encode.
+ * @return The UTF-16 encoded char sequence; if code point is
+ * a {@link #isSupplementaryCodePoint(int) supplementary code point},
+ * then a 2 char array is returned, otherwise a 1
+ * char array is returned.
+ * @throws IllegalArgumentException if codePoint is not a
+ * valid Unicode code point.
+ * @since 1.5
+ */
+ public static char[] toChars(int codePoint) {
+     if (!isValidCodePoint(codePoint)) {
+         throw new IllegalArgumentException();
+     }
+
+     if (!isSupplementaryCodePoint(codePoint)) {
+         // BMP code point: a single unit holds it.
+         return new char[] { (char) codePoint };
+     }
+
+     // Split the 20-bit offset above 0x10000 into two 10-bit halves and
+     // tag them with the high and low surrogate prefixes.
+     final int offset = codePoint - 0x10000;
+     final char[] pair = new char[2];
+     pair[0] = (char) (0xD800 | ((offset >> 10) & 0x3FF));
+     pair[1] = (char) (0xDC00 | (offset & 0x3FF));
+     return pair;
+ }
+
+ /**
+ *
+ * Counts the number of Unicode code points in the subsequence of the
+ * CharSequence, as delineated by the
+ * beginIndex and endIndex. Any surrogate
+ * values with missing pair values will be counted as 1 code point.
+ *
+ * @param seq The CharSequence to look through.
+ * @param beginIndex The inclusive index to begin counting at.
+ * @param endIndex The exclusive index to stop counting at.
+ * @return The number of Unicode code points.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if beginIndex is
+ * negative, endIndex is greater than seq.length() or
+ * beginIndex is greater than endIndex.
+ * @since 1.5
+ */
+ public static int codePointCount(CharSequence seq, int beginIndex,
+         int endIndex) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     int len = seq.length();
+     if (beginIndex < 0 || endIndex > len || beginIndex > endIndex) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     int result = 0;
+     for (int i = beginIndex; i < endIndex; i++) {
+         // Every iteration starts on a new code point.
+         result++;
+         // Skip the following unit only when it really completes a
+         // surrogate pair. A unit that follows an unpaired high surrogate
+         // must be examined on its own in the next iteration, because it
+         // may itself start a valid pair (e.g. D800 D800 DC00 is two code
+         // points, not three).
+         if (isHighSurrogate(seq.charAt(i))) {
+             int next = i + 1;
+             if (next < endIndex && isLowSurrogate(seq.charAt(next))) {
+                 i = next;
+             }
+         }
+     }
+     return result;
+ }
+
+ /**
+ *
+ * Counts the number of Unicode code points in the subsequence of the
+ * char[], as delineated by the offset and
+ * count. Any surrogate values with missing pair values will
+ * be counted as 1 code point.
+ *
+ * @param seq The char[] to look through.
+ * @param offset The inclusive index to begin counting at.
+ * @param count The number of char values to look through in
+ * seq.
+ * @return The number of Unicode code points.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if offset or
+ * count is negative or if offset + count is
+ * greater than seq.length.
+ * @since 1.5
+ */
+ public static int codePointCount(char[] seq, int offset, int count) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     int len = seq.length;
+     // Compare count against the remaining room instead of computing
+     // offset + count first: that sum can overflow to a negative value,
+     // which slipped past the old "endIndex > len" test and made the
+     // method return 0 for out-of-range arguments.
+     if (offset < 0 || count < 0 || count > len - offset) {
+         throw new IndexOutOfBoundsException();
+     }
+     int endIndex = offset + count;
+
+     int result = 0;
+     for (int i = offset; i < endIndex; i++) {
+         // Every iteration starts on a new code point.
+         result++;
+         // Skip the following unit only when it really completes a
+         // surrogate pair. A unit that follows an unpaired high surrogate
+         // must be examined on its own in the next iteration, because it
+         // may itself start a valid pair (e.g. D800 D800 DC00 is two code
+         // points, not three).
+         if (isHighSurrogate(seq[i])) {
+             int next = i + 1;
+             if (next < endIndex && isLowSurrogate(seq[next])) {
+                 i = next;
+             }
+         }
+     }
+     return result;
+ }
+
+ /**
+ *
+ * Determines the index into the CharSequence that is offset
+ * (measured in code points and specified by codePointOffset),
+ * from the index argument.
+ *
+ * @param seq The CharSequence to find the index within.
+ * @param index The index to begin from, within the
+ * CharSequence.
+ * @param codePointOffset The number of code points to look back or
+ * forwards; may be a negative or positive value.
+ * @return The calculated index that is codePointOffset code
+ * points from index.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if index is negative,
+ * greater than seq.length(), there aren't enough
+ * values in seq after index or before
+ * index if codePointOffset is
+ * negative.
+ * @since 1.5
+ */
+ public static int offsetByCodePoints(CharSequence seq, int index,
+         int codePointOffset) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     int len = seq.length();
+     if (index < 0 || index > len) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     int i = index;
+
+     if (codePointOffset >= 0) {
+         // Walk forwards one code point per iteration; an offset of zero
+         // skips the loop and returns index unchanged.
+         for (int remaining = codePointOffset; remaining > 0; remaining--) {
+             if (i >= len) {
+                 throw new IndexOutOfBoundsException();
+             }
+             if (isHighSurrogate(seq.charAt(i))) {
+                 int next = i + 1;
+                 if (next < len && isLowSurrogate(seq.charAt(next))) {
+                     // Valid pair: step over both units.
+                     i = next;
+                 }
+             }
+             i++;
+         }
+         return i;
+     }
+
+     // Walk backwards. The counter runs from the negative offset up to zero
+     // rather than from -codePointOffset down: negating Integer.MIN_VALUE
+     // overflows back to Integer.MIN_VALUE, which previously skipped the
+     // loop and returned index instead of throwing.
+     for (int remaining = codePointOffset; remaining < 0; remaining++) {
+         i--;
+         if (i < 0) {
+             throw new IndexOutOfBoundsException();
+         }
+         if (isLowSurrogate(seq.charAt(i))) {
+             int prev = i - 1;
+             if (prev >= 0 && isHighSurrogate(seq.charAt(prev))) {
+                 // Valid pair: land on its high unit.
+                 i = prev;
+             }
+         }
+     }
+     return i;
+ }
+
+ /**
+ *
+ * Determines the index into the char[] that is offset
+ * (measured in code points and specified by codePointOffset),
+ * from the index argument and is within the subsequence as
+ * delineated by start and count.
+ *
+ * @param seq The char[] to find the index within.
+ *
+ * @param index The index to begin from, within the char[].
+ * @param codePointOffset The number of code points to look back or
+ * forwards; may be a negative or positive value.
+ * @param start The inclusive index that marks the beginning of the
+ * subsequence.
+ * @param count The number of char values to include within
+ * the subsequence.
+ * @return The calculated index that is codePointOffset code
+ * points from index.
+ * @throws NullPointerException if seq is null.
+ * @throws IndexOutOfBoundsException if start or
+ * count is negative, start + count
+ * greater than seq.length, index is
+ * less than start, index is greater
+ * than start + count or there aren't enough values
+ * in seq after index or before
+ * index if codePointOffset is
+ * negative.
+ * @since 1.5
+ */
+ public static int offsetByCodePoints(char[] seq, int start, int count,
+         int index, int codePointOffset) {
+     if (seq == null) {
+         throw new NullPointerException();
+     }
+     // count is compared against the remaining room so that the bounds
+     // check never depends on a start + count sum that may have overflowed.
+     if (start < 0 || count < 0 || count > seq.length - start) {
+         throw new IndexOutOfBoundsException();
+     }
+     int end = start + count;
+     if (index < start || index > end) {
+         throw new IndexOutOfBoundsException();
+     }
+
+     int i = index;
+
+     if (codePointOffset >= 0) {
+         // Walk forwards one code point per iteration; an offset of zero
+         // skips the loop and returns index unchanged.
+         for (int remaining = codePointOffset; remaining > 0; remaining--) {
+             if (i >= end) {
+                 throw new IndexOutOfBoundsException();
+             }
+             if (isHighSurrogate(seq[i])) {
+                 int next = i + 1;
+                 if (next < end && isLowSurrogate(seq[next])) {
+                     // Valid pair inside the window: step over both units.
+                     i = next;
+                 }
+             }
+             i++;
+         }
+         return i;
+     }
+
+     // Walk backwards. The counter runs from the negative offset up to zero
+     // rather than from -codePointOffset down: negating Integer.MIN_VALUE
+     // overflows back to Integer.MIN_VALUE, which previously skipped the
+     // loop and returned index instead of throwing.
+     for (int remaining = codePointOffset; remaining < 0; remaining++) {
+         i--;
+         if (i < start) {
+             throw new IndexOutOfBoundsException();
+         }
+         if (isLowSurrogate(seq[i])) {
+             int prev = i - 1;
+             if (prev >= start && isHighSurrogate(seq[prev])) {
+                 // Valid pair inside the window: land on its high unit.
+                 i = prev;
+             }
+         }
+     }
+     return i;
+ }
/**
- * Convenience method to determine the value of character c
- * in the supplied radix. The value of radix is must be
- * between MIN_RADIX and MAX_RADIX inclusive.
- *
- * @param c
- * the character
- * @param radix
- * the radix
- * @return if radix lies between {@link #MIN_RADIX} and
- * {@link #MAX_RADIX} then the value of the character in the radix,
- * otherwise -1.
- */
+ * Convenience method to determine the value of character c
+ * in the supplied radix. The value of radix must be
+ * between MIN_RADIX and MAX_RADIX inclusive.
+ *
+ * @param c the character
+ * @param radix the radix
+ * @return if radix lies between {@link #MIN_RADIX} and
+ * {@link #MAX_RADIX} then the value of the character in the radix,
+ * otherwise -1.
+ */
public static int digit(char c, int radix) {
if (radix >= MIN_RADIX && radix <= MAX_RADIX) {
if (c < 128) {