Index: solr/src/test/org/apache/solr/request/TestFaceting.java
===================================================================
--- solr/src/test/org/apache/solr/request/TestFaceting.java (revision 966583)
+++ solr/src/test/org/apache/solr/request/TestFaceting.java (working copy)
@@ -87,7 +87,7 @@
assertEquals(br != null, rnum < size);
if (rnum < size) {
assertEquals(rnum, te.pos);
- assertEquals(s, te.term().utf8ToString());
+ assertEquals(s, te.term().bocu1ToString());
} else {
assertEquals(null, te.term());
assertEquals(size, te.getTermNumber());
@@ -98,7 +98,7 @@
assertEquals(size>0, te.skipTo(new BytesRef("000")) != null);
assertEquals(0, te.getTermNumber());
if (size>0) {
- assertEquals(t(0), te.term().utf8ToString());
+ assertEquals(t(0), te.term().bocu1ToString());
} else {
assertEquals(null, te.term());
}
@@ -111,7 +111,7 @@
BytesRef br = te.skipTo(rnum);
assertNotNull(br);
assertEquals(rnum, te.pos);
- assertEquals(s, te.term().utf8ToString());
+ assertEquals(s, te.term().bocu1ToString());
}
}
}
Index: solr/src/java/org/apache/solr/schema/TrieDateField.java
===================================================================
--- solr/src/java/org/apache/solr/schema/TrieDateField.java (revision 966583)
+++ solr/src/java/org/apache/solr/schema/TrieDateField.java (working copy)
@@ -131,7 +131,7 @@
// TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts!
BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
NumericUtils.longToPrefixCoded(super.parseMath(null, val).getTime(), 0, bytes);
- return bytes.utf8ToString();
+ return bytes.bocu1ToString();
}
@Override
Index: solr/src/java/org/apache/solr/schema/TrieField.java
===================================================================
--- solr/src/java/org/apache/solr/schema/TrieField.java (revision 966583)
+++ solr/src/java/org/apache/solr/schema/TrieField.java (working copy)
@@ -328,7 +328,7 @@
// TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts!
BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
readableToIndexed(val, bytes);
- return bytes.utf8ToString();
+ return bytes.bocu1ToString();
}
@Override
Index: solr/src/java/org/apache/solr/schema/StrField.java
===================================================================
--- solr/src/java/org/apache/solr/schema/StrField.java (revision 966583)
+++ solr/src/java/org/apache/solr/schema/StrField.java (working copy)
@@ -95,7 +95,7 @@
if (ord == 0) {
return null;
} else {
- return termsIndex.lookup(ord, new BytesRef()).utf8ToString();
+ return termsIndex.lookup(ord, new BytesRef()).bocu1ToString();
}
}
Index: solr/src/java/org/apache/solr/schema/FieldType.java
===================================================================
--- solr/src/java/org/apache/solr/schema/FieldType.java (revision 966583)
+++ solr/src/java/org/apache/solr/schema/FieldType.java (working copy)
@@ -377,7 +377,7 @@
/** Given the readable value, return the term value that will match it. */
public void readableToIndexed(CharSequence val, BytesRef result) {
String internal = readableToIndexed(val.toString());
- UnicodeUtil.UTF16toUTF8(internal, 0, internal.length(), result);
+ UnicodeUtil.UTF16toBOCU1(internal, 0, internal.length(), result);
}
/**
Index: solr/src/java/org/apache/solr/search/ValueSourceParser.java
===================================================================
--- solr/src/java/org/apache/solr/search/ValueSourceParser.java (revision 966583)
+++ solr/src/java/org/apache/solr/search/ValueSourceParser.java (working copy)
@@ -522,7 +522,7 @@
tinfo.indexedField = term.field();
indexedVal = term.text();
}
- UnicodeUtil.UTF16toUTF8(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes);
+ UnicodeUtil.UTF16toBOCU1(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes);
} else {
ft.readableToIndexed(tinfo.val, tinfo.indexedBytes);
}
Index: solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
===================================================================
--- solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java (revision 966583)
+++ solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java (working copy)
@@ -350,7 +350,7 @@
// NOTE: we use c>min rather than c>=min as an optimization because we are going in
// index order, so we already know that the keys are ordered. This can be very
// important if a lot of the counts are repeated (like zero counts would be).
- queue.add(new SimpleFacets.CountPair This class can also be used, to generate lexicographically sortable (according to
- * {@link BytesRef#getUTF8SortedAsUTF16Comparator()}) representations of numeric data
+ * {@link BytesRef#getBOCU1SortedAsUTF16Comparator()}) representations of numeric data
* types for other usages (e.g. sorting).
*
* @lucene.internal
Index: lucene/src/java/org/apache/lucene/util/BytesRef.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 966583)
+++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy)
@@ -79,7 +79,7 @@
}
/**
- * @param text Initialize the byte[] from the UTF8 bytes
+ * @param text Initialize the byte[] from the BOCU1 bytes
* for the provided array. This must be well-formed
* unicode text, with no unpaired surrogates or U+FFFF.
*/
@@ -107,23 +107,23 @@
*/
/**
- * Copies the UTF8 bytes for this string.
+ * Copies the BOCU1 bytes for this string.
*
* @param text Must be well-formed unicode text, with no
* unpaired surrogates or invalid UTF16 code units.
*/
public void copy(CharSequence text) {
- UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this);
+ UnicodeUtil.UTF16toBOCU1(text, 0, text.length(), this);
}
/**
- * Copies the UTF8 bytes for this string.
+ * Copies the BOCU1 bytes for this string.
*
* @param text Must be well-formed unicode text, with no
* unpaired surrogates or invalid UTF16 code units.
*/
public void copy(char text[], int offset, int length) {
- UnicodeUtil.UTF16toUTF8(text, offset, length, this);
+ UnicodeUtil.UTF16toBOCU1(text, offset, length, this);
}
public boolean bytesEquals(BytesRef other) {
if (length == other.length) {
@@ -198,13 +198,19 @@
- /** Interprets stored bytes as UTF8 bytes, returning the
- * resulting string */
+ /** Interprets stored bytes as BOCU1 bytes, returning the
+ * resulting string */
+ public String bocu1ToString() {
+ UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result();
+ UnicodeUtil.BOCU1toUTF16(bytes, offset, length, result);
+ return result.toString();
+ }
+
+ /** @deprecated do your string/byte conversion some other way */
+ @Deprecated
public String utf8ToString() {
try {
return new String(bytes, offset, length, "UTF-8");
- } catch (UnsupportedEncodingException uee) {
- // should not happen -- UTF8 is presumably supported
- // by all JREs
- throw new RuntimeException(uee);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
}
}
@@ -258,15 +264,15 @@
return this.length - other.length;
}
- private final static Comparator
+ * Standalone utility class providing UTF16 character conversions and indexing conversions.
+ *
+ * Code that uses strings alone rarely need modification. By design, UTF-16 does not allow overlap,
+ * so searching for strings is a safe operation. Similarly, concatenation is always safe.
+ * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
+ * values for start and end are on those boundaries, since they arose from operations like
+ * searching. If not, the nearest UTF-32 boundaries can be determined using
+ * The following examples illustrate use of some of these methods.
+ *
+ *
+ * To find the UTF-32 length of a string, use:
+ *
+ *
+ *
+ * @param source Text to analyse
+ * @param offset16 UTF-16 offset &lt;= source text length.
+ * @return UTF-32 offset
+ * @exception IndexOutOfBoundsException If offset16 is out of bounds.
+ * @stable ICU 2.1
+ */
+ public static int findCodePointOffset(String source, int offset16) {
+     final int length = source.length();
+     if (offset16 < 0 || offset16 > length) {
+         throw new StringIndexOutOfBoundsException(offset16);
+     }
+
+     // Walk the code units in front of offset16. A trail surrogate that directly follows a
+     // lead surrogate belongs to the code point already counted for that lead.
+     int codePoints = 0;
+     boolean pendingLead = false;
+     int index = 0;
+     while (index < offset16) {
+         final char unit = source.charAt(index++);
+         if (pendingLead && isTrailSurrogate(unit)) {
+             pendingLead = false;
+             continue;
+         }
+         pendingLead = isLeadSurrogate(unit);
+         codePoints++;
+     }
+
+     // When offset16 lands on the trail half of a pair, step back so the result is the
+     // offset of the pair itself rather than of the code point after it.
+     final boolean splitsPair = offset16 < length && pendingLead
+         && isTrailSurrogate(source.charAt(offset16));
+     return splitsPair ? codePoints - 1 : codePoints;
+ }
+
+ /**
+ * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
+ * offset. Used for random access. See the class description for notes on
+ * roundtripping.
+ * To find the UTF-32 length of a string, use:
+ *
+ * bounds().
+ *
+ * // iteration forwards: Original
+ * for (int i = 0; i < s.length(); ++i) {
+ * char ch = s.charAt(i);
+ * doSomethingWith(ch);
+ * }
+ *
+ * // iteration forwards: Changes for UTF-32
+ * int ch;
+ * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
+ * ch = UTF16.charAt(s, i);
+ * doSomethingWith(ch);
+ * }
+ *
+ * // iteration backwards: Original
+ * for (int i = s.length() - 1; i >= 0; --i) {
+ * char ch = s.charAt(i);
+ * doSomethingWith(ch);
+ * }
+ *
+ * // iteration backwards: Changes for UTF-32
+ * int ch;
+ * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
+ * ch = UTF16.charAt(s, i);
+ * doSomethingWith(ch);
+ * }
+ *
+ *
+ * Notes:
+ *
+ *
+ *
+ * @author Mark Davis, with help from Markus Scherer
+ * @stable ICU 2.1
+ */
+
+public final class UTF16 {
+ // public variables ---------------------------------------------------
+
+ /**
+ * Value returned in Lead
+ * and Trail in the API, which gives a better sense of their ordering in a string.
+ * offset16 and offset32 are used to distinguish offsets to UTF-16
+ * boundaries vs offsets to UTF-32 boundaries. int char32 is used to contain UTF-32
+ * characters, as opposed to char16, which is a UTF-16 code unit. bounds(string, offset16) != TRAIL.
+ * UCharacter.isLegal() can be used to
+ * check for validity if desired.
+ * bounds().
+ * These values are chosen specifically so that it actually represents the position of the
+ * character [offset16 - (value >> 2), offset16 + (value & 3)]
+ *
+ * @stable ICU 2.1
+ */
+ public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
+ TRAIL_SURROGATE_BOUNDARY = 5;
+
+ /**
+ * The lowest Unicode code point value.
+ *
+ * @stable ICU 2.1
+ */
+ public static final int CODEPOINT_MIN_VALUE = 0;
+
+ /**
+ * The highest Unicode code point value (scalar value) according to the Unicode Standard.
+ *
+ * @stable ICU 2.1
+ */
+ public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
+
+ /**
+ * The minimum value for Supplementary code points
+ *
+ * @stable ICU 2.1
+ */
+ public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
+
+ /**
+ * Lead surrogate minimum value
+ *
+ * @stable ICU 2.1
+ */
+ public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
+
+ /**
+ * Trail surrogate minimum value
+ *
+ * @stable ICU 2.1
+ */
+ public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
+
+ /**
+ * Lead surrogate maximum value
+ *
+ * @stable ICU 2.1
+ */
+ public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
+
+ /**
+ * Trail surrogate maximum value
+ *
+ * @stable ICU 2.1
+ */
+ public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
+
+ /**
+ * Surrogate minimum value
+ *
+ * @stable ICU 2.1
+ */
+ public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
+
+ /**
+ * Maximum surrogate value
+ *
+ * @stable ICU 2.1
+ */
+ public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
+
+ /**
+ * Lead surrogate bitmask. ANDing a code unit with this value clears its low 10 bits;
+ * isLeadSurrogate compares the masked value against LEAD_SURROGATE_BITS.
+ */
+ private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
+
+ /**
+ * Trail surrogate bitmask. ANDing a code unit with this value clears its low 10 bits;
+ * isTrailSurrogate compares the masked value against TRAIL_SURROGATE_BITS.
+ */
+ private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
+
+ /**
+ * Surrogate bitmask. ANDing a code unit with this value clears its low 11 bits, so lead and
+ * trail surrogates alike reduce to SURROGATE_BITS (see isSurrogate).
+ */
+ private static final int SURROGATE_BITMASK = 0xFFFFF800;
+
+ /**
+ * Lead surrogate bits: the masked value shared by the code units 0xD800..0xDBFF.
+ */
+ private static final int LEAD_SURROGATE_BITS = 0xD800;
+
+ /**
+ * Trail surrogate bits: the masked value shared by the code units 0xDC00..0xDFFF.
+ */
+ private static final int TRAIL_SURROGATE_BITS = 0xDC00;
+
+ /**
+ * Surrogate bits: the masked value shared by the code units 0xD800..0xDFFF.
+ */
+ private static final int SURROGATE_BITS = 0xD800;
+
+ // constructor --------------------------------------------------------
+
+ // /CLOVER:OFF
+ /**
+ * Prevent instance from being created. The class is declared final and the members shown
+ * here are all static, so the constructor is private and intentionally empty.
+ */
+ private UTF16() {
+ }
+
+ // /CLOVER:ON
+ // public method ------------------------------------------------------
+
+ /**
+ * Determines how many chars this char32 requires. If a validity check is required, use
+ * isLegal()
+ * on char32 before calling.
+ *
+ * @param char32 The input codepoint.
+ * @return 2 if is in supplementary space, otherwise 1.
+ * @stable ICU 2.1
+ */
+ public static int getCharCount(int char32) {
+     // Anything below the supplementary range fits in one UTF-16 code unit; no validity
+     // check is made on char32 here.
+     return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
+ }
+
+ /**
+ * Returns the type of the boundaries around the char at offset16. Used for random access.
+ *
+ * @param source Text to analyse
+ * @param offset16 UTF-16 offset
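+ * @return
+ * <ul>
+ * <li>SINGLE_CHAR_BOUNDARY: a single char; the bounds are [offset16, offset16 + 1]</li>
+ * <li>LEAD_SURROGATE_BOUNDARY: a surrogate pair starting at offset16; the bounds are
+ * [offset16, offset16 + 2]</li>
+ * <li>TRAIL_SURROGATE_BOUNDARY: a surrogate pair starting at offset16 - 1; the bounds are
+ * [offset16 - 1, offset16 + 1]</li>
+ * </ul>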
+ * For bit-twiddlers, the return values for these are chosen so that the boundaries
+ * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
+ * @exception IndexOutOfBoundsException If offset16 is out of bounds.
+ * @stable ICU 2.1
+ */
+ public static int bounds(String source, int offset16) {
+     final char unit = source.charAt(offset16);
+     if (!isSurrogate(unit)) {
+         return SINGLE_CHAR_BOUNDARY;
+     }
+     if (isLeadSurrogate(unit)) {
+         // A lead surrogate only forms a pair when a trail surrogate follows it.
+         final int next = offset16 + 1;
+         final boolean followedByTrail = next < source.length()
+             && isTrailSurrogate(source.charAt(next));
+         return followedByTrail ? LEAD_SURROGATE_BOUNDARY : SINGLE_CHAR_BOUNDARY;
+     }
+     // Otherwise unit is a trail surrogate: it pairs only with a lead directly before it.
+     final int previous = offset16 - 1;
+     final boolean precededByLead = previous >= 0
+         && isLeadSurrogate(source.charAt(previous));
+     return precededByLead ? TRAIL_SURROGATE_BOUNDARY : SINGLE_CHAR_BOUNDARY;
+ }
+
+ /**
+ * Returns the type of the boundaries around the char at offset16. Used for random access.
+ *
+ * @param source String buffer to analyse
+ * @param offset16 UTF16 offset
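+ * @return
+ * <ul>
+ * <li>SINGLE_CHAR_BOUNDARY: a single char; the bounds are [offset16, offset16 + 1]</li>
+ * <li>LEAD_SURROGATE_BOUNDARY: a surrogate pair starting at offset16; the bounds are
+ * [offset16, offset16 + 2]</li>
+ * <li>TRAIL_SURROGATE_BOUNDARY: a surrogate pair starting at offset16 - 1; the bounds are
+ * [offset16 - 1, offset16 + 1]</li>
+ * </ul>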
+ * For bit-twiddlers, the return values for these are chosen so that the boundaries
+ * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
+ * @exception IndexOutOfBoundsException If offset16 is out of bounds.
+ * @stable ICU 2.1
+ */
+ public static int bounds(StringBuffer source, int offset16) {
+     final char unit = source.charAt(offset16);
+     if (!isSurrogate(unit)) {
+         return SINGLE_CHAR_BOUNDARY;
+     }
+     if (isLeadSurrogate(unit)) {
+         // A lead surrogate only forms a pair when a trail surrogate follows it.
+         final int next = offset16 + 1;
+         final boolean followedByTrail = next < source.length()
+             && isTrailSurrogate(source.charAt(next));
+         return followedByTrail ? LEAD_SURROGATE_BOUNDARY : SINGLE_CHAR_BOUNDARY;
+     }
+     // Otherwise unit is a trail surrogate: it pairs only with a lead directly before it.
+     final int previous = offset16 - 1;
+     final boolean precededByLead = previous >= 0
+         && isLeadSurrogate(source.charAt(previous));
+     return precededByLead ? TRAIL_SURROGATE_BOUNDARY : SINGLE_CHAR_BOUNDARY;
+ }
+
+ /**
+ * Returns the type of the boundaries around the char at offset16. Used for random access. Note
+ * that the boundaries are determined with respect to the subarray, hence the char array
+ * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
+ *
+ * @param source Char array to analyse
+ * @param start Offset to substring in the source array for analyzing
+ * @param limit Offset to substring in the source array for analyzing
+ * @param offset16 UTF16 offset relative to start
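+ * @return
+ * <ul>
+ * <li>SINGLE_CHAR_BOUNDARY: a single char; the bounds are [offset16, offset16 + 1]</li>
+ * <li>LEAD_SURROGATE_BOUNDARY: a surrogate pair starting at offset16 whose trail lies
+ * before limit; the bounds are [offset16, offset16 + 2]</li>
+ * <li>TRAIL_SURROGATE_BOUNDARY: a surrogate pair starting at offset16 - 1 whose lead lies
+ * at or after start; the bounds are [offset16 - 1, offset16 + 1]</li>
+ * </ul>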
+ * For bit-twiddlers, the boundary values for these are chosen so that the boundaries
+ * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)].
+ * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
+ * @stable ICU 2.1
+ */
+ public static int bounds(char source[], int start, int limit, int offset16) {
+     // offset16 is relative to start; translate it to an absolute array index first.
+     final int index = offset16 + start;
+     if (index < start || index >= limit) {
+         throw new ArrayIndexOutOfBoundsException(index);
+     }
+     final char unit = source[index];
+     if (!isSurrogate(unit)) {
+         return SINGLE_CHAR_BOUNDARY;
+     }
+     if (isLeadSurrogate(unit)) {
+         // The trail half must also lie inside the [start, limit) window to count as a pair.
+         final int next = index + 1;
+         final boolean followedByTrail = next < limit && isTrailSurrogate(source[next]);
+         return followedByTrail ? LEAD_SURROGATE_BOUNDARY : SINGLE_CHAR_BOUNDARY;
+     }
+     // unit is a trail surrogate: the lead half must not lie before start.
+     final int previous = index - 1;
+     final boolean precededByLead = previous >= start && isLeadSurrogate(source[previous]);
+     return precededByLead ? TRAIL_SURROGATE_BOUNDARY : SINGLE_CHAR_BOUNDARY;
+ }
+
+ /**
+ * Determines whether the code value is a surrogate.
+ *
+ * @param char16 The input character.
+ * @return true If the input character is a surrogate.
+ * @stable ICU 2.1
+ */
+ public static boolean isSurrogate(char char16) {
+     // Masking away the low bits maps every surrogate code unit onto SURROGATE_BITS.
+     final int highBits = char16 & SURROGATE_BITMASK;
+     return highBits == SURROGATE_BITS;
+ }
+
+ /**
+ * Determines whether the character is a trail surrogate.
+ *
+ * @param char16 The input character.
+ * @return true If the input character is a trail surrogate.
+ * @stable ICU 2.1
+ */
+ public static boolean isTrailSurrogate(char char16) {
+     // Masking away the low bits maps every trail surrogate onto TRAIL_SURROGATE_BITS.
+     final int highBits = char16 & TRAIL_SURROGATE_BITMASK;
+     return highBits == TRAIL_SURROGATE_BITS;
+ }
+
+ /**
+ * Determines whether the character is a lead surrogate.
+ *
+ * @param char16 The input character.
+ * @return true If the input character is a lead surrogate
+ * @stable ICU 2.1
+ */
+ public static boolean isLeadSurrogate(char char16) {
+     // Masking away the low bits maps every lead surrogate onto LEAD_SURROGATE_BITS.
+     final int highBits = char16 & LEAD_SURROGATE_BITMASK;
+     return highBits == LEAD_SURROGATE_BITS;
+ }
+
+ /**
+ * Returns the lead surrogate. If a validity check is required, use
+ * isLegal() on char32
+ * before calling.
+ *
+ * @param char32 The input character.
+ * @return lead surrogate if the getCharCount(ch) is 2;
+ * and 0 otherwise (note: 0 is not a valid lead surrogate).
+ * @stable ICU 2.1
+ */
+ public static char getLeadSurrogate(int char32) {
+     // Code points below the supplementary range have no lead surrogate; 0 signals that.
+     if (char32 < SUPPLEMENTARY_MIN_VALUE) {
+         return 0;
+     }
+     final int lead = LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_);
+     return (char) lead;
+ }
+
+ /**
+ * Returns the trail surrogate. If a validity check is required, use
+ * isLegal() on char32
+ * before calling.
+ *
+ * @param char32 The input character.
+ * @return the trail surrogate if the getCharCount(ch) is 2;
+ * otherwise the character itself
+ * @stable ICU 2.1
+ */
+ public static char getTrailSurrogate(int char32) {
+     // Below the supplementary range the code point is returned as-is, narrowed to a char.
+     if (char32 < SUPPLEMENTARY_MIN_VALUE) {
+         return (char) char32;
+     }
+     final int trail = TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_);
+     return (char) trail;
+ }
+
+ /**
+ * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
+ * containing the UTF-32 value in UTF16 format. If a validity check is required, use isLegal() on char32 before calling.
+ *
+ * @param char32 The input character.
+ * @return string value of char32 in UTF16 format
+ * @exception IllegalArgumentException Thrown if char32 is an invalid codepoint.
+ * @stable ICU 2.1
+ */
+ public static String valueOf(int char32) {
+     // Only values inside the Unicode code point range are accepted.
+     final boolean inRange = char32 >= CODEPOINT_MIN_VALUE && char32 <= CODEPOINT_MAX_VALUE;
+     if (!inRange) {
+         throw new IllegalArgumentException("Illegal codepoint");
+     }
+     return toString(char32);
+ }
+
+ /**
+ * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
+ * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
+ * character, the whole supplementary codepoint will be returned. If a validity check is
+ * required, use isLegal() on the
+ * codepoint at offset16 before calling. The result returned will be a newly created String
+ * obtained by calling source.substring(..) with the appropriate indexes.
+ *
+ * @param source The input string.
+ * @param offset16 The UTF16 index to the codepoint in source
+ * @return string value of char32 in UTF16 format
+ * @stable ICU 2.1
+ */
+ public static String valueOf(String source, int offset16) {
+ switch (bounds(source, offset16)) {
+ case LEAD_SURROGATE_BOUNDARY:
+ return source.substring(offset16, offset16 + 2);
+ case TRAIL_SURROGATE_BOUNDARY:
+ return source.substring(offset16 - 1, offset16 + 1);
+ default:
+ return source.substring(offset16, offset16 + 1);
+ }
+ }
+
+ /**
+ * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
+ * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
+ * surrogate character, the whole supplementary codepoint will be returned. If a validity check
+ * is required, use isLegal() on
+ * the codepoint at offset16 before calling. The result returned will be a newly created String
+ * obtained by calling source.substring(..) with the appropriate indexes.
+ *
+ * @param source The input string buffer.
+ * @param offset16 The UTF16 index to the codepoint in source
+ * @return string value of char32 in UTF16 format
+ * @stable ICU 2.1
+ */
+ public static String valueOf(StringBuffer source, int offset16) {
+ switch (bounds(source, offset16)) {
+ case LEAD_SURROGATE_BOUNDARY:
+ return source.substring(offset16, offset16 + 2);
+ case TRAIL_SURROGATE_BOUNDARY:
+ return source.substring(offset16 - 1, offset16 + 1);
+ default:
+ return source.substring(offset16, offset16 + 1);
+ }
+ }
+
+ /**
+ * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
+ * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
+ * returned, except when either the leading or trailing surrogate character lies out of the
+ * specified subarray. In the latter case, only the surrogate character within bounds will be
+ * returned. If a validity check is required, use isLegal() on the codepoint at
+ * offset16 before calling. The result returned will be a newly created String containing the
+ * relevant characters.
+ *
+ * @param source The input char array.
+ * @param start Start index of the subarray
+ * @param limit End index of the subarray
+ * @param offset16 The UTF16 index to the codepoint in source relative to start
+ * @return string value of char32 in UTF16 format
+ * @stable ICU 2.1
+ */
+ public static String valueOf(char source[], int start, int limit, int offset16) {
+ switch (bounds(source, start, limit, offset16)) {
+ case LEAD_SURROGATE_BOUNDARY:
+ return new String(source, start + offset16, 2);
+ case TRAIL_SURROGATE_BOUNDARY:
+ return new String(source, start + offset16 - 1, 2);
+ }
+ return new String(source, start + offset16, 1);
+ }
+
+ /**
+ * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
+ * the class description for notes on roundtripping.
+ *
+ * @param source The UTF-16 string
+ * @param offset32 UTF-32 offset
+ * @return UTF-16 offset
+ * @exception IndexOutOfBoundsException If offset32 is out of bounds.
+ * @stable ICU 2.1
+ */
+ public static int findOffsetFromCodePoint(String source, int offset32) {
+     final int length = source.length();
+     if (offset32 < 0 || offset32 > length) {
+         throw new StringIndexOutOfBoundsException(offset32);
+     }
+     int position = 0;
+     int remaining = offset32;
+     // Consume one code point per iteration: two chars for a well-formed pair, else one.
+     for (; position < length && remaining > 0; remaining--) {
+         final boolean pair = isLeadSurrogate(source.charAt(position))
+             && (position + 1) < length
+             && isTrailSurrogate(source.charAt(position + 1));
+         position += pair ? 2 : 1;
+     }
+     // The text ran out before offset32 code points were consumed.
+     if (remaining != 0) {
+         throw new StringIndexOutOfBoundsException(offset32);
+     }
+     return position;
+ }
+
+ /**
+ * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
+ * the class description for notes on roundtripping.
+ *
+ * @param source The UTF-16 string buffer
+ * @param offset32 UTF-32 offset
+ * @return UTF-16 offset
+ * @exception IndexOutOfBoundsException If offset32 is out of bounds.
+ * @stable ICU 2.1
+ */
+ public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
+     final int length = source.length();
+     if (offset32 < 0 || offset32 > length) {
+         throw new StringIndexOutOfBoundsException(offset32);
+     }
+     int position = 0;
+     int remaining = offset32;
+     // Consume one code point per iteration: two chars for a well-formed pair, else one.
+     for (; position < length && remaining > 0; remaining--) {
+         final boolean pair = isLeadSurrogate(source.charAt(position))
+             && (position + 1) < length
+             && isTrailSurrogate(source.charAt(position + 1));
+         position += pair ? 2 : 1;
+     }
+     // The text ran out before offset32 code points were consumed.
+     if (remaining != 0) {
+         throw new StringIndexOutOfBoundsException(offset32);
+     }
+     return position;
+ }
+
+ /**
+ * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
+ * the class description for notes on roundtripping.
+ *
+ * @param source The UTF-16 char array whose substring is to be analysed
+ * @param start Offset of the substring to be analysed
+ * @param limit Offset of the substring to be analysed
+ * @param offset32 UTF-32 offset relative to start
+ * @return UTF-16 offset relative to start
+ * @exception IndexOutOfBoundsException If offset32 is out of bounds.
+ * @stable ICU 2.1
+ */
+ public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
+     if (offset32 > limit - start) {
+         throw new ArrayIndexOutOfBoundsException(offset32);
+     }
+     int position = start;
+     int remaining = offset32;
+     // Consume one code point per iteration: two chars for a pair that lies entirely
+     // before limit, otherwise one.
+     for (; position < limit && remaining > 0; remaining--) {
+         final boolean pair = isLeadSurrogate(source[position])
+             && (position + 1) < limit
+             && isTrailSurrogate(source[position + 1]);
+         position += pair ? 2 : 1;
+     }
+     // Non-zero here means the subarray ran out first, or offset32 was negative (the loop
+     // above never runs for a negative count).
+     if (remaining != 0) {
+         throw new ArrayIndexOutOfBoundsException(offset32);
+     }
+     // Report the offset relative to start, matching how offset32 was given.
+     return position - start;
+ }
+
+ /**
+ * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
+ * UTF-16 offset. Used for random access. See the class description for
+ * notes on roundtripping.
+ * Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
+ * of the lead of the pair is returned.
+ *
+ * len32 = countCodePoint(source, source.length());
+ *
+ *
+ *
+ * Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
+ * of the lead of the pair is returned.
+ *
+ * len32 = countCodePoint(source);
+ *
+ *
+ *
+ *
+ * @param source Text to analyse
+ * @param offset16 UTF-16 offset &lt;= source text length.
+ * @return UTF-32 offset
+ * @exception IndexOutOfBoundsException If offset16 is out of bounds.
+ * @stable ICU 2.1
+ */
+ public static int findCodePointOffset(StringBuffer source, int offset16) {
+     final int length = source.length();
+     if (offset16 < 0 || offset16 > length) {
+         throw new StringIndexOutOfBoundsException(offset16);
+     }
+
+     // Walk the code units in front of offset16. A trail surrogate that directly follows a
+     // lead surrogate belongs to the code point already counted for that lead.
+     int codePoints = 0;
+     boolean pendingLead = false;
+     int index = 0;
+     while (index < offset16) {
+         final char unit = source.charAt(index++);
+         if (pendingLead && isTrailSurrogate(unit)) {
+             pendingLead = false;
+             continue;
+         }
+         pendingLead = isLeadSurrogate(unit);
+         codePoints++;
+     }
+
+     // When offset16 lands on the trail half of a pair, step back so the result is the
+     // offset of the pair itself rather than of the code point after it.
+     final boolean splitsPair = offset16 < length && pendingLead
+         && isTrailSurrogate(source.charAt(offset16));
+     return splitsPair ? codePoints - 1 : codePoints;
+ }
+
+ /**
+ * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
+ * offset. Used for random access. See the class description for notes on
+ * roundtripping.
+ * Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
+ * of the lead of the pair is returned.
+ *
+ * To find the UTF-32 length of a substring, use: + * + *
+ * len32 = countCodePoint(source, start, limit); + *+ * + * + *
+ * + * @param source Text to analyse + * @param start Offset of the substring + * @param limit Offset of the substring + * @param offset16 UTF-16 relative to start + * @return UTF-32 offset relative to start + * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. + * @stable ICU 2.1 + */ + public static int findCodePointOffset(char source[], int start, int limit, int offset16) { + offset16 += start; + if (offset16 > limit) { + throw new StringIndexOutOfBoundsException(offset16); + } + + int result = 0; + char ch; + boolean hadLeadSurrogate = false; + + for (int i = start; i < offset16; ++i) { + ch = source[i]; + if (hadLeadSurrogate && isTrailSurrogate(ch)) { + hadLeadSurrogate = false; // count valid trail as zero + } else { + hadLeadSurrogate = isLeadSurrogate(ch); + ++result; // count others as 1 + } + } + + if (offset16 == limit) { + return result; + } + + // end of source being the less significant surrogate character + // shift result back to the start of the supplementary character + if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) { + result--; + } + + return result; + } + + /** + * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required, + * use isLegal() on char32 before + * calling. + * + * @param target The buffer to append to + * @param char32 Value to append. 
+ * @return the updated StringBuffer + * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints + * @stable ICU 2.1 + */ + public static StringBuffer append(StringBuffer target, int char32) { + // Check for irregular values + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); + } + + // Write the UTF-16 values + if (char32 >= SUPPLEMENTARY_MIN_VALUE) { + target.append(getLeadSurrogate(char32)); + target.append(getTrailSurrogate(char32)); + } else { + target.append((char) char32); + } + return target; + } + + /** + * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a + * convenience. + * + * @param target The buffer to append to + * @param cp The code point to append + * @return the updated StringBuffer + * @throws IllegalArgumentException If cp is not a valid code point + * @stable ICU 3.0 + */ + public static StringBuffer appendCodePoint(StringBuffer target, int cp) { + return append(target, cp); + } + + /** + * Adds a codepoint to offset16 position of the argument char array. + * + * @param target Char array to be append with the new code point + * @param limit UTF16 offset which the codepoint will be appended. + * @param char32 Code point to be appended + * @return offset after char32 in the array. + * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not + * lie within the range of the Unicode codepoints. 
+ * @stable ICU 2.1 + */ + public static int append(char[] target, int limit, int char32) { + // Check for irregular values + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Illegal codepoint"); + } + // Write the UTF-16 values + if (char32 >= SUPPLEMENTARY_MIN_VALUE) { + target[limit++] = getLeadSurrogate(char32); + target[limit++] = getTrailSurrogate(char32); + } else { + target[limit++] = (char) char32; + } + return limit; + } + + /** + * Number of codepoints in a UTF16 String + * + * @param source UTF16 string + * @return number of codepoint in string + * @stable ICU 2.1 + */ + public static int countCodePoint(String source) { + if (source == null || source.length() == 0) { + return 0; + } + return findCodePointOffset(source, source.length()); + } + + /** + * Number of codepoints in a UTF16 String buffer + * + * @param source UTF16 string buffer + * @return number of codepoint in string + * @stable ICU 2.1 + */ + public static int countCodePoint(StringBuffer source) { + if (source == null || source.length() == 0) { + return 0; + } + return findCodePointOffset(source, source.length()); + } + + /** + * Number of codepoints in a UTF16 char array substring + * + * @param source UTF16 char array + * @param start Offset of the substring + * @param limit Offset of the substring + * @return number of codepoint in the substring + * @exception IndexOutOfBoundsException If start and limit are not valid. + * @stable ICU 2.1 + */ + public static int countCodePoint(char source[], int start, int limit) { + if (source == null || source.length == 0) { + return 0; + } + return findCodePointOffset(source, start, limit, limit - start); + } + + /** + * Set a code point into a UTF16 position. Adjusts target according if we are replacing a + * non-supplementary codepoint with a supplementary and vice versa. 
+ * + * @param target Stringbuffer + * @param offset16 UTF16 position to insert into + * @param char32 Code point + * @stable ICU 2.1 + */ + public static void setCharAt(StringBuffer target, int offset16, int char32) { + int count = 1; + char single = target.charAt(offset16); + + if (isSurrogate(single)) { + // pairs of the surrogate with offset16 at the lead char found + if (isLeadSurrogate(single) && (target.length() > offset16 + 1) + && isTrailSurrogate(target.charAt(offset16 + 1))) { + count++; + } else { + // pairs of the surrogate with offset16 at the trail char + // found + if (isTrailSurrogate(single) && (offset16 > 0) + && isLeadSurrogate(target.charAt(offset16 - 1))) { + offset16--; + count++; + } + } + } + target.replace(offset16, offset16 + count, valueOf(char32)); + } + + /** + * Set a code point into a UTF16 position in a char array. Adjusts target according if we are + * replacing a non-supplementary codepoint with a supplementary and vice versa. + * + * @param target char array + * @param limit numbers of valid chars in target, different from target.length. limit counts the + * number of chars in target that represents a string, not the size of array target. 
+ * @param offset16 UTF16 position to insert into + * @param char32 code point + * @return new number of chars in target that represents a string + * @exception IndexOutOfBoundsException if offset16 is out of range + * @stable ICU 2.1 + */ + public static int setCharAt(char target[], int limit, int offset16, int char32) { + if (offset16 >= limit) { + throw new ArrayIndexOutOfBoundsException(offset16); + } + int count = 1; + char single = target[offset16]; + + if (isSurrogate(single)) { + // pairs of the surrogate with offset16 at the lead char found + if (isLeadSurrogate(single) && (target.length > offset16 + 1) + && isTrailSurrogate(target[offset16 + 1])) { + count++; + } else { + // pairs of the surrogate with offset16 at the trail char + // found + if (isTrailSurrogate(single) && (offset16 > 0) + && isLeadSurrogate(target[offset16 - 1])) { + offset16--; + count++; + } + } + } + + String str = valueOf(char32); + int result = limit; + int strlength = str.length(); + target[offset16] = str.charAt(0); + if (count == strlength) { + if (count == 2) { + target[offset16 + 1] = str.charAt(1); + } + } else { + // this is not exact match in space, we'll have to do some + // shifting + System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit + - (offset16 + count)); + if (count < strlength) { + // char32 is a supplementary character trying to squeeze into + // a non-supplementary space + target[offset16 + 1] = str.charAt(1); + result++; + if (result < target.length) { + target[result] = 0; + } + } else { + // char32 is a non-supplementary character trying to fill + // into a supplementary space + result--; + target[result] = 0; + } + } + return result; + } + + /** + * Shifts offset16 by the argument number of codepoints + * + * @param source string + * @param offset16 UTF16 position to shift + * @param shift32 number of codepoints to shift + * @return new shifted offset16 + * @exception IndexOutOfBoundsException if the new offset16 is out of bounds. 
+ * @stable ICU 2.1 + */ + public static int moveCodePointOffset(String source, int offset16, int shift32) { + int result = offset16; + int size = source.length(); + int count; + char ch; + if (offset16 < 0 || offset16 > size) { + throw new StringIndexOutOfBoundsException(offset16); + } + if (shift32 > 0) { + if (shift32 + offset16 > size) { + throw new StringIndexOutOfBoundsException(offset16); + } + count = shift32; + while (result < size && count > 0) { + ch = source.charAt(result); + if (isLeadSurrogate(ch) && ((result + 1) < size) + && isTrailSurrogate(source.charAt(result + 1))) { + result++; + } + count--; + result++; + } + } else { + if (offset16 + shift32 < 0) { + throw new StringIndexOutOfBoundsException(offset16); + } + for (count = -shift32; count > 0; count--) { + result--; + if (result < 0) { + break; + } + ch = source.charAt(result); + if (isTrailSurrogate(ch) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + result--; + } + } + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(shift32); + } + return result; + } + + /** + * Shifts offset16 by the argument number of codepoints + * + * @param source String buffer + * @param offset16 UTF16 position to shift + * @param shift32 Number of codepoints to shift + * @return new shifted offset16 + * @exception IndexOutOfBoundsException If the new offset16 is out of bounds. 
+ * @stable ICU 2.1 + */ + public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) { + int result = offset16; + int size = source.length(); + int count; + char ch; + if (offset16 < 0 || offset16 > size) { + throw new StringIndexOutOfBoundsException(offset16); + } + if (shift32 > 0) { + if (shift32 + offset16 > size) { + throw new StringIndexOutOfBoundsException(offset16); + } + count = shift32; + while (result < size && count > 0) { + ch = source.charAt(result); + if (isLeadSurrogate(ch) && ((result + 1) < size) + && isTrailSurrogate(source.charAt(result + 1))) { + result++; + } + count--; + result++; + } + } else { + if (offset16 + shift32 < 0) { + throw new StringIndexOutOfBoundsException(offset16); + } + for (count = -shift32; count > 0; count--) { + result--; + if (result < 0) { + break; + } + ch = source.charAt(result); + if (isTrailSurrogate(ch) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + result--; + } + } + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(shift32); + } + return result; + } + + /** + * Shifts offset16 by the argument number of codepoints within a subarray. + * + * @param source Char array + * @param start Position of the subarray to be performed on + * @param limit Position of the subarray to be performed on + * @param offset16 UTF16 position to shift relative to start + * @param shift32 Number of codepoints to shift + * @return new shifted offset16 relative to start + * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the + * subarray bounds are out of range. 
+ * @stable ICU 2.1 + */ + public static int moveCodePointOffset(char source[], int start, int limit, int offset16, + int shift32) { + int size = source.length; + int count; + char ch; + int result = offset16 + start; + if (start < 0 || limit < start) { + throw new StringIndexOutOfBoundsException(start); + } + if (limit > size) { + throw new StringIndexOutOfBoundsException(limit); + } + if (offset16 < 0 || result > limit) { + throw new StringIndexOutOfBoundsException(offset16); + } + if (shift32 > 0) { + if (shift32 + result > size) { + throw new StringIndexOutOfBoundsException(result); + } + count = shift32; + while (result < limit && count > 0) { + ch = source[result]; + if (isLeadSurrogate(ch) && (result + 1 < limit) + && isTrailSurrogate(source[result + 1])) { + result++; + } + count--; + result++; + } + } else { + if (result + shift32 < start) { + throw new StringIndexOutOfBoundsException(result); + } + for (count = -shift32; count > 0; count--) { + result--; + if (result < start) { + break; + } + ch = source[result]; + if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { + result--; + } + } + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(shift32); + } + result -= start; + return result; + } + + /** + * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the + * middle of a supplementary codepoint, char32 will be inserted after the supplementary + * codepoint. The length of target increases by one if codepoint is non-supplementary, 2 + * otherwise. + *
+ * The overall effect is exactly as if the argument were converted to a string by the method + * valueOf(char) and the characters in that string were then inserted into target at the + * position indicated by offset16. + *
+ *+ * The offset argument must be greater than or equal to 0, and less than or equal to the length + * of source. + * + * @param target String buffer to insert to + * @param offset16 Offset which char32 will be inserted in + * @param char32 Codepoint to be inserted + * @return a reference to target + * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. + * @stable ICU 2.1 + */ + public static StringBuffer insert(StringBuffer target, int offset16, int char32) { + String str = valueOf(char32); + if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) { + offset16++; + } + target.insert(offset16, str); + return target; + } + + /** + * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the + * middle of a supplementary codepoint, char32 will be inserted after the supplementary + * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise. + *
+ * The overall effect is exactly as if the argument were converted to a string by the method + * valueOf(char) and the characters in that string were then inserted into target at the + * position indicated by offset16. + *
+ *
+ * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
+ *
+ * @param target Char array to insert to
+ * @param limit End index of the char array, limit <= target.length
+ * @param offset16 Offset which char32 will be inserted in
+ * @param char32 Codepoint to be inserted
+ * @return new limit size
+ * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
+ * @stable ICU 2.1
+ */
+ public static int insert(char target[], int limit, int offset16, int char32) {
+     // UTF-16 form of the code point: one char, or a lead/trail pair
+     final String encoded = valueOf(char32);
+     // never split an existing pair: step past the trail surrogate
+     if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
+         offset16++;
+     }
+     final int width = encoded.length();
+     if (limit + width > target.length) {
+         throw new ArrayIndexOutOfBoundsException(offset16 + width);
+     }
+     // open a gap of 'width' chars, then drop the encoded code point into it
+     System.arraycopy(target, offset16, target, offset16 + width, limit - offset16);
+     encoded.getChars(0, width, target, offset16);
+     return limit + width;
+ }
+
+ /**
+  * Deletes the whole code point found at offset16 from target. One char is removed for a
+  * non-supplementary code point, two for a surrogate pair, regardless of whether offset16
+  * points at the lead or at the trail char of the pair.
+  *
+  * @param target String buffer to remove codepoint from
+  * @param offset16 Offset which the codepoint will be removed
+  * @return a reference to target
+  * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
+  * @stable ICU 2.1
+  */
+ public static StringBuffer delete(StringBuffer target, int offset16) {
+     final int boundary = bounds(target, offset16);
+     int from = offset16;
+     int width = 1;
+     if (boundary == LEAD_SURROGATE_BOUNDARY) {
+         // on the lead char: the pair starts here
+         width = 2;
+     } else if (boundary == TRAIL_SURROGATE_BOUNDARY) {
+         // on the trail char: the pair starts one char earlier
+         from = offset16 - 1;
+         width = 2;
+     }
+     target.delete(from, from + width);
+     return target;
+ }
+
+ /**
+  * Deletes the whole code point found at offset16 from the char array. The chars that follow
+  * are shifted down and the first slot freed at the end of the string is set to 0. One char is
+  * removed for a non-supplementary code point, two for a surrogate pair.
+  *
+  * @param target Char array to remove codepoint from
+  * @param limit End index of the char array, limit <= target.length
+  * @param offset16 Offset which the codepoint will be removed
+  * @return a new limit size
+  * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
+  * @stable ICU 2.1
+  */
+ public static int delete(char target[], int limit, int offset16) {
+     final int boundary = bounds(target, 0, limit, offset16);
+     int from = offset16;
+     int width = 1;
+     if (boundary == LEAD_SURROGATE_BOUNDARY) {
+         // on the lead char: the pair starts here
+         width = 2;
+     } else if (boundary == TRAIL_SURROGATE_BOUNDARY) {
+         // on the trail char: the pair starts one char earlier
+         from = offset16 - 1;
+         width = 2;
+     }
+     final int tailStart = from + width;
+     final int newLimit = limit - width;
+     System.arraycopy(target, tailStart, target, from, limit - tailStart);
+     target[newLimit] = 0;
+     return newLimit;
+ }
+
+ /**
+ * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
+ * the argument codepoint. I.e., the smallest index i such that
+ * UTF16.charAt(source, i) ==
+ * char32 is true.
+ *
+ * If no such character occurs in this string, then -1 is returned. + *
+ *
+ * Examples:
+ * UTF16.indexOf("abc", 'a') returns 0
+ * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3
+ * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1
+ *
+ * If no such string str occurs in this source, then -1 is returned. + *
+ *
+ * Examples:
+ * UTF16.indexOf("abc", "ab") returns 0
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1
+ *
+ * If no such character occurs in this string, then -1 is returned. + *
+ *
+ * Examples:
+ * UTF16.indexOf("abc", 'a', 1) returns -1
+ * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3
+ * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1
+ *
+ * If no such string str occurs in this source, then -1 is returned. + *
+ *
+ * Examples:
+ * UTF16.indexOf("abc", "ab", 0) returns 0
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1
+ *
+ * Examples:
+ * UTF16.lastIndexOf("abc", 'a') returns 0
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1
+ *
+ * source is searched backwards starting at the last character. + *
+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param char32 Codepoint to search for + * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint + * does not occur. + * @stable ICU 2.6 + */ + public static int lastIndexOf(String source, int char32) { + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); + } + // non-surrogate bmp + if (char32 < LEAD_SURROGATE_MIN_VALUE + || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { + return source.lastIndexOf((char) char32); + } + // surrogate + if (char32 < SUPPLEMENTARY_MIN_VALUE) { + int result = source.lastIndexOf((char) char32); + if (result >= 0) { + if (isLeadSurrogate((char) char32) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + 1))) { + return lastIndexOf(source, char32, result - 1); + } + // trail surrogate + if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { + return lastIndexOf(source, char32, result - 1); + } + } + return result; + } + // supplementary + String char32str = toString(char32); + return source.lastIndexOf(char32str); + } + + /** + * Returns the index within the argument UTF16 format Unicode string of the last occurrence of + * the argument string str. This method is implemented based on codepoints, hence a "lead + * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str + * starts with trail surrogate character at index 0, a source with a leading a surrogate + * character before str found at in source will not have a valid match. Vice versa for lead + * surrogates that ends str. See example below. + *
+ * Examples:
+ * UTF16.lastIndexOf("abc", "a") returns 0
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1
+ *
+ * source is searched backwards starting at the last character. + *
+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param str UTF16 format Unicode string to search for + * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint + * does not occur. + * @stable ICU 2.6 + */ + public static int lastIndexOf(String source, String str) { + int strLength = str.length(); + // non-surrogate ends + if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { + return source.lastIndexOf(str); + } + + int result = source.lastIndexOf(str); + if (result >= 0) { + // check last character + if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + strLength + 1))) { + return lastIndexOf(source, str, result - 1); + } + // check first character which is a trail surrogate + if (isTrailSurrogate(str.charAt(0)) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + return lastIndexOf(source, str, result - 1); + } + } + return result; + } + + /** + *+ * Returns the index within the argument UTF16 format Unicode string of the last occurrence of + * the argument codepoint, where the result is less than or equals to fromIndex. + *
+ *+ * This method is implemented based on codepoints, hence a single surrogate character will not + * match a supplementary character. + *
+ *+ * source is searched backwards starting at the last character starting at the specified index. + *
+ *
+ * Examples:
+ * UTF16.lastIndexOf("abc", 'c', 2) returns 2
+ * UTF16.lastIndexOf("abc", 'c', 1) returns -1
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1
+ *
+ * Returns the index within the argument UTF16 format Unicode string of the last occurrence of + * the argument string str, where the result is less than or equals to fromIndex. + *
+ *+ * This method is implemented based on codepoints, hence a "lead surrogate character + trail + * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate + * character at index 0, a source with a leading a surrogate character before str found at in + * source will not have a valid match. Vice versa for lead surrogates that ends str. + *
+ * See example below. + *
+ * Examples:
+ * UTF16.lastIndexOf("abc", "c", 2) returns 2
+ * UTF16.lastIndexOf("abc", "c", 1) returns -1
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1
+ *
+ * source is searched backwards starting at the last character. + *
+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param str UTF16 format Unicode string to search for + * @param fromIndex the index to start the search from. There is no restriction on the value of + * fromIndex. If it is greater than or equal to the length of this string, it has the + * same effect as if it were equal to one less than the length of this string: this + * entire string may be searched. If it is negative, it has the same effect as if it + * were -1: -1 is returned. + * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint + * does not occur. + * @stable ICU 2.6 + */ + public static int lastIndexOf(String source, String str, int fromIndex) { + int strLength = str.length(); + // non-surrogate ends + if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { + return source.lastIndexOf(str, fromIndex); + } + + int result = source.lastIndexOf(str, fromIndex); + if (result >= 0) { + // check last character + if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + strLength))) { + return lastIndexOf(source, str, result - 1); + } + // check first character which is a trail surrogate + if (isTrailSurrogate(str.charAt(0)) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + return lastIndexOf(source, str, result - 1); + } + } + return result; + } + + /** + * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of + * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16 + * format Unicode string source, then source will be returned. 
Otherwise, a new String object is + * created that represents a codepoint sequence identical to the codepoint sequence represented + * by source, except that every occurrence of oldChar32 is replaced by an occurrence of + * newChar32. + *
+ * Examples:
+ * UTF16.replace("mesquite in your cellar", 'e', 'o');
+ * returns "mosquito in your collar"
+ * UTF16.replace("JonL", 'q', 'x');
+ * returns "JonL" (no change)
+ * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!');
+ * returns "Supplementary character !"
+ * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!');
+ * returns "Supplementary character \ud800\udc00"
+ *
+ * Examples:
+ * UTF16.replace("mesquite in your cellar", "e", "o");
+ * returns "mosquito in your collar"
+ * UTF16.replace("mesquite in your cellar", "mesquite", "cat");
+ * returns "cat in your cellar"
+ * UTF16.replace("JonL", "q", "x");
+ * returns "JonL" (no change)
+ * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!');
+ * returns "Supplementary character !"
+ * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!');
+ * returns "Supplementary character \ud800\udc00"
+ *
+ * Examples:
+ * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))
+ * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
+ *
+ * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
+ * @return a new StringBuffer holding the reversed UTF16 format Unicode string; source itself is not modified.
+ * @stable ICU 2.6
+ */
+ public static StringBuffer reverse(StringBuffer source) {
+     final int total = source.length();
+     final StringBuffer reversed = new StringBuffer(total);
+     // walk backwards; a well-formed lead+trail pair is copied as a unit so it stays in order
+     int pos = total - 1;
+     while (pos >= 0) {
+         final char current = source.charAt(pos);
+         final boolean endsPair = isTrailSurrogate(current) && pos > 0
+                 && isLeadSurrogate(source.charAt(pos - 1));
+         if (endsPair) {
+             reversed.append(source.charAt(pos - 1));
+             reversed.append(current);
+             pos -= 2;
+         } else {
+             reversed.append(current);
+             pos--;
+         }
+     }
+     return reversed;
+ }
+
+ /**
+  * Check if the string contains more Unicode code points than a certain number. This is more
+  * efficient than counting all code points in the entire string and comparing that number with a
+  * threshold. This function may not need to scan the string at all if the length is within a
+  * certain range, and never needs to count more than 'number + 1' code points. Logically
+  * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two
+  * code units.
+  *
+  * @param source The input string; null is treated as containing no code points.
+  * @param number The number of code points in the string is compared against the 'number'
+  *            parameter. A negative number always yields true.
+  * @return boolean value for whether the string contains more Unicode code points than 'number'.
+  * @stable ICU 2.4
+  */
+ public static boolean hasMoreCodePointsThan(String source, int number) {
+     if (number < 0) {
+         return true;
+     }
+     if (source == null) {
+         return false;
+     }
+     int length = source.length();
+
+     // length >= 0 known
+     // source contains at least (length + 1) / 2 code points: <= 2
+     // chars per cp
+     if (((length + 1) >> 1) > number) {
+         return true;
+     }
+
+     // check if source does not even contain enough chars
+     int maxsupplementary = length - number;
+     if (maxsupplementary <= 0) {
+         return false;
+     }
+
+     // there are maxsupplementary = length - number more chars than
+     // asked-for code points, so length > number >= 0 from here on and
+     // the string cannot be empty
+
+     // count code points until they exceed and also check that there are
+     // no more than maxsupplementary supplementary code points (char pairs)
+     int start = 0;
+     while (number > 0) {
+         if (isLeadSurrogate(source.charAt(start++)) && start != length
+                 && isTrailSurrogate(source.charAt(start))) {
+             start++;
+             if (--maxsupplementary <= 0) {
+                 // too many pairs - too few code points
+                 return false;
+             }
+         }
+         --number;
+     }
+     // 'number' code points were consumed and chars are still left over
+     return true;
+ }
+
+ /**
+  * Check if the sub-range of char array, from argument start to limit, contains more Unicode
+  * code points than a certain number. This is more efficient than counting all code points in
+  * the entire char array range and comparing that number with a threshold. This function may not
+  * need to scan the char array at all if start and limit is within a certain range, and never
+  * needs to count more than 'number + 1' code points. Logically equivalent to
+  * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one
+  * or two code units.
+  *
+  * @param source Array of UTF-16 chars; null is treated as containing no code points.
+  * @param start Offset to substring in the source array for analyzing
+  * @param limit Offset to substring in the source array for analyzing
+  * @param number The number of code points in the string is compared against the 'number'
+  *            parameter. A negative number always yields true.
+  * @return boolean value for whether the string contains more Unicode code points than 'number'.
+  * @exception IndexOutOfBoundsException Thrown when start or limit is negative, or when
+  *                limit < start. The indexes are validated before any other argument.
+  * @stable ICU 2.4
+  */
+ public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
+     int length = limit - start;
+     if (length < 0 || start < 0 || limit < 0) {
+         throw new IndexOutOfBoundsException(
+                 "Start and limit indexes should be non-negative and start <= limit");
+     }
+     if (number < 0) {
+         return true;
+     }
+     if (source == null) {
+         return false;
+     }
+
+     // length >= 0 known
+     // source contains at least (length + 1) / 2 code points: <= 2
+     // chars per cp
+     if (((length + 1) >> 1) > number) {
+         return true;
+     }
+
+     // check if source does not even contain enough chars
+     int maxsupplementary = length - number;
+     if (maxsupplementary <= 0) {
+         return false;
+     }
+
+     // there are maxsupplementary = length - number more chars than
+     // asked-for code points, so length > number >= 0 from here on and
+     // the range cannot be empty
+
+     // count code points until they exceed and also check that there are
+     // no more than maxsupplementary supplementary code points (char pairs)
+     while (number > 0) {
+         if (isLeadSurrogate(source[start++]) && start != limit
+                 && isTrailSurrogate(source[start])) {
+             start++;
+             if (--maxsupplementary <= 0) {
+                 // too many pairs - too few code points
+                 return false;
+             }
+         }
+         --number;
+     }
+     // 'number' code points were consumed and chars are still left over
+     return true;
+ }
+
+ /**
+  * Check if the string buffer contains more Unicode code points than a certain number. This is
+  * more efficient than counting all code points in the entire string buffer and comparing that
+  * number with a threshold. This function may not need to scan the string buffer at all if the
+  * length is within a certain range, and never needs to count more than 'number + 1' code
+  * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy
+  * either one or two code units.
+  *
+  * @param source The input string buffer; null is treated as containing no code points.
+  * @param number The number of code points in the string buffer is compared against the 'number'
+  *            parameter. A negative number always yields true.
+  * @return boolean value for whether the string buffer contains more Unicode code points than
+  *         'number'.
+  * @stable ICU 2.4
+  */
+ public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
+     if (number < 0) {
+         return true;
+     }
+     if (source == null) {
+         return false;
+     }
+     int length = source.length();
+
+     // length >= 0 known
+     // source contains at least (length + 1) / 2 code points: <= 2
+     // chars per cp
+     if (((length + 1) >> 1) > number) {
+         return true;
+     }
+
+     // check if source does not even contain enough chars
+     int maxsupplementary = length - number;
+     if (maxsupplementary <= 0) {
+         return false;
+     }
+
+     // there are maxsupplementary = length - number more chars than
+     // asked-for code points, so length > number >= 0 from here on and
+     // the buffer cannot be empty
+
+     // count code points until they exceed and also check that there are
+     // no more than maxsupplementary supplementary code points (char pairs)
+     int start = 0;
+     while (number > 0) {
+         if (isLeadSurrogate(source.charAt(start++)) && start != length
+                 && isTrailSurrogate(source.charAt(start))) {
+             start++;
+             if (--maxsupplementary <= 0) {
+                 // too many pairs - too few code points
+                 return false;
+             }
+         }
+         --number;
+     }
+     // 'number' code points were consumed and chars are still left over
+     return true;
+ }
+
+ /**
+  * Cover JDK 1.5 API. Create a String from an array of codePoints.
+  *
+  * @param codePoints The code array
+  * @param offset The start of the text in the code point array
+  * @param count The number of code points
+  * @return a String representing the code points between offset and count
+  * @throws IllegalArgumentException If count is negative or an invalid code point is encountered
+  * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
+  * @stable ICU 3.0
+  */
+ public static String newString(int[] codePoints, int offset, int count) {
+     if (count < 0) {
+         throw new IllegalArgumentException("count must not be negative: " + count);
+     }
+     // Start with one char per code point; grown below once supplementary code points
+     // (two chars each) no longer fit.
+     char[] chars = new char[count];
+     int w = 0;
+     for (int r = offset, e = offset + count; r < e; ++r) {
+         int cp = codePoints[r];
+         if (cp < 0 || cp > 0x10ffff) {
+             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(cp));
+         }
+         int needed = (cp < 0x010000) ? 1 : 2;
+         if (w + needed > chars.length) {
+             // Check capacity explicitly instead of catching IndexOutOfBoundsException.
+             // The new size extrapolates from the chars-per-code-point ratio seen so far;
+             // since codePoints.length >= r - offset + 1 it is always at least w + 2.
+             int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
+                     / (r - offset + 1)));
+             char[] temp = new char[newlen];
+             System.arraycopy(chars, 0, temp, 0, w);
+             chars = temp;
+         }
+         if (needed == 1) {
+             chars[w++] = (char) cp;
+         } else {
+             chars[w++] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
+             chars[w++] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
+         }
+     }
+     return new String(chars, 0, w);
+ }
+
+ // private data members -------------------------------------------------
+
+ /**
+ * Number of bits a supplementary code point is shifted right by when its lead surrogate is
+ * computed (see newString: LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)).
+ */
+ private static final int LEAD_SURROGATE_SHIFT_ = 10;
+
+ /**
+ * Mask selecting the low bits of a code point that are stored in its trail surrogate
+ * (see newString: TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)).
+ */
+ private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
+
+ /**
+ * Constant added to (cp >> LEAD_SURROGATE_SHIFT_) to obtain the lead surrogate of a
+ * supplementary code point. It is LEAD_SURROGATE_MIN_VALUE pre-adjusted by the shifted
+ * SUPPLEMENTARY_MIN_VALUE, so SUPPLEMENTARY_MIN_VALUE itself maps to LEAD_SURROGATE_MIN_VALUE.
+ */
+ private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
+ - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
+
+ // private methods ------------------------------------------------------
+
+ /**
+ *
+ * Converts argument code point and returns a String object representing the code point's value + * in UTF16 format. + *
+ *+ * This method does not check for the validity of the codepoint, the results are not guaranteed + * if a invalid codepoint is passed as argument. + *
+ *+ * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise. + *
+ * + * @param ch + * code point + * @return string representation of the code point + */ + private static String toString(int ch) { + if (ch < SUPPLEMENTARY_MIN_VALUE) { + return String.valueOf((char) ch); + } + + StringBuilder result = new StringBuilder(); + result.append(getLeadSurrogate(ch)); + result.append(getTrailSurrogate(ch)); + return result.toString(); + } +} +// eof Property changes on: lucene\src\java\com\ibm\icu\charset\UTF16.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/com/ibm/icu/charset/UConverterAlias.java =================================================================== --- lucene/src/java/com/ibm/icu/charset/UConverterAlias.java (revision 0) +++ lucene/src/java/com/ibm/icu/charset/UConverterAlias.java (revision 0) @@ -0,0 +1,769 @@ +/** +******************************************************************************* +* Copyright (C) 2006-2010, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.charset; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; + +final class UConverterAlias { + static final int UNNORMALIZED = 0; + + static final int STD_NORMALIZED = 1; + + static final int AMBIGUOUS_ALIAS_MAP_BIT = 0x8000; + + static final int CONTAINS_OPTION_BIT = 0x4000; + + static final int CONVERTER_INDEX_MASK = 0xFFF; + + static final int NUM_RESERVED_TAGS = 2; + + static final int NUM_HIDDEN_TAGS = 1; + + static int[] gConverterList = null; + + static int[] gTagList = null; + + static int[] gAliasList = null; + + static int[] gUntaggedConvArray = null; + + static int[] gTaggedAliasArray = null; + + static int[] gTaggedAliasLists = null; + + static int[] gOptionTable = null; + + static byte[] gStringTable = null; + + static byte[] gNormalizedStringTable = null; + + static final String GET_STRING(int idx) { + return new String(gStringTable, 2 * idx, strlen(gStringTable, 2 * idx)); + } + + private static final String GET_NORMALIZED_STRING(int idx) { + return new String(gNormalizedStringTable, 2 * idx, strlen(gNormalizedStringTable, 2 * idx)); + } + + public static final int strlen(byte[] sArray, int sBegin) + { + int i = sBegin; + while(i < sArray.length && sArray[i++] != 0) {} + return i - sBegin - 1; + } + + /*private*/ static final int tocLengthIndex = 0; + + private static final int converterListIndex = 1; + + private static final int tagListIndex = 2; + + private static final int aliasListIndex = 3; + + private static final int untaggedConvArrayIndex = 4; + + private static final int taggedAliasArrayIndex = 5; + + private static final int taggedAliasListsIndex = 6; + + private static final int optionTableIndex = 7; + + private static final int stringTableIndex = 8; + + private static final 
int normalizedStringTableIndex = 9; + + private static final int minTocLength = 9; /* + * min. tocLength in the file, + * does not count the + * tocLengthIndex! + */ + + private static final int offsetsCount = minTocLength + 1; /* + * length of the + * swapper's + * temporary + * offsets[] + */ + + static ByteBuffer gAliasData = null; + + private static final boolean isAlias(String alias) { + if (alias == null) { + throw new IllegalArgumentException("Alias param is null!"); + } + return (alias.length() != 0); + } + + private static final String CNVALIAS_DATA_FILE_NAME = "/cnvalias.icu"; + + /** + * Default buffer size of datafile + */ + private static final int CNVALIAS_DATA_BUFFER_SIZE = 25000; + + private static final synchronized boolean haveAliasData() + throws IOException{ + return true; + } + + // U_CFUNC const char * io_getConverterName(const char *alias, UErrorCode + // *pErrorCode) +// public static final String io_getConverterName(String alias) +// throws IOException{ +// if (haveAliasData() && isAlias(alias)) { +// boolean[] isAmbigous = new boolean[1]; +// int convNum = findConverter(alias, isAmbigous); +// if (convNum < gConverterList.length) { +// return GET_STRING(gConverterList[(int) convNum]); +// } +// /* else converter not found */ +// } +// return null; +// } + + /* + * search for an alias return the converter number index for gConverterList + */ + // static U_INLINE uint32_t findConverter(const char *alias, UErrorCode + // *pErrorCode) + private static final int findConverter(String alias, boolean[] isAmbigous) { + int mid, start, limit; + int lastMid; + int result; + StringBuilder strippedName = new StringBuilder(); + String aliasToCompare; + + stripForCompare(strippedName, alias); + alias = strippedName.toString(); + + /* do a binary search for the alias */ + start = 0; + limit = gUntaggedConvArray.length; + mid = limit; + lastMid = Integer.MAX_VALUE; + + for (;;) { + mid = (start + limit) / 2; + if (lastMid == mid) { /* Have we moved? 
*/ + break; /* We haven't moved, and it wasn't found. */ + } + lastMid = mid; + aliasToCompare = GET_NORMALIZED_STRING(gAliasList[mid]); + result = alias.compareTo(aliasToCompare); + + if (result < 0) { + limit = mid; + } else if (result > 0) { + start = mid; + } else { + /* + * Since the gencnval tool folds duplicates into one entry, this + * alias in gAliasList is unique, but different standards may + * map an alias to different converters. + */ + if ((gUntaggedConvArray[mid] & AMBIGUOUS_ALIAS_MAP_BIT) != 0) { + isAmbigous[0]=true; + } + /* State whether the canonical converter name contains an option. + This information is contained in this list in order to maintain backward & forward compatibility. */ + /*if (containsOption) { + UBool containsCnvOptionInfo = (UBool)gMainTable.optionTable->containsCnvOptionInfo; + *containsOption = (UBool)((containsCnvOptionInfo + && ((gMainTable.untaggedConvArray[mid] & UCNV_CONTAINS_OPTION_BIT) != 0)) + || !containsCnvOptionInfo); + }*/ + return gUntaggedConvArray[mid] & CONVERTER_INDEX_MASK; + } + } + return Integer.MAX_VALUE; + } + + /** + * stripForCompare Remove the underscores, dashes and spaces from + * the name, and convert the name to lower case. + * + * @param dst The destination buffer, which is <= the buffer of name. + * @param name The alias to strip + * @return the destination buffer. 
+ */ + public static final StringBuilder stripForCompare(StringBuilder dst, String name) { + return io_stripASCIIForCompare(dst, name); + } + + // enum { + private static final byte IGNORE = 0; + private static final byte ZERO = 1; + private static final byte NONZERO = 2; + static final byte MINLETTER = 3; /* any values from here on are lowercase letter mappings */ + // } + + /* character types for ASCII 00..7F */ + static final byte asciiTypes[] = new byte[] { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0, + 0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0, + 0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0 + }; + + private static final char GET_CHAR_TYPE(char c) { + return (char)((c < asciiTypes.length) ? 
asciiTypes[c] : (char)IGNORE); + } + + /** @see UConverterAlias#compareNames */ + private static final StringBuilder io_stripASCIIForCompare(StringBuilder dst, String name) { + int nameIndex = 0; + char type, nextType; + char c1; + boolean afterDigit = false; + + while (nameIndex < name.length()) { + c1 = name.charAt(nameIndex++); + type = GET_CHAR_TYPE(c1); + switch (type) { + case IGNORE: + afterDigit = false; + continue; /* ignore all but letters and digits */ + case ZERO: + if (!afterDigit && nameIndex < name.length()) { + nextType = GET_CHAR_TYPE(name.charAt(nameIndex)); + if (nextType == ZERO || nextType == NONZERO) { + continue; /* ignore leading zero before another digit */ + } + } + break; + case NONZERO: + afterDigit = true; + break; + default: + c1 = type; /* lowercased letter */ + afterDigit = false; + break; + } + dst.append(c1); + } + return dst; + } + + /** + * Do a fuzzy compare of a two converter/alias names. The comparison is + * case-insensitive. It also ignores the characters '-', '_', and ' ' (dash, + * underscore, and space). Thus the strings "UTF-8", "utf_8", and "Utf 8" + * are exactly equivalent. + * + * This is a symmetrical (commutative) operation; order of arguments is + * insignificant. This is an important property for sorting the list (when + * the list is preprocessed into binary form) and for performing binary + * searches on it at run time. + * + * @param name1 + * a converter name or alias, zero-terminated + * @param name2 + * a converter name or alias, zero-terminated + * @return 0 if the names match, or a negative value if the name1 lexically + * precedes name2, or a positive value if the name1 lexically + * follows name2. 
+     *
+     * @see UConverterAlias#stripForCompare
+     */
+    static int compareNames(String name1, String name2){
+        int name1Index = 0, name2Index = 0;
+        boolean afterDigit1 = false, afterDigit2 = false;
+
+        for (;;) {
+            /*
+             * 0 means "this name has no more significant characters". Both
+             * values are reset on every pass so that a character delivered in
+             * an earlier pass is never compared a second time once its name
+             * is exhausted. A delivered character is always a digit or a
+             * lowercased letter, so it can never be 0 itself.
+             */
+            char c1 = 0;
+            char c2 = 0;
+
+            /* fetch the next significant character of name1 */
+            while (name1Index < name1.length()) {
+                char current = name1.charAt(name1Index++);
+                char type = GET_CHAR_TYPE(current);
+                switch (type) {
+                case IGNORE:
+                    afterDigit1 = false;
+                    continue; /* ignore all but letters and digits */
+                case ZERO:
+                    if (!afterDigit1 && name1Index < name1.length()) {
+                        char nextType = GET_CHAR_TYPE(name1.charAt(name1Index));
+                        if (nextType == ZERO || nextType == NONZERO) {
+                            continue; /* ignore leading zero before another digit */
+                        }
+                    }
+                    break;
+                case NONZERO:
+                    afterDigit1 = true;
+                    break;
+                default:
+                    current = type; /* lowercased letter */
+                    afterDigit1 = false;
+                    break;
+                }
+                c1 = current; /* deliver c1 */
+                break;
+            }
+
+            /* fetch the next significant character of name2 */
+            while (name2Index < name2.length()) {
+                char current = name2.charAt(name2Index++);
+                char type = GET_CHAR_TYPE(current);
+                switch (type) {
+                case IGNORE:
+                    afterDigit2 = false;
+                    continue; /* ignore all but letters and digits */
+                case ZERO:
+                    /* the look-ahead reads name2, so it must be bounded by name2's own length */
+                    if (!afterDigit2 && name2Index < name2.length()) {
+                        char nextType = GET_CHAR_TYPE(name2.charAt(name2Index));
+                        if (nextType == ZERO || nextType == NONZERO) {
+                            continue; /* ignore leading zero before another digit */
+                        }
+                    }
+                    break;
+                case NONZERO:
+                    afterDigit2 = true;
+                    break;
+                default:
+                    current = type; /* lowercased letter */
+                    afterDigit2 = false;
+                    break;
+                }
+                c2 = current; /* deliver c2 */
+                break;
+            }
+
+            /* If neither name delivered another character then they match */
+            if (c1 == 0 && c2 == 0) {
+                return 0;
+            }
+
+            /* Case-insensitive comparison; an exhausted name (0) sorts before any character */
+            int rc = (int)c1 - (int)c2;
+            if (rc != 0) {
+                return rc;
+            }
+        }
+    }
+
+    static int io_countAliases(String alias)
+                        throws IOException{
+        if (haveAliasData() && isAlias(alias)) {
+            boolean[] isAmbigous = new boolean[1];
+            int convNum = findConverter(alias, isAmbigous);
+            if (convNum < gConverterList.length) {
+                /*
tagListNum - 1 is the ALL tag */ + int listOffset = gTaggedAliasArray[(gTagList.length - 1) + * gConverterList.length + convNum]; + + if (listOffset != 0) { + return gTaggedAliasLists[listOffset]; + } + /* else this shouldn't happen. internal program error */ + } + /* else converter not found */ + } + return 0; + } + + /** + * Return the number of all aliases (and converter names). + * + * @return the number of all aliases + */ + // U_CFUNC uint16_t io_countTotalAliases(UErrorCode *pErrorCode); +// static int io_countTotalAliases() throws IOException{ +// if (haveAliasData()) { +// return (int) gAliasList.length; +// } +// return 0; +// } + + // U_CFUNC const char * io_getAlias(const char *alias, uint16_t n, + // UErrorCode *pErrorCode) + static String io_getAlias(String alias, int n) throws IOException{ + if (haveAliasData() && isAlias(alias)) { + boolean[] isAmbigous = new boolean[1]; + int convNum = findConverter(alias,isAmbigous); + if (convNum < gConverterList.length) { + /* tagListNum - 1 is the ALL tag */ + int listOffset = gTaggedAliasArray[(gTagList.length - 1) + * gConverterList.length + convNum]; + + if (listOffset != 0) { + //int listCount = gTaggedAliasListsArray[listOffset]; + /* +1 to skip listCount */ + int[] currListArray = gTaggedAliasLists; + int currListArrayIndex = listOffset + 1; + + return GET_STRING(currListArray[currListArrayIndex + n]); + + } + /* else this shouldn't happen. 
internal program error */
+            }
+            /* else converter not found */
+        }
+        return null;
+    }
+
+    // U_CFUNC uint16_t io_countStandards(UErrorCode *pErrorCode) {
+//    static int io_countStandards() throws IOException{
+//        if (haveAliasData()) {
+//            return (int) (gTagList.length - NUM_HIDDEN_TAGS);
+//        }
+//        return 0;
+//    }
+
+    // U_CAPI const char * U_EXPORT2getStandard(uint16_t n, UErrorCode
+    // *pErrorCode)
+//    static String getStandard(int n) throws IOException{
+//        if (haveAliasData()) {
+//            return GET_STRING(gTagList[n]);
+//        }
+//        return null;
+//    }
+
+    // U_CAPI const char * U_EXPORT2 getStandardName(const char *alias, const
+    // char *standard, UErrorCode *pErrorCode)
+    /**
+     * Looks up the name listed first for a converter under one standard.
+     *
+     * @param alias    a converter name or alias; must not be null
+     * @param standard the tag (standard) name to look the alias up under
+     * @return the first name in the tagged alias list found for the alias and
+     *         standard, or null when no list is found or its first entry is 0
+     * @throws IllegalArgumentException if alias is null (raised by isAlias)
+     */
+    static final String getStandardName(String alias, String standard)throws IOException {
+        if (haveAliasData() && isAlias(alias)) {
+            int listOffset = findTaggedAliasListsOffset(alias, standard);
+
+            /* 0 and Integer.MAX_VALUE are the "nothing found" results of the lookup above */
+            if (0 < listOffset && listOffset < gTaggedAliasLists.length) {
+                int[] currListArray = gTaggedAliasLists;
+                /* +1 to skip listCount */
+                int currListArrayIndex = listOffset + 1;
+                /*
+                 * Test the entry that is about to be returned, not element 0 of
+                 * the whole table; findTaggedAliasListsOffset checks the same
+                 * slot (listOffset + 1) against 0.
+                 */
+                if (currListArrayIndex < currListArray.length
+                        && currListArray[currListArrayIndex] != 0) {
+                    return GET_STRING(currListArray[currListArrayIndex]);
+                }
+            }
+        }
+        return null;
+    }
+
+    // U_CAPI uint16_t U_EXPORT2 countAliases(const char *alias, UErrorCode
+    // *pErrorCode)
+    static int countAliases(String alias) throws IOException{
+        return io_countAliases(alias);
+    }
+
+    // U_CAPI const char* U_EXPORT2 getAlias(const char *alias, uint16_t n,
+    // UErrorCode *pErrorCode)
+    static String getAlias(String alias, int n) throws IOException{
+        return io_getAlias(alias, n);
+    }
+
+    // U_CFUNC uint16_t countStandards(void)
+//    static int countStandards()throws IOException{
+//        return io_countStandards();
+//    }
+
+    /*returns a single Name from the list, will return NULL if out of bounds
+     */
+    static String getAvailableName (int n){
+        try{
+            if (0 <= n && n <= 0xffff) {
+                String name = bld_getAvailableConverter(n);
+                return name;
+            }
+        }catch(IOException ex){
+            //throw away exception
+        }
+        return null;
+    }
+    // U_CAPI const char
* U_EXPORT2 getCanonicalName(const char *alias, const + // char *standard, UErrorCode *pErrorCode) { + static String getCanonicalName(String alias, String standard) throws IOException{ + if (haveAliasData() && isAlias(alias)) { + int convNum = findTaggedConverterNum(alias, standard); + + if (convNum < gConverterList.length) { + return GET_STRING(gConverterList[convNum]); + } + } + + return null; + } + static int countAvailable (){ + try{ + return bld_countAvailableConverters(); + }catch(IOException ex){ + //throw away exception + } + return -1; + } + + // U_CAPI UEnumeration * U_EXPORT2 openStandardNames(const char *convName, + // const char *standard, UErrorCode *pErrorCode) +/* static final UConverterAliasesEnumeration openStandardNames(String convName, String standard)throws IOException { + UConverterAliasesEnumeration aliasEnum = null; + if (haveAliasData() && isAlias(convName)) { + int listOffset = findTaggedAliasListsOffset(convName, standard); + + + * When listOffset == 0, we want to acknowledge that the converter + * name and standard are okay, but there is nothing to enumerate. 
+ + if (listOffset < gTaggedAliasLists.length) { + + UConverterAliasesEnumeration.UAliasContext context = new UConverterAliasesEnumeration.UAliasContext(listOffset, 0); + aliasEnum = new UConverterAliasesEnumeration(); + aliasEnum.setContext(context); + } + else converter or tag not found + } + return aliasEnum; + }*/ + + // static uint32_t getTagNumber(const char *tagname) + private static int getTagNumber(String tagName) { + if (gTagList != null) { + int tagNum; + for (tagNum = 0; tagNum < gTagList.length; tagNum++) { + if (tagName.equals(GET_STRING(gTagList[tagNum]))) { + return tagNum; + } + } + } + + return Integer.MAX_VALUE; + } + + // static uint32_t findTaggedAliasListsOffset(const char *alias, const char + // *standard, UErrorCode *pErrorCode) + private static int findTaggedAliasListsOffset(String alias, String standard) { + int idx; + int listOffset; + int convNum; + int tagNum = getTagNumber(standard); + boolean[] isAmbigous = new boolean[1]; + /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ + convNum = findConverter(alias, isAmbigous); + + if (tagNum < (gTagList.length - NUM_HIDDEN_TAGS) + && convNum < gConverterList.length) { + listOffset = gTaggedAliasArray[tagNum + * gConverterList.length + convNum]; + if (listOffset != 0 + && gTaggedAliasLists[listOffset + 1] != 0) { + return listOffset; + } + if (isAmbigous[0]==true) { + /* + * Uh Oh! They used an ambiguous alias. We have to search the + * whole swiss cheese starting at the highest standard affinity. + * This may take a while. 
+ */ + + for (idx = 0; idx < gTaggedAliasArray.length; idx++) { + listOffset = gTaggedAliasArray[idx]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + int currTagNum = idx / gConverterList.length; + int currConvNum = (idx - currTagNum + * gConverterList.length); + int tempListOffset = gTaggedAliasArray[tagNum + * gConverterList.length + currConvNum]; + if (tempListOffset != 0 + && gTaggedAliasLists[tempListOffset + 1] != 0) { + return tempListOffset; + } + /* + * else keep on looking We could speed this up by + * starting on the next row because an alias is unique + * per row, right now. This would change if alias + * versioning appears. + */ + } + } + /* The standard doesn't know about the alias */ + } + /* else no default name */ + return 0; + } + /* else converter or tag not found */ + + return Integer.MAX_VALUE; + } + + /* Return the canonical name */ + // static uint32_t findTaggedConverterNum(const char *alias, const char + // *standard, UErrorCode *pErrorCode) + private static int findTaggedConverterNum(String alias, String standard) { + int idx; + int listOffset; + int convNum; + int tagNum = getTagNumber(standard); + boolean[] isAmbigous = new boolean[1]; + + /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ + convNum = findConverter(alias, isAmbigous); + + if (tagNum < (gTagList.length - NUM_HIDDEN_TAGS) + && convNum < gConverterList.length) { + listOffset = gTaggedAliasArray[tagNum + * gConverterList.length + convNum]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + return convNum; + } + if (isAmbigous[0] == true) { + /* + * Uh Oh! They used an ambiguous alias. We have to search one + * slice of the swiss cheese. We search only in the requested + * tag, not the whole thing. This may take a while. 
+ */ + int convStart = (tagNum) * gConverterList.length; + int convLimit = (tagNum + 1) * gConverterList.length; + for (idx = convStart; idx < convLimit; idx++) { + listOffset = gTaggedAliasArray[idx]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + return idx - convStart; + } + } + /* The standard doesn't know about the alias */ + } + /* else no canonical name */ + } + /* else converter or tag not found */ + + return Integer.MAX_VALUE; + } + + // static U_INLINE UBool isAliasInList(const char *alias, uint32_t + // listOffset) + private static boolean isAliasInList(String alias, int listOffset) { + if (listOffset != 0) { + int currAlias; + int listCount = gTaggedAliasLists[listOffset]; + /* +1 to skip listCount */ + int[] currList = gTaggedAliasLists; + int currListArrayIndex = listOffset + 1; + for (currAlias = 0; currAlias < listCount; currAlias++) { + if (currList[currAlias + currListArrayIndex] != 0 + && compareNames( + alias, + GET_STRING(currList[currAlias + currListArrayIndex])) == 0) { + return true; + } + } + } + return false; + } + + // begin bld.c + static String[] gAvailableConverters = null; + + static int gAvailableConverterCount = 0; + + static byte[] gDefaultConverterNameBuffer; // [MAX_CONVERTER_NAME_LENGTH + + // 1]; /* +1 for NULL */ + + static String gDefaultConverterName = null; + + // static UBool haveAvailableConverterList(UErrorCode *pErrorCode) + static boolean haveAvailableConverterList() throws IOException{ + if (gAvailableConverters == null) { + int idx; + int localConverterCount; + String converterName; + String[] localConverterList; + + if (!haveAliasData()) { + return false; + } + + /* We can't have more than "*converterTable" converters to open */ + localConverterList = new String[gConverterList.length]; + + localConverterCount = 0; + + for (idx = 0; idx < gConverterList.length; idx++) { + converterName = GET_STRING(gConverterList[idx]); + //UConverter cnv = UConverter.open(converterName); + //TODO: Fix me + 
localConverterList[localConverterCount++] = converterName; + + } + + // agljport:todo umtx_lock(NULL); + if (gAvailableConverters == null) { + gAvailableConverters = localConverterList; + gAvailableConverterCount = localConverterCount; + /* haveData should have already registered the cleanup function */ + } else { + // agljport:todo free((char **)localConverterList); + } + // agljport:todo umtx_unlock(NULL); + } + return true; + } + + // U_CFUNC uint16_t bld_countAvailableConverters(UErrorCode *pErrorCode) + static int bld_countAvailableConverters() throws IOException{ + if (haveAvailableConverterList()) { + return gAvailableConverterCount; + } + return 0; + } + + // U_CFUNC const char * bld_getAvailableConverter(uint16_t n, UErrorCode + // *pErrorCode) + static String bld_getAvailableConverter(int n) throws IOException{ + if (haveAvailableConverterList()) { + if (n < gAvailableConverterCount) { + return gAvailableConverters[n]; + } + } + return null; + } + + /* default converter name --------------------------------------------------- */ + + /* + * In order to be really thread-safe, the get function would have to take + * a buffer parameter and copy the current string inside a mutex block. + * This implementation only tries to be really thread-safe while + * setting the name. + * It assumes that setting a pointer is atomic. 
+ */ + + // U_CFUNC const char * getDefaultName() +// static final synchronized String getDefaultName() { +// /* local variable to be thread-safe */ +// String name; +// +// //agljport:todo umtx_lock(null); +// name = gDefaultConverterName; +// //agljport:todo umtx_unlock(null); +// +// if (name == null) { +// //UConverter cnv = null; +// int length = 0; +// +// name = CharsetICU.getDefaultCharsetName(); +// +// /* if the name is there, test it out and get the canonical name with options */ +// if (name != null) { +// // cnv = UConverter.open(name); +// // name = cnv.getName(cnv); +// // TODO: fix me +// } +// +// if (name == null || name.length() == 0 ||/* cnv == null ||*/ +// length >= gDefaultConverterNameBuffer.length) { +// /* Panic time, let's use a fallback. */ +// name = new String("US-ASCII"); +// } +// +// //length=(int32_t)(strlen(name)); +// +// /* Copy the name before we close the converter. */ +// name = gDefaultConverterName; +// } +// +// return name; +// } + + //end bld.c +} \ No newline at end of file Property changes on: lucene\src\java\com\ibm\icu\charset\UConverterAlias.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/com/ibm/icu/charset/CharsetCallback.java =================================================================== --- lucene/src/java/com/ibm/icu/charset/CharsetCallback.java (revision 0) +++ lucene/src/java/com/ibm/icu/charset/CharsetCallback.java (revision 0) @@ -0,0 +1,408 @@ +/** +******************************************************************************* +* Copyright (C) 2006-2010, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CoderResult; + +/** + *A subclass of java.nio.Charset for providing implementation of ICU's charset converters. + * This API is used to convert codepage or character encoded data to and + * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that + * converter, you can get its properties, set options, convert your data.
+ * + *Since many software programs recognize different converter names for
+ * different types of converters, there are other functions in this API to
+ * iterate over the converter aliases.
+ *
+ * @stable ICU 3.6
+ */
+public abstract class CharsetICU extends Charset{
+
+    /* ICU canonical name handed to the constructor */
+    String icuCanonicalName;
+    /* canonical name this charset is registered under with java.nio.charset.Charset */
+    String javaCanonicalName;
+    int options;
+
+    float maxCharsPerByte;
+
+    String name; /* +4: 60 internal name of the converter- invariant chars */
+
+    int codepage; /* +64: 4 codepage # (now IBM-$codepage) */
+
+    byte platform; /* +68: 1 platform of the converter (only IBM now) */
+    byte conversionType; /* +69: 1 conversion type */
+
+    int minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
+    int maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */
+
+    byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */
+    byte subCharLen; /* +76: 1 */
+
+    byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
+    byte hasFromUnicodeFallback; /* +78: 1 */
+    short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
+    byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
+    //byte reserved[/*19*/]; /* +81: 19 to round out the structure */
+
+
+    // typedef enum UConverterUnicodeSet {
+    /**
+     * Parameter that selects the set of roundtrippable Unicode code points.
+     * @stable ICU 4.0
+     */
+    public static final int ROUNDTRIP_SET=0;
+    /**
+     * Select the set of Unicode code points with roundtrip or fallback mappings.
+     * Not supported at this point.
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
+
+    //} UConverterUnicodeSet;
+
+    /**
+     * Creates a charset registered with java.nio under {@code canonicalName}
+     * and remembers both that name and the ICU name.
+     *
+     * @param icuCanonicalName the ICU canonical name of the charset
+     * @param canonicalName the Java canonical name passed to the {@link Charset} constructor
+     * @param aliases the aliases passed to the {@link Charset} constructor
+     * @throws IllegalCharsetNameException if {@code canonicalName} is empty
+     * @stable ICU 3.6
+     */
+    protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
+        super(canonicalName,aliases);
+        if(canonicalName.length() == 0){
+            throw new IllegalCharsetNameException(canonicalName);
+        }
+        this.javaCanonicalName = canonicalName;
+        this.icuCanonicalName  = icuCanonicalName;
+    }
+
+    /**
+     * Ascertains if a charset is a subset of this charset.
+     * Implements the abstract method of the super class. This implementation
+     * only recognises a charset that is equal to this one.
+     * @param cs charset to test
+     * @return true if the given charset equals this charset, false otherwise (including for null)
+     * @stable ICU 3.6
+     */
+    public boolean contains(Charset cs){
+        return cs != null && this.equals(cs);
+    }
+
+    /*
+     * True for the UTF-16 surrogate code units 0xD800..0xDFFF: the mask keeps
+     * every bit except the low 11, and the remainder must be exactly 0xD800.
+     */
+    static final boolean isSurrogate(int c){
+        return (((c)&0xfffff800)==0xd800);
+    }
+
+    /*
+     * Returns the default charset name
+     */
+//    static final String getDefaultCharsetName(){
+//        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
+//        return defaultEncoding;
+//    }
+
+    /**
+     * Returns a charset object for the named charset.
+     * This method guarantees that the ICU charset is returned when
+     * available. The only ICU charset available here is "BOCU-1"
+     * (matched by exact name); for every other name the lookup is handed
+     * to {@link Charset#forName(String)}, i.e. to the other charset
+     * providers including the standard Java charset provider.
+     *
+     * @param charsetName The name of the requested charset,
+     *                    may be either a canonical name or an alias
+     * @return A charset object for the named charset; never null
+     * @throws IllegalCharsetNameException If the given charset name
+     *                    is illegal
+     * @throws UnsupportedCharsetException If no support for the
+     *                    named charset is available in this instance of the Java
+     *                    virtual machine
+     * @throws IllegalArgumentException If the given charset name is null
+     * @stable ICU 3.6
+     */
+    public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
+        /* constant-first equals: a null name reaches Charset.forName, which rejects it */
+        if ("BOCU-1".equals(charsetName)) {
+            return new CharsetBOCU1("BOCU-1", "BOCU-1", new String[] { "BOCU1" });
+        }
+        /* not an ICU charset: the standard lookup returns it or throws the documented exceptions */
+        return Charset.forName(charsetName);
+    }
+}
Property changes on: lucene\src\java\com\ibm\icu\charset\CharsetICU.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/java/com/ibm/icu/charset/CharsetEncoderICU.java
===================================================================
--- lucene/src/java/com/ibm/icu/charset/CharsetEncoderICU.java (revision 0)
+++ lucene/src/java/com/ibm/icu/charset/CharsetEncoderICU.java (revision 0)
@@ -0,0 +1,916 @@
+/**
+ *******************************************************************************
+ * Copyright (C) 2006-2009, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ *
+ *******************************************************************************
+ */
+
+package com.ibm.icu.charset;
+
+import java.nio.BufferOverflowException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.IntBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+
+/**
+ * An abstract class that provides framework methods of encoding operations for concrete
+ * subclasses.
+ * In the future this class will contain API that will implement converter semantics of ICU4C.
+ * @stable ICU 3.6
+ */
+public abstract class CharsetEncoderICU extends CharsetEncoder {
+
+ /* this is used in fromUnicode DBCS tables as an "unassigned" marker */
+ static final char MISSING_CHAR_MARKER = '\uFFFF';
+
+ /* fixed 30-byte buffer; errorBufferLength is its fill count (see below) */
+ byte[] errorBuffer = new byte[30];
+
+ /* zeroed by implReset(); encodeLoop() tests it for 0 to know the errorBuffer is empty */
+ int errorBufferLength = 0;
+
+ /** these are for encodeLoopICU */
+ int fromUnicodeStatus;
+
+ /* cleared by implReset() and by encodeLoop() when there is no input left and no buffered error bytes */
+ int fromUChar32;
+
+ boolean useSubChar1;
+
+ /* read by isFallbackUsed() and isFromUUseFallback(int), written by setFallbackUsed() */
+ boolean useFallback;
+
+ /* maximum number of indexed UChars */
+ static final int EXT_MAX_UCHARS = 19;
+
+ /* store previous UChars/chars to continue partial matches */
+ /* fromUnicodeReset() puts the three preFromU* scalars back to U_SENTINEL, 0 and 0 */
+ int preFromUFirstCP; /* >=0: partial match */
+
+ char[] preFromUArray = new char[EXT_MAX_UCHARS];
+
+ int preFromUBegin;
+
+ int preFromULength; /* negative: replay */
+
+ char[] invalidUCharBuffer = new char[2];
+
+ int invalidUCharLength;
+
+ /* caller-supplied callback context; assigned by setFromUContext() */
+ Object fromUContext;
+
+ /* callback for unmappable input; starts as STOP, replaced by implOnUnmappableCharacter() or setFromUCallback() */
+ private CharsetCallback.Encoder onUnmappableInput = CharsetCallback.FROM_U_CALLBACK_STOP;
+
+ /* callback for malformed input; starts as STOP, replaced by implOnMalformedInput() or setFromUCallback() */
+ private CharsetCallback.Encoder onMalformedInput = CharsetCallback.FROM_U_CALLBACK_STOP;
+
+    /*
+     * Dispatching callback: forwards an unmappable result to the currently
+     * installed unmappable-input callback and every other result to the
+     * malformed-input callback, passing all arguments through unchanged.
+     */
+    CharsetCallback.Encoder fromCharErrorBehaviour = new CharsetCallback.Encoder() {
+        public CoderResult call(CharsetEncoderICU encoder, Object context,
+                CharBuffer source, ByteBuffer target, IntBuffer offsets,
+                char[] buffer, int length, int cp, CoderResult cr) {
+            /* anything that is not unmappable is handled as malformed */
+            final CharsetCallback.Encoder handler =
+                    cr.isUnmappable() ? onUnmappableInput : onMalformedInput;
+            return handler.call(encoder, context, source, target,
+                    offsets, buffer, length, cp, cr);
+        }
+    };
+
+ /*
+ * Constructs a new encoder for the given charset.
+ *
+ * The superclass receives (min + max bytes per char) / 2 as its second
+ * argument and the charset's maxBytesPerChar as its third.
+ * NOTE(review): both fields are declared int in CharsetICU, so the
+ * division truncates (e.g. 1 and 4 give 2, not 2.5) - confirm this is intended.
+ *
+ * @param cs
+ * charset for which the encoder is created
+ * @param replacement
+ * the substitution bytes
+ */
+ CharsetEncoderICU(CharsetICU cs, byte[] replacement) {
+ super(cs, (cs.minBytesPerChar + cs.maxBytesPerChar) / 2,
+ cs.maxBytesPerChar, replacement);
+ }
+
+    /**
+     * Reports whether this encoder is allowed to use fallback mappings. A
+     * fallback mapping converts a Unicode code point sequence to a byte
+     * sequence that round-trips to a different Unicode code point sequence.
+     *
+     * @return true if the converter uses fallback, false otherwise.
+     * @stable ICU 3.8
+     */
+    public boolean isFallbackUsed() {
+        return this.useFallback;
+    }
+
+    /**
+     * Allows or forbids fallback mappings for this encoder.
+     *
+     * @param usesFallback true to let the converter take advantage of
+     *                     fallback mappings, false otherwise.
+     * @stable ICU 3.8
+     */
+    public void setFallbackUsed(boolean usesFallback) {
+        this.useFallback = usesFallback;
+    }
+
+    /*
+     * Tells whether a fallback mapping may be used for one code point:
+     * always when this encoder's fallback flag is set, otherwise only when
+     * the code point is a private-use code point.
+     * @param c A codepoint
+     */
+    final boolean isFromUUseFallback(int c) {
+        /* same test as the static overload, applied to this encoder's flag */
+        return isFromUUseFallback(useFallback, c);
+    }
+
+    /**
+     * Tells whether a fallback mapping from Unicode to codepage may be used:
+     * always when the given flag is set, otherwise only for private-use
+     * code points.
+     */
+    static final boolean isFromUUseFallback(boolean iUseFallback, int c) {
+        if (iUseFallback) {
+            return true;
+        }
+        return Character.getType(c) == Character.PRIVATE_USE;
+    }
+
+    /**
+     * Installs the callback used when a malformed (illegal) input sequence
+     * is encountered.
+     *
+     * @param newAction
+     *            action to be taken; REPLACE and IGNORE select the substitute
+     *            and skip callbacks, anything else selects the stop callback
+     * @stable ICU 3.6
+     */
+    protected void implOnMalformedInput(CodingErrorAction newAction) {
+        this.onMalformedInput = getCallback(newAction);
+    }
+
+    /**
+     * Installs the callback used when an unmappable character is
+     * encountered.
+     *
+     * @param newAction
+     *            action to be taken; REPLACE and IGNORE select the substitute
+     *            and skip callbacks, anything else selects the stop callback
+     * @stable ICU 3.6
+     */
+    protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
+        this.onUnmappableInput = getCallback(newAction);
+    }
+
+    /**
+     * Sets the callback and context used when an error is encountered while
+     * encoding. The kind of the given result picks the slot: a malformed
+     * result replaces the malformed-input callback, an unmappable result
+     * replaces the unmappable-input callback, and any other result changes
+     * neither. Both slots share one context, so calling this once per slot
+     * with different contexts leaves only the last context in effect.
+     *
+     * @param err CoderResult selecting which callback to replace
+     * @param newCallback CharsetCallback.Encoder to install
+     * @param newContext Object stored as the callback context
+     * @stable ICU 4.0
+     */
+    public final void setFromUCallback(CoderResult err, CharsetCallback.Encoder newCallback, Object newContext) {
+        final boolean malformed = err.isMalformed();
+        if (malformed) {
+            onMalformedInput = newCallback;
+        } else if (err.isUnmappable()) {
+            onUnmappableInput = newCallback;
+        }
+        /* Error: Only malformed and unmappable are handled. */
+
+        /* store the context unless an equal one is already in place */
+        final boolean sameContext = fromUContext != null && fromUContext.equals(newContext);
+        if (!sameContext) {
+            setFromUContext(newContext);
+        }
+    }
+
+    /**
+     * Stores the context object that is handed to the encoder callbacks.
+     * The value is stored as given, without any checks.
+     *
+     * @param newContext Object to use as fromUContext
+     * @stable ICU 4.0
+     */
+    public final void setFromUContext(Object newContext) {
+        this.fromUContext = newContext;
+    }
+
+ private static CharsetCallback.Encoder getCallback(CodingErrorAction action) {
+ if (action == CodingErrorAction.IGNORE) {
+ return CharsetCallback.FROM_U_CALLBACK_SKIP;
+ }
+ if (action == CodingErrorAction.REPLACE) {
+ return CharsetCallback.FROM_U_CALLBACK_SUBSTITUTE;
+ }
+ return CharsetCallback.FROM_U_CALLBACK_STOP; // CodingErrorAction.REPORT and any other value
+ }
+
+ private static final CharBuffer EMPTY = CharBuffer.allocate(0); // zero-length input handed to encode() by implFlush
+
+ /**
+ * Flushes output still pending in the converter by running encode()
+ * on an empty input with the flush flag set.
+ * @param out buffer that receives the flushed bytes
+ * @return the CoderResult produced by that encode() call.
+ * Returns CoderResult.UNDERFLOW if the action succeeds.
+ * @stable ICU 3.6
+ */
+ protected CoderResult implFlush(ByteBuffer out) {
+ return encode(EMPTY, out, null, true);
+ }
+
+ /**
+ * Clears the from-Unicode conversion state kept by this encoder.
+ * @stable ICU 3.6
+ */
+ protected void implReset() {
+ fromUnicodeReset(); // m:n pre-conversion bookkeeping
+ fromUChar32 = 0; // no held code point
+ fromUnicodeStatus = 0;
+ errorBufferLength = 0; // drop pending overflow bytes
+ }
+
+ private void fromUnicodeReset() {
+ preFromULength = 0; // nothing stored for a partial match or a replay
+ preFromUFirstCP = UConverterConstants.U_SENTINEL;
+ preFromUBegin = 0;
+ }
+
+ /**
+ * Encodes one or more chars. The default behaviour of the
+ * converter is stop and report if an error in input stream is encountered.
+ * To set different behaviour use @see CharsetEncoder.onMalformedInput()
+ * @param in buffer holding the chars to encode
+ * @param out buffer to populate with the encoded bytes
+ * @return result of the encoding action. Returns CoderResult.UNDERFLOW if the encoding
+ * action succeeds or more input is needed for completing the encoding action.
+ * @stable ICU 3.6
+ */
+ protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
+ if (!in.hasRemaining() && this.errorBufferLength == 0) { // make sure the errorBuffer is empty
+ // The Java framework should have already substituted what was left.
+ fromUChar32 = 0;
+ //fromUnicodeReset();
+ return CoderResult.UNDERFLOW;
+ }
+ in.position(in.position() + fromUCountPending()); // step over chars the converter already holds in its state
+ /* do the conversion */
+ CoderResult ret = encode(in, out, null, false);
+ setSourcePosition(in); // rewind by what is pending now, so held chars count as unconsumed
+ /* No need to reset to keep the proper state of the encoder.
+ if (ret.isUnderflow() && in.hasRemaining()) {
+ // The Java framework is going to substitute what is left.
+ //fromUnicodeReset();
+ } */
+ return ret;
+ }
+
+ /*
+ * Converter-specific conversion loop supplied by each subclass; flush has the meaning documented on encode().
+ * @param source The input character buffer
+ * @param target The output byte buffer
+ * @param offsets buffer for source offsets; may be null (encodeLoop(CharBuffer, ByteBuffer) passes null down)
+ * @return A CoderResult object that contains the error result when an error occurs.
+ */
+ abstract CoderResult encodeLoop(CharBuffer source, ByteBuffer target,
+ IntBuffer offsets, boolean flush);
+
+ /*
+ * Implements ICU semantics for encoding the buffer; bytes left in errorBuffer by an earlier call are drained first.
+ * @param source The input character buffer (IllegalArgumentException if source or target is null)
+ * @param target The output byte buffer
+ * @param offsets may be null; when non-null it receives -1 for every drained overflow byte
+ * @param flush true if, and only if, the invoker can provide no
+ * additional input bytes beyond those in the given buffer.
+ * @return A CoderResult object that contains the error result when an error occurs.
+ */
+ final CoderResult encode(CharBuffer source, ByteBuffer target,
+ IntBuffer offsets, boolean flush) {
+
+ /* check parameters */
+ if (target == null || source == null) {
+ throw new IllegalArgumentException();
+ }
+
+ /*
+ * NOTE: the remarks below appear to be carried over from the C code, where
+ * buffer sizes had to be checked against the number range for int32_t
+ * because some functions use the size (in units or bytes) rather than
+ * comparing pointers, and because offsets are int32_t values.
+ * No such range check is performed in this method.
+ *
+ * The C code returned an error instead of adjusting the limits because it
+ * would not be able to maintain the semantics that either the source must be
+ * consumed or the target filled (unless an error occurs).
+ * An adjustment would be targetLimit=t+0x7fffffff; for example.
+ */
+
+ /* flush the target overflow buffer */
+ if (errorBufferLength > 0) {
+ byte[] overflowArray;
+ int i, length;
+
+ overflowArray = errorBuffer;
+ length = errorBufferLength;
+ i = 0;
+ do {
+ if (target.remaining() == 0) {
+ /* the overflow buffer contains too much, keep the rest */
+ int j = 0;
+
+ do {
+ overflowArray[j++] = overflowArray[i++];
+ } while (i < length);
+
+ errorBufferLength = j; /* kept as an int: a (byte) cast would wrap for counts above 127 */
+ return CoderResult.OVERFLOW;
+ }
+
+ /* copy the overflow contents to the target */
+ target.put(overflowArray[i++]);
+ if (offsets != null) {
+ offsets.put(-1); /* no source index available for old output */
+ }
+ } while (i < length);
+
+ /* the overflow buffer is completely copied to the target */
+ errorBufferLength = 0;
+ }
+
+ if (!flush && source.remaining() == 0 && preFromULength >= 0) {
+ /* the overflow buffer is emptied and there is no new input: we are done */
+ return CoderResult.UNDERFLOW;
+ }
+
+ /*
+ * Do not simply return with a buffer overflow error if
+ * !flush && t==targetLimit
+ * because it is possible that the source will not generate any output.
+ * For example, the skip callback may be called;
+ * it does not output anything.
+ */
+
+ return fromUnicodeWithCallback(source, target, offsets, flush);
+
+ }
+
+ /*
+ * Implementation note for m:n conversions
+ *
+ * While collecting source units to find the longest match for m:n conversion,
+ * some source units may need to be stored for a partial match.
+ * When a second buffer does not yield a match on all of the previously stored
+ * source units, then they must be "replayed", i.e., fed back into the converter.
+ *
+ * The code relies on the fact that replaying will not nest -
+ * converting a replay buffer will not result in a replay.
+ * This is because a replay is necessary only after the _continuation_ of a
+ * partial match failed, but a replay buffer is converted as a whole.
+ * It may result in some of its units being stored again for a partial match,
+ * but there will not be a continuation _during_ the replay which could fail.
+ *
+ * It is conceivable that a callback function could call the converter
+ * recursively in a way that causes another replay to be stored, but that
+ * would be an error in the callback function.
+ * Such violations will cause assertion failures in a debug build,
+ * and wrong output, but they will not cause a crash.
+ */
+ final CoderResult fromUnicodeWithCallback(CharBuffer source,
+ ByteBuffer target, IntBuffer offsets, boolean flush) {
+ int sBufferIndex;
+ int sourceIndex;
+ int errorInputLength;
+ boolean converterSawEndOfInput, calledCallback;
+
+ /* variables for m:n conversion */
+ CharBuffer replayArray = CharBuffer.allocate(EXT_MAX_UCHARS);
+ int replayArrayIndex = 0;
+ CharBuffer realSource;
+ boolean realFlush;
+
+ CoderResult cr = CoderResult.UNDERFLOW;
+
+ /* get the converter implementation function */
+ sourceIndex = 0;
+
+ if (preFromULength >= 0) {
+ /* normal mode */
+ realSource = null;
+ realFlush = false;
+ } else {
+ /*
+ * Previous m:n conversion stored source units from a partial match
+ * and failed to consume all of them.
+ * We need to "replay" them from a temporary buffer and convert them first.
+ */
+ realSource = source;
+ realFlush = flush;
+
+ //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR);
+ replayArray.put(preFromUArray, 0, -preFromULength);
+ source = replayArray;
+ source.position(replayArrayIndex);
+ source.limit(replayArrayIndex - preFromULength); //preFromULength is negative, see declaration
+ flush = false;
+
+ preFromULength = 0;
+ }
+
+ /*
+ * loop for conversion and error handling
+ *
+ * loop {
+ * convert
+ * loop {
+ * update offsets
+ * handle end of input
+ * handle errors/call callback
+ * }
+ * }
+ */
+ for (;;) {
+ /* convert */
+ cr = encodeLoop(source, target, offsets, flush);
+ /*
+ * set a flag for whether the converter
+ * successfully processed the end of the input
+ *
+ * need not check cnv.preFromULength==0 because a replay (<0) will cause
+ * s
+ * Handles a common situation where a character has been read and it may be
+ * a lead surrogate followed by a trail surrogate. This method can change
+ * the source position and will modify fromUChar32.
+ *
+ * If
+ * Same as
+ * while(i
+ * String src = new String(mySource);
+ * int i,codepoint;
+ * boolean passed = false;
+ * while(i
+ *
+ * @param codepoint Unicode code point as int value
+ * @return true if a character can be converted
+ */
+ /* TODO This is different from Java's canEncode(char) API.
+ * ICU's API should implement getUnicodeSet,
+ * and override canEncode(char) which queries getUnicodeSet.
+ * The getUnicodeSet should return a frozen UnicodeSet or use a fillin parameter, like ICU4C.
+ */
+ /*public boolean canEncode(int codepoint) {
+ return true;
+ }*/
+ /**
+ * Overrides super class method: every replacement byte sequence is accepted (always returns true).
+ * @stable ICU 3.6
+ */
+ public boolean isLegalReplacement(byte[] repl) {
+ return true;
+ }
+
+ /*
+ * Writes out the specified output bytes to the target byte buffer or to converter internal buffers.
+ * @param cnv encoder whose errorBuffer/errorBufferLength receive the bytes that do not fit into out
+ * @param bytesArray array holding the bytes to write
+ * @param bytesBegin index of the first byte to write
+ * @param bytesLength number of bytes to write
+ * @param out the target byte buffer
+ * @param offsets if non-null, receives sourceIndex once per byte, but only when all bytes fit (see below)
+ * @param sourceIndex source index recorded in offsets
+ * @return CoderResult.UNDERFLOW if every byte fit into out, CoderResult.OVERFLOW otherwise
+ */
+ static final CoderResult fromUWriteBytes(CharsetEncoderICU cnv,
+ byte[] bytesArray, int bytesBegin, int bytesLength, ByteBuffer out,
+ IntBuffer offsets, int sourceIndex) {
+
+ //write bytes
+ int obl = bytesLength; // original byte count, used for the offsets loop
+ CoderResult cr = CoderResult.UNDERFLOW;
+ int bytesLimit = bytesBegin + bytesLength;
+ try {
+ for (; bytesBegin < bytesLimit;) {
+ out.put(bytesArray[bytesBegin]); // throws BufferOverflowException once out is full
+ bytesBegin++; // only advanced after a successful put
+ }
+ // success
+ bytesLength = 0;
+ } catch (BufferOverflowException ex) {
+ cr = CoderResult.OVERFLOW; // bytesBegin now indexes the first byte that did not fit
+ }
+
+ if (offsets != null) {
+ while (obl > bytesLength) { // NOTE(review): on overflow bytesLength is unchanged, so no offsets are recorded for the bytes already written
+ offsets.put(sourceIndex);
+ --obl;
+ }
+ }
+ //write overflow
+ cnv.errorBufferLength = bytesLimit - bytesBegin; // number of bytes that did not fit
+ if (cnv.errorBufferLength > 0) {
+ int index = 0;
+ while (bytesBegin < bytesLimit) {
+ cnv.errorBuffer[index++] = bytesArray[bytesBegin++]; // stored from index 0, replacing earlier content
+ }
+ cr = CoderResult.OVERFLOW;
+ }
+ return cr;
+ }
+
+ /*
+ * Returns the number of chars held in the converter's internal state
+ * because more input is needed for completing the conversion. This function is
+ * useful for mapping semantics of ICU's converter interface to those of iconv,
+ * and this information is not needed for normal conversion.
+ * @return The number of chars in the state; 0 if none (this implementation never returns -1).
+ */
+ /*public*/int fromUCountPending() {
+ if (preFromULength > 0) {
+ return UTF16.getCharCount(preFromUFirstCP) + preFromULength; // first code point plus the stored units
+ } else if (preFromULength < 0) {
+ return -preFromULength; // negative length marks units waiting to be replayed
+ } else if (fromUChar32 > 0) {
+ return 1; // a value held in fromUChar32 is counted as one char
+ } else if (preFromUFirstCP > 0) {
+ return UTF16.getCharCount(preFromUFirstCP);
+ }
+ return 0;
+ }
+
+ /**
+ * Moves the position of source back by the number of chars reported by fromUCountPending().
+ * @param source buffer whose position is rewound
+ */
+ private final void setSourcePosition(CharBuffer source) {
+
+ // Chars still held in the converter's state are reported as unconsumed,
+ // so the position is rewound by that count (counterpart of the advance in encodeLoop).
+ source.position(source.position() - fromUCountPending());
+ }
+
+ /*
+ * Emits the substitution bytes for an input that could not be converted.
+ * The single byte subChar1 of the charset is used when it is non-zero and the first
+ * invalid char is at most 0xff; otherwise the encoder's replacement bytes are written.
+ * Stateful converters typically need to override this to keep their state consistent.
+ * @param source The input character buffer; its position is passed on as the source index
+ * @param target The output byte buffer
+ * @param offsets passed through to fromUWriteBytes
+ * @return the CoderResult returned by fromUWriteBytes
+ */
+ CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source,
+ ByteBuffer target, IntBuffer offsets) {
+ final CharsetICU charset = (CharsetICU) encoder.charset();
+ final byte[] replacement = encoder.replacement();
+
+ // Prefer the one-byte substitute when it exists and the offending char fits in a byte.
+ final boolean useSubChar1 = charset.subChar1 != 0
+ && encoder.invalidUCharBuffer[0] <= 0xff;
+ final byte[] bytes = useSubChar1 ? new byte[] { charset.subChar1 } : replacement;
+
+ return CharsetEncoderICU.fromUWriteBytes(encoder, bytes, 0, bytes.length,
+ target, offsets, source.position());
+ }
+
+ /*
+ * Write the characters to target by running them through the given encoder's encode() again.
+ * @param source The input character buffer
+ * @param target The output byte buffer
+ * @param offsets if non-null, receives the initial source position once per byte written to target
+ * @return the CoderResult of the first encode() call (the retry into errorBuffer does not replace it)
+ */
+ CoderResult cbFromUWriteUChars(CharsetEncoderICU encoder,
+ CharBuffer source, ByteBuffer target, IntBuffer offsets) {
+ CoderResult cr = CoderResult.UNDERFLOW;
+
+ /* This is a fun one. Recursion can occur - we're basically going to
+ * just retry shoving data through the same converter. Note, if you got
+ * here through some kind of invalid sequence, you maybe should emit a
+ * reset sequence of some kind. Since this IS an actual conversion,
+ * take care that you've changed the callback or the data, or you'll
+ * get an infinite loop.
+ */
+
+ int oldTargetPosition = target.position(); // used to count the bytes produced by the call below
+ int offsetIndex = source.position(); // every produced byte is attributed to this source position
+
+ cr = encoder.encode(source, target, null, false); /* no offsets and no flush */
+
+ if (offsets != null) {
+ while (target.position() != oldTargetPosition) {
+ offsets.put(offsetIndex);
+ oldTargetPosition++;
+ }
+ }
+
+ /* Note, if you did something like used a stop subcallback, things would get interesting.
+ * In fact, here's where we want to return the partially consumed in-source!
+ */
+ if (cr.isOverflow()) {
+ /* Overflowed target. Now, we'll write into the charErrorBuffer.
+ * It's a fixed size. If we overflow it...Hm
+ */
+
+ /* start the new target at the first free slot in the error buffer */
+ int errBuffLen = encoder.errorBufferLength;
+ ByteBuffer newTarget = ByteBuffer.wrap(encoder.errorBuffer);
+ newTarget.position(errBuffLen); /* set the position at the end of the error buffer */
+ encoder.errorBufferLength = 0; /* so the nested encode() does not try to drain the error buffer into newTarget */
+
+ encoder.encode(source, newTarget, null, false); /* result ignored: an overflow of the error buffer is not reported */
+
+ encoder.errorBuffer = newTarget.array(); /* wrap() keeps the given array as backing store, so this is the same array */
+ encoder.errorBufferLength = newTarget.position();
+ }
+
+ return cr;
+ }
+
+ /**
+ * Pairs an already-read surrogate with the next unit of source, if any.
+ * On success the code point is stored in fromUChar32 (which the caller should
+ * reset to 0 after reading it), the trail unit is consumed and null is returned.
+ * Otherwise lead is stored in fromUChar32 and no further input is consumed.
+ * @return CoderResult.malformedForLength(1) if unpaired, CoderResult.UNDERFLOW
+ * if the trail unit has not arrived yet, or null on success.
+ * @see #handleSurrogates(CharBuffer, char)
+ * @see #handleSurrogates(CharBuffer, int, char)
+ * @see #handleSurrogates(char[], int, int, char)
+ */
+ final CoderResult handleSurrogates(CharBuffer source, char lead) {
+ final CoderResult failure;
+ if (!Character.isHighSurrogate(lead)) {
+ failure = CoderResult.malformedForLength(1);
+ } else if (!source.hasRemaining()) {
+ failure = CoderResult.UNDERFLOW;
+ } else {
+ // Peek at the candidate trail unit without consuming it.
+ final int trailIndex = source.position();
+ final char next = source.get(trailIndex);
+ if (Character.isLowSurrogate(next)) {
+ // Valid pair: consume the trail unit and publish the code point.
+ source.position(trailIndex + 1);
+ fromUChar32 = Character.toCodePoint(lead, next);
+ return null;
+ }
+ failure = CoderResult.malformedForLength(1);
+ }
+ // Every failure leaves the lone unit in fromUChar32.
+ fromUChar32 = lead;
+ return failure;
+ }
+
+ /**
+ * Array-based counterpart of handleSurrogates(CharBuffer, char): sourceIndex is the
+ * index of the candidate trail unit and sourceLimit the end of the valid input.
+ * On success the code point is stored in fromUChar32 and null is returned; the caller
+ * must then advance its own index past the trail unit. Otherwise lead is stored.
+ * @return CoderResult.malformedForLength(1) if unpaired, CoderResult.UNDERFLOW
+ * if sourceIndex has reached sourceLimit, or null on success.
+ * @see #handleSurrogates(CharBuffer, char)
+ * @see #handleSurrogates(CharBuffer, int, char)
+ * @see #handleSurrogates(char[], int, int, char)
+ */
+ final CoderResult handleSurrogates(char[] sourceArray, int sourceIndex,
+ int sourceLimit, char lead) {
+ final CoderResult failure;
+ if (!Character.isHighSurrogate(lead)) {
+ failure = CoderResult.malformedForLength(1);
+ } else if (sourceIndex >= sourceLimit) {
+ failure = CoderResult.UNDERFLOW;
+ } else {
+ final char next = sourceArray[sourceIndex];
+ if (Character.isLowSurrogate(next)) {
+ // Valid pair; the index is a by-value copy, so the caller advances it.
+ fromUChar32 = Character.toCodePoint(lead, next);
+ return null;
+ }
+ failure = CoderResult.malformedForLength(1);
+ }
+
+ // Every failure leaves the lone unit in fromUChar32
+ // so that the caller can report or retry it.
+ fromUChar32 = lead;
+ return failure;
+ }
+}
Property changes on: lucene\src\java\com\ibm\icu\charset\CharsetEncoderICU.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java
===================================================================
--- lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java (revision 966583)
+++ lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java (working copy)
@@ -107,7 +107,7 @@
while(text != null) {
if (text != null && text.startsWith(prefixRef)) {
- String textString = text.utf8ToString();
+ String textString = text.bocu1ToString();
matcher.reset(textString.substring(prefixLength));
if (matcher.matches()) {
mtv.visitMatchingTerm(new Term(fieldName, textString));
Index: lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java
===================================================================
--- lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java (revision 966583)
+++ lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java (working copy)
@@ -68,7 +68,7 @@
mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName));
} else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
if (termsEnum.term().startsWith(prefixRef)) {
- mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString()));
+ mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().bocu1ToString()));
} else {
skip = true;
}
@@ -81,7 +81,7 @@
while(true) {
BytesRef text = termsEnum.next();
if (text != null && text.startsWith(prefixRef)) {
- mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString()));
+ mtv.visitMatchingTerm(new Term(fieldName, text.bocu1ToString()));
} else {
break;
}
Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexHTML.java
===================================================================
--- lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexHTML.java (revision 966583)
+++ lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexHTML.java (working copy)
@@ -122,7 +122,7 @@
if (deleting) { // delete rest of stale docs
BytesRef text;
while ((text=uidIter.next()) != null) {
- String termText = text.utf8ToString();
+ String termText = text.bocu1ToString();
System.out.println("deleting " +
HTMLDocument.uid2url(termText));
reader.deleteDocuments(new Term("uid", termText));
@@ -153,7 +153,7 @@
BytesRef text;
while((text = uidIter.next()) != null) {
- String termText = text.utf8ToString();
+ String termText = text.bocu1ToString();
if (termText.compareTo(uid) < 0) {
if (deleting) { // delete stale docs
System.out.println("deleting " +
@@ -165,7 +165,7 @@
}
}
if (text != null &&
- text.utf8ToString().compareTo(uid) == 0) {
+ text.bocu1ToString().compareTo(uid) == 0) {
uidIter.next(); // keep matching docs
} else if (!deleting) { // add new docs
Document doc = HTMLDocument.Document(file);
Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
===================================================================
--- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (revision 966583)
+++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (working copy)
@@ -426,7 +426,7 @@
@Override
public Comparator