Index: src/java/org/apache/lucene/util/BOCUUtil.java =================================================================== --- src/java/org/apache/lucene/util/BOCUUtil.java (revision 0) +++ src/java/org/apache/lucene/util/BOCUUtil.java (revision 0) @@ -0,0 +1,189 @@ +package org.apache.lucene.util; + +public final class BOCUUtil { + /* bounding byte values for differences */ + private static final int BOCU1_MIN = 0x21; + private static final int BOCU1_MIDDLE = 0x90; + // private static final int BOCU1_MAX_LEAD = 0xfe; + private static final int BOCU1_MAX_TRAIL = 0xff; + + /* adjust trail byte counts for the use of some C0 control byte values */ + private static final int BOCU1_TRAIL_CONTROLS_COUNT = 20; + private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN - BOCU1_TRAIL_CONTROLS_COUNT); + + /* number of trail bytes */ + private static final int BOCU1_TRAIL_COUNT = ((BOCU1_MAX_TRAIL - BOCU1_MIN + 1) + BOCU1_TRAIL_CONTROLS_COUNT); + + /* number of positive and negative single-byte codes (counting 0==BOCU1_MIDDLE among the positive ones) */ + private static final int BOCU1_SINGLE = 64; + + /* number of lead bytes for positive and negative 2/3/4-byte sequences */ + private static final int BOCU1_LEAD_2 = 43; + private static final int BOCU1_LEAD_3 = 3; + // private static final int BOCU1_LEAD_4 = 1; + + /* The difference value range for single-byters. */ + private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE - 1); + private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE); + + /* The difference value range for double-byters. */ + private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1 + BOCU1_LEAD_2 * BOCU1_TRAIL_COUNT); + private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1 - BOCU1_LEAD_2 * BOCU1_TRAIL_COUNT); + + /* The difference value range for 3-byters. */ + private static final int BOCU1_REACH_POS_3 = (BOCU1_REACH_POS_2 + BOCU1_LEAD_3 * BOCU1_TRAIL_COUNT * BOCU1_TRAIL_COUNT); + private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2 - BOCU1_LEAD_3 * BOCU1_TRAIL_COUNT * BOCU1_TRAIL_COUNT); + + /* The lead byte start values. */ + private static final int BOCU1_START_POS_2 = (BOCU1_MIDDLE + BOCU1_REACH_POS_1 + 1); + private static final int BOCU1_START_POS_3 = (BOCU1_START_POS_2 + BOCU1_LEAD_2); + private static final int BOCU1_START_POS_4 = (BOCU1_START_POS_3 + BOCU1_LEAD_3); + + private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE + BOCU1_REACH_NEG_1); + private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2 - BOCU1_LEAD_2); + private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3 - BOCU1_LEAD_3); + + /* + * Byte value map for control codes, from trail byte values 0..19 (0..0x13) as + * used in the difference calculation to external byte values 0x00..0x20. + */ + private static final int[] bocu1TrailToByte = { + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1c, 0x1d, 0x1e, 0x1f + }; + + /* + * 12 commonly used C0 control codes (and space) are only used to encode + * themselves directly, which makes BOCU-1 MIME-usable and reasonably safe for + * ASCII-oriented software. + * + * These controls are NUL, BEL, BS,TAB, LF, VT, FF, CR, SO, SI, SUB, ESC + * + * The other 20 C0 controls are also encoded directly (to preserve order) but + * are also used as trail bytes in difference encoding (for better compression). + */ + private static int BOCU1_TRAIL_TO_BYTE(int trail) { + return (trail >= BOCU1_TRAIL_CONTROLS_COUNT ? trail + BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]); + } + + public static void UTF16toBOCU1(char[] source, int offset, int length, BytesRef result) { + int upto = 0; + int hash = 0; + final int end = offset + length; + byte[] out = result.bytes; + // Pre-allocate for worst case 4-for-1 + final int maxLen = length * 4; + if (out.length < maxLen) out = result.bytes = new byte[maxLen]; + result.offset = 0; + + int prev = 0x40; + while (offset < end) { + int cp = source[offset++]; + if (0xD800 <= cp && cp <= 0xDBFF) { + int utf32 = (int) source[offset]; + // confirm valid low surrogate and write pair + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { + cp = ((cp - 0xD7C0) << 10) + (utf32 & 0x3FF); + offset++; + } else { + cp = 0xFFFD; // replacement + } + } + + int diff = cp - prev; + + if (cp <= 0x20) { + if (cp != 0x20) prev = 0x40; // keep the bocu state for space + } else if (0x3040 <= cp && cp <= 0x309f) { + prev = 0x3070; + } else if (0x4e00 <= cp && cp <= 0x9fa5) { + prev = 0x4e00 - BOCU1_REACH_NEG_2; + } else if (0xac00 <= cp && cp <= 0xd7a3) { + prev = (0xd7a3 + 0xac00) / 2; + } else { + prev = (cp & ~0x7f) + 0x40; + } + + if (cp <= 0x20) { + hash = 31*hash + (out[upto++] = (byte) cp); + } else if (diff >= BOCU1_REACH_NEG_1 && diff <= BOCU1_REACH_POS_1) { + hash = 31*hash + (out[upto++] = (byte) (BOCU1_MIDDLE + diff)); + } else { + + final int lead, count; + + if (diff >= BOCU1_REACH_NEG_1) { + if (diff <= BOCU1_REACH_POS_2) { + /* two bytes -- inline the for loop below for + * better perf */ + diff -= BOCU1_REACH_POS_1 + 1; + int m = diff % BOCU1_TRAIL_COUNT; + diff /= BOCU1_TRAIL_COUNT; + if (m < 0) { + --diff; + m += BOCU1_TRAIL_COUNT; + } + out[upto+1] = (byte) (BOCU1_TRAIL_TO_BYTE(m)); + out[upto] = (byte) (BOCU1_START_POS_2 + diff); + upto += 2; + continue; + } else if (diff <= BOCU1_REACH_POS_3) { + /* three bytes */ + diff -= BOCU1_REACH_POS_2 + 1; + lead = BOCU1_START_POS_3; + count = 2; + } else { + /* four bytes */ + diff -= BOCU1_REACH_POS_3 + 1; + lead = BOCU1_START_POS_4; + count = 3; + } + } else { + /* two- and four-byte negative differences */ + if (diff >= BOCU1_REACH_NEG_2) { + /* two bytes -- inline the for loop below for + * better perf */ + diff -= BOCU1_REACH_NEG_1; + int m = diff % BOCU1_TRAIL_COUNT; + diff /= BOCU1_TRAIL_COUNT; + if (m < 0) { + --diff; + m += BOCU1_TRAIL_COUNT; + } + out[upto+1] = (byte) (BOCU1_TRAIL_TO_BYTE(m)); + out[upto] = (byte) (BOCU1_START_NEG_2 + diff); + upto += 2; + continue; + } else if (diff >= BOCU1_REACH_NEG_3) { + /* three bytes */ + diff -= BOCU1_REACH_NEG_2; + lead = BOCU1_START_NEG_3; + count = 2; + } else { + /* four bytes */ + diff -= BOCU1_REACH_NEG_3; + lead = BOCU1_START_NEG_4; + count = 3; + } + } + + /* calculate trail bytes like digits in itoa() */ + for(int i=count;i>0;i--) { + int m = diff % BOCU1_TRAIL_COUNT; + diff /= BOCU1_TRAIL_COUNT; + if (m < 0) { + --diff; + m += BOCU1_TRAIL_COUNT; + } + out[upto+i] = (byte) (BOCU1_TRAIL_TO_BYTE(m)); + } + + out[upto] = (byte) (lead + diff); + upto += 1+count; + } + } + + result.length = upto; + } +}