Index: solr/src/test/org/apache/solr/request/TestFaceting.java =================================================================== --- solr/src/test/org/apache/solr/request/TestFaceting.java (revision 966583) +++ solr/src/test/org/apache/solr/request/TestFaceting.java (working copy) @@ -87,7 +87,7 @@ assertEquals(br != null, rnum < size); if (rnum < size) { assertEquals(rnum, te.pos); - assertEquals(s, te.term().utf8ToString()); + assertEquals(s, te.term().bocu1ToString()); } else { assertEquals(null, te.term()); assertEquals(size, te.getTermNumber()); @@ -98,7 +98,7 @@ assertEquals(size>0, te.skipTo(new BytesRef("000")) != null); assertEquals(0, te.getTermNumber()); if (size>0) { - assertEquals(t(0), te.term().utf8ToString()); + assertEquals(t(0), te.term().bocu1ToString()); } else { assertEquals(null, te.term()); } @@ -111,7 +111,7 @@ BytesRef br = te.skipTo(rnum); assertNotNull(br); assertEquals(rnum, te.pos); - assertEquals(s, te.term().utf8ToString()); + assertEquals(s, te.term().bocu1ToString()); } } } Index: solr/src/java/org/apache/solr/schema/TrieDateField.java =================================================================== --- solr/src/java/org/apache/solr/schema/TrieDateField.java (revision 966583) +++ solr/src/java/org/apache/solr/schema/TrieDateField.java (working copy) @@ -131,7 +131,7 @@ // TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts! 
BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG); NumericUtils.longToPrefixCoded(super.parseMath(null, val).getTime(), 0, bytes); - return bytes.utf8ToString(); + return bytes.bocu1ToString(); } @Override Index: solr/src/java/org/apache/solr/schema/TrieField.java =================================================================== --- solr/src/java/org/apache/solr/schema/TrieField.java (revision 966583) +++ solr/src/java/org/apache/solr/schema/TrieField.java (working copy) @@ -328,7 +328,7 @@ // TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts! BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG); readableToIndexed(val, bytes); - return bytes.utf8ToString(); + return bytes.bocu1ToString(); } @Override Index: solr/src/java/org/apache/solr/schema/StrField.java =================================================================== --- solr/src/java/org/apache/solr/schema/StrField.java (revision 966583) +++ solr/src/java/org/apache/solr/schema/StrField.java (working copy) @@ -95,7 +95,7 @@ if (ord == 0) { return null; } else { - return termsIndex.lookup(ord, new BytesRef()).utf8ToString(); + return termsIndex.lookup(ord, new BytesRef()).bocu1ToString(); } } Index: solr/src/java/org/apache/solr/schema/FieldType.java =================================================================== --- solr/src/java/org/apache/solr/schema/FieldType.java (revision 966583) +++ solr/src/java/org/apache/solr/schema/FieldType.java (working copy) @@ -377,7 +377,7 @@ /** Given the readable value, return the term value that will match it. 
*/ public void readableToIndexed(CharSequence val, BytesRef result) { String internal = readableToIndexed(val.toString()); - UnicodeUtil.UTF16toUTF8(internal, 0, internal.length(), result); + UnicodeUtil.UTF16toBOCU1(internal, 0, internal.length(), result); } /** Index: solr/src/java/org/apache/solr/search/ValueSourceParser.java =================================================================== --- solr/src/java/org/apache/solr/search/ValueSourceParser.java (revision 966583) +++ solr/src/java/org/apache/solr/search/ValueSourceParser.java (working copy) @@ -522,7 +522,7 @@ tinfo.indexedField = term.field(); indexedVal = term.text(); } - UnicodeUtil.UTF16toUTF8(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes); + UnicodeUtil.UTF16toBOCU1(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes); } else { ft.readableToIndexed(tinfo.val, tinfo.indexedBytes); } Index: solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java =================================================================== --- solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java (revision 966583) +++ solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java (working copy) @@ -350,7 +350,7 @@ // NOTE: we use c>min rather than c>=min as an optimization because we are going in // index order, so we already know that the keys are ordered. This can be very // important if a lot of the counts are repeated (like zero counts would be). 
- queue.add(new SimpleFacets.CountPair(term.utf8ToString(), count)); + queue.add(new SimpleFacets.CountPair(term.bocu1ToString(), count)); if (queue.size()>=maxsize) min=queue.last().val; } return false; @@ -397,7 +397,7 @@ } if (limit > 0) { - res.add(term.utf8ToString(), count); + res.add(term.bocu1ToString(), count); limit--; } Index: solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java =================================================================== --- solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java (revision 966583) +++ solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java (working copy) @@ -265,7 +265,7 @@ if( v != null ) { SimpleOrderedMap tfv = new SimpleOrderedMap(); for( int i=0; i= 0 && arrIdx < nTerms) { final BytesRef br = si.lookup(term, tempBR); - String key = ft.indexedToReadable(br == null ? null : br.utf8ToString()); + String key = ft.indexedToReadable(br == null ? null : br.bocu1ToString()); StatsValues stats = facetStatsValues.get(key); if (stats == null) { stats = new StatsValues(); @@ -114,7 +114,7 @@ int arrIdx = term - startTermIndex; if (arrIdx >= 0 && arrIdx < nTerms) { final BytesRef br = si.lookup(term, tempBR); - String key = br == null ? null : br.utf8ToString(); + String key = br == null ? 
null : br.bocu1ToString(); HashMap statsTermCounts = facetStatsTerms.get(statsTermNum); Integer statsTermCount = statsTermCounts.get(key); if (statsTermCount == null) { Index: solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java =================================================================== --- solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java (revision 966583) +++ solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java (working copy) @@ -294,7 +294,7 @@ public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { NamedList termInfo = new NamedList(); - fieldNL.add(term.utf8ToString(), termInfo); + fieldNL.add(term.bocu1ToString(), termInfo); if (fieldOptions.termFreq == true) { termInfo.add("tf", frequency); } Index: solr/src/java/org/apache/solr/handler/component/QueryComponent.java =================================================================== --- solr/src/java/org/apache/solr/handler/component/QueryComponent.java (revision 966583) +++ solr/src/java/org/apache/solr/handler/component/QueryComponent.java (working copy) @@ -263,7 +263,7 @@ // String field in Lucene, which returns the terms // data as BytesRef: if (val instanceof BytesRef) { - field.setValue(((BytesRef)val).utf8ToString()); + field.setValue(((BytesRef)val).bocu1ToString()); val = ft.toObject(field); } Index: solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java =================================================================== --- solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java (revision 966583) +++ solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java (working copy) @@ -168,7 +168,7 @@ if (bytesAtt != null) { bytesAtt.toBytesRef(bytes); // TODO: This is incorrect when numeric fields change in later lucene versions. It should use BytesRef directly! 
- token.setEmpty().append(bytes.utf8ToString()); + token.setEmpty().append(bytes.bocu1ToString()); } token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); token.setType(typeAtt.type()); Index: solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java =================================================================== --- solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java (revision 966583) +++ solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java (working copy) @@ -155,7 +155,7 @@ if (bytesAtt != null) { bytesAtt.toBytesRef(bytes); // TODO: This is incorrect when numeric fields change in later lucene versions. It should use BytesRef directly! - token.add("value", bytes.utf8ToString()); + token.add("value", bytes.bocu1ToString()); } token.add("start", offsetAtt.startOffset()); token.add("end", offsetAtt.endOffset()); Index: solr/src/java/org/apache/solr/util/NumberUtils.java =================================================================== --- solr/src/java/org/apache/solr/util/NumberUtils.java (revision 966583) +++ solr/src/java/org/apache/solr/util/NumberUtils.java (working copy) @@ -45,7 +45,7 @@ public static String SortableStr2int(BytesRef val) { // TODO: operate directly on BytesRef - return SortableStr2int(val.utf8ToString()); + return SortableStr2int(val.bocu1ToString()); } @@ -66,7 +66,7 @@ public static String SortableStr2long(BytesRef val) { // TODO: operate directly on BytesRef - return SortableStr2long(val.utf8ToString()); + return SortableStr2long(val.bocu1ToString()); } // @@ -99,7 +99,7 @@ public static float SortableStr2float(BytesRef val) { // TODO: operate directly on BytesRef - return SortableStr2float(val.utf8ToString()); + return SortableStr2float(val.bocu1ToString()); } public static String SortableStr2floatStr(String val) { @@ -125,7 +125,7 @@ public static double SortableStr2double(BytesRef val) { // TODO: operate directly on BytesRef - return SortableStr2double(val.utf8ToString()); + return 
SortableStr2double(val.bocu1ToString()); } public static String SortableStr2doubleStr(String val) { @@ -155,7 +155,7 @@ public static int SortableStr2int(BytesRef sval, int offset, int len) { // TODO: operate directly on BytesRef - return SortableStr2int(sval.utf8ToString(), offset, len); + return SortableStr2int(sval.bocu1ToString(), offset, len); } // uses binary representation of an int to build a string of @@ -184,6 +184,6 @@ public static long SortableStr2long(BytesRef sval, int offset, int len) { // TODO: operate directly on BytesRef - return SortableStr2long(sval.utf8ToString(), offset, len); + return SortableStr2long(sval.bocu1ToString(), offset, len); } } Index: solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java =================================================================== --- solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java (revision 966583) +++ solr/src/java/org/apache/solr/util/HighFrequencyDictionary.java (working copy) @@ -81,7 +81,7 @@ } hasNextCalled = false; - return (actualTerm != null) ? actualTerm.utf8ToString() : null; + return (actualTerm != null) ? 
actualTerm.bocu1ToString() : null; } public boolean hasNext() { Index: solr/src/java/org/apache/solr/response/PHPSerializedResponseWriter.java =================================================================== --- solr/src/java/org/apache/solr/response/PHPSerializedResponseWriter.java (revision 966583) +++ solr/src/java/org/apache/solr/response/PHPSerializedResponseWriter.java (working copy) @@ -318,7 +318,7 @@ } } } else { - UnicodeUtil.UTF16toUTF8(val, 0, val.length(), utf8); + UnicodeUtil.UTF16toBOCU1(val, 0, val.length(), utf8); nBytes = utf8.length; } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java (revision 966583) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java (working copy) @@ -149,7 +149,7 @@ BytesRef text; while ((text = te.next()) != null) { if (te.docFreq() > maxDocFreq) { - stopWords.add(text.utf8ToString()); + stopWords.add(text.bocu1ToString()); } } } Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java (revision 966583) +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java (working copy) @@ -170,7 +170,7 @@ @Override public PostingsConsumer startTerm(BytesRef text) { - final String term = text.utf8ToString(); + final String term = text.bocu1ToString(); current = new RAMTerm(term); postingsWriter.reset(current); return postingsWriter; @@ -179,7 +179,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUnicodeComparator(); + return BytesRef.getBOCU1SortedAsUnicodeComparator(); } @Override @@ -263,7 +263,7 @@ @Override public Comparator getComparator() { - return 
BytesRef.getUTF8SortedAsUnicodeComparator(); + return BytesRef.getBOCU1SortedAsUnicodeComparator(); } @Override @@ -285,7 +285,7 @@ @Override public SeekStatus seek(BytesRef term, boolean useCache) { - current = term.utf8ToString(); + current = term.bocu1ToString(); it = null; if (ramField.termToDocs.containsKey(current)) { return SeekStatus.FOUND; @@ -676,7 +676,7 @@ if (lastBytesRef == null) { lastBytesRef = new BytesRef(t); } else { - assertTrue("terms in wrong order last=" + lastBytesRef.utf8ToString() + " current=" + t.utf8ToString(), reverseUnicodeComparator.compare(lastBytesRef, t) < 0); + assertTrue("terms in wrong order last=" + lastBytesRef.bocu1ToString() + " current=" + t.bocu1ToString(), reverseUnicodeComparator.compare(lastBytesRef, t) < 0); lastBytesRef.copy(t); } } Index: lucene/src/test/org/apache/lucene/search/TestTermVectors.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestTermVectors.java (revision 966583) +++ lucene/src/test/org/apache/lucene/search/TestTermVectors.java (working copy) @@ -136,9 +136,9 @@ assertEquals(expectedFields[i], posVec.getField()); BytesRef[] terms = posVec.getTerms(); assertEquals(3, terms.length); - assertEquals("content", terms[0].utf8ToString()); - assertEquals("here", terms[1].utf8ToString()); - assertEquals("some", terms[2].utf8ToString()); + assertEquals("content", terms[0].bocu1ToString()); + assertEquals("here", terms[1].bocu1ToString()); + assertEquals("some", terms[2].bocu1ToString()); for(int j=0;j<3;j++) { int[] positions = posVec.getTermPositions(j); assertEquals(1, positions.length); @@ -263,7 +263,7 @@ while(fields.next() != null) { TermsEnum terms = fields.terms(); while(terms.next() != null) { - String text = terms.term().utf8ToString(); + String text = terms.term().bocu1ToString(); docs = terms.docs(MultiFields.getDeletedDocs(knownSearcher.reader), docs); while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) { @@ -283,7 +283,7 @@ int [] 
freqs = vector.getTermFrequencies(); for (int i = 0; i < vTerms.length; i++) { - if (text.equals(vTerms[i].utf8ToString())) + if (text.equals(vTerms[i].bocu1ToString())) { assertTrue(freqs[i] == freq); } @@ -312,7 +312,7 @@ int [] freqs = vector.getTermFrequencies(); assertTrue(terms != null && terms.length == 10); for (int i = 0; i < terms.length; i++) { - String term = terms[i].utf8ToString(); + String term = terms[i].bocu1ToString(); //System.out.println("Term: " + term); int freq = freqs[i]; assertTrue(test4.indexOf(term) != -1); @@ -329,7 +329,7 @@ if (tve != null && last != null) { assertTrue("terms are not properly sorted", last.getFrequency() >= tve.getFrequency()); - Integer expectedFreq = test4Map.get(tve.getTerm().utf8ToString()); + Integer expectedFreq = test4Map.get(tve.getTerm().bocu1ToString()); //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields assertTrue("Frequency is not correct:", tve.getFrequency() == 2*expectedFreq.intValue()); } @@ -426,7 +426,7 @@ assertTrue(tfv.getField().equals("field")); BytesRef[] terms = tfv.getTerms(); assertEquals(1, terms.length); - assertEquals(terms[0].utf8ToString(), "one"); + assertEquals(terms[0].bocu1ToString(), "one"); assertEquals(5, tfv.getTermFrequencies()[0]); int[] positions = tfv.getTermPositions(0); Index: lucene/src/test/org/apache/lucene/search/TestPhrasePrefixQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestPhrasePrefixQuery.java (revision 966583) +++ lucene/src/test/org/apache/lucene/search/TestPhrasePrefixQuery.java (working copy) @@ -87,7 +87,7 @@ TermsEnum te = MultiFields.getFields(reader).terms("body").iterator(); te.seek(new BytesRef(prefix)); do { - String s = te.term().utf8ToString(); + String s = te.term().bocu1ToString(); if (s.startsWith(prefix)) { termsWithPrefix.add(new Term("body", s)); } else { Index: 
lucene/src/test/org/apache/lucene/search/TestFieldCache.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestFieldCache.java (revision 966583) +++ lucene/src/test/org/apache/lucene/search/TestFieldCache.java (working copy) @@ -167,7 +167,7 @@ final BytesRef br = new BytesRef(); for (int i = 0; i < NUM_DOCS; i++) { final BytesRef term = termsIndex.getTerm(i, br); - final String s = term == null ? null : term.utf8ToString(); + final String s = term == null ? null : term.bocu1ToString(); assertTrue("for doc " + i + ": " + s + " does not equal: " + unicodeStrings[i], unicodeStrings[i] == null || unicodeStrings[i].equals(s)); } @@ -193,7 +193,7 @@ assertTrue("doubles Size: " + terms.size() + " is not: " + NUM_DOCS, terms.size() == NUM_DOCS); for (int i = 0; i < NUM_DOCS; i++) { final BytesRef term = terms.getTerm(i, br); - final String s = term == null ? null : term.utf8ToString(); + final String s = term == null ? null : term.bocu1ToString(); assertTrue("for doc " + i + ": " + s + " does not equal: " + unicodeStrings[i], unicodeStrings[i] == null || unicodeStrings[i].equals(s)); } Index: lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (revision 966583) +++ lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (working copy) @@ -108,7 +108,7 @@ @Override protected AcceptStatus accept(BytesRef term) throws IOException { - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + UnicodeUtil.BOCU1toUTF16(term.bytes, term.offset, term.length, utf16); return runAutomaton.run(utf16.result, 0, utf16.length) ? 
AcceptStatus.YES : AcceptStatus.NO; } Index: lucene/src/test/org/apache/lucene/search/TestMultiThreadTermVectors.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestMultiThreadTermVectors.java (revision 966583) +++ lucene/src/test/org/apache/lucene/search/TestMultiThreadTermVectors.java (working copy) @@ -180,7 +180,7 @@ for (int i = 0; i < vectors.length; i++) { terms = vectors[i].getTerms(); for (int z = 0; z < terms.length; z++) { - temp.append(terms[z].utf8ToString()); + temp.append(terms[z].bocu1ToString()); } } Index: lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 966583) +++ lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (working copy) @@ -344,7 +344,7 @@ NumericUtils.intToPrefixCoded(lower, 0, lowerBytes); NumericUtils.intToPrefixCoded(upper, 0, upperBytes); // TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string! 
- final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString(); + final String lowerString = lowerBytes.bocu1ToString(), upperString = upperBytes.bocu1ToString(); // test inclusive range NumericRangeQuery tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, true, true); Index: lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (revision 966583) +++ lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (working copy) @@ -363,7 +363,7 @@ NumericUtils.longToPrefixCoded(lower, 0, lowerBytes); NumericUtils.longToPrefixCoded(upper, 0, upperBytes); // TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string! - final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString(); + final String lowerString = lowerBytes.bocu1ToString(), upperString = upperBytes.bocu1ToString(); // test inclusive range NumericRangeQuery tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, true); Index: lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (revision 966583) +++ lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (working copy) @@ -73,7 +73,7 @@ TermsEnum te = MultiFields.getFields(reader).terms("body").iterator(); te.seek(new BytesRef(prefix)); do { - String s = te.term().utf8ToString(); + String s = te.term().bocu1ToString(); if (s.startsWith(prefix)) { termsWithPrefix.add(new Term("body", s)); } else { @@ -100,8 +100,8 @@ te.seek(new BytesRef(prefix)); do { - if (te.term().utf8ToString().startsWith(prefix)) { - termsWithPrefix.add(new Term("body", te.term().utf8ToString())); + if 
(te.term().bocu1ToString().startsWith(prefix)) { + termsWithPrefix.add(new Term("body", te.term().bocu1ToString())); } } while (te.next() != null); Index: lucene/src/test/org/apache/lucene/index/TestParallelTermEnum.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestParallelTermEnum.java (revision 966583) +++ lucene/src/test/org/apache/lucene/index/TestParallelTermEnum.java (working copy) @@ -89,31 +89,31 @@ TermsEnum te = fe.terms(); - assertEquals("brown", te.next().utf8ToString()); + assertEquals("brown", te.next().bocu1ToString()); DocsEnum td = te.docs(delDocs, null); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("fox", te.next().utf8ToString()); + assertEquals("fox", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("jumps", te.next().utf8ToString()); + assertEquals("jumps", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("quick", te.next().utf8ToString()); + assertEquals("quick", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("the", te.next().utf8ToString()); + assertEquals("the", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); @@ -124,31 +124,31 @@ assertEquals("field2", f); te = fe.terms(); - assertEquals("brown", te.next().utf8ToString()); + assertEquals("brown", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != 
DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("fox", te.next().utf8ToString()); + assertEquals("fox", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("jumps", te.next().utf8ToString()); + assertEquals("jumps", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("quick", te.next().utf8ToString()); + assertEquals("quick", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("the", te.next().utf8ToString()); + assertEquals("the", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); @@ -159,37 +159,37 @@ assertEquals("field3", f); te = fe.terms(); - assertEquals("dog", te.next().utf8ToString()); + assertEquals("dog", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("fox", te.next().utf8ToString()); + assertEquals("fox", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("jumps", te.next().utf8ToString()); + assertEquals("jumps", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("lazy", te.next().utf8ToString()); + 
assertEquals("lazy", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("over", te.next().utf8ToString()); + assertEquals("over", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); assertEquals(td.nextDoc(), DocsEnum.NO_MORE_DOCS); - assertEquals("the", te.next().utf8ToString()); + assertEquals("the", te.next().bocu1ToString()); td = te.docs(delDocs, td); assertTrue(td.nextDoc() != DocsEnum.NO_MORE_DOCS); assertEquals(0, td.docID()); Index: lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (revision 966583) +++ lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (working copy) @@ -70,14 +70,14 @@ SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); TermsEnum terms = reader.fields().terms("content").iterator(); assertNotNull(terms.next()); - assertEquals("aaa", terms.term().utf8ToString()); + assertEquals("aaa", terms.term().bocu1ToString()); assertNotNull(terms.next()); long ordB = terms.ord(); - assertEquals("bbb", terms.term().utf8ToString()); + assertEquals("bbb", terms.term().bocu1ToString()); assertNull(terms.next()); assertEquals(TermsEnum.SeekStatus.FOUND, terms.seek(ordB)); - assertEquals("bbb", terms.term().utf8ToString()); + assertEquals("bbb", terms.term().bocu1ToString()); } private void verifyDocFreq() @@ -90,12 +90,12 @@ // go to the first term (aaa) termEnum.next(); // assert that term is 'aaa' - assertEquals("aaa", termEnum.term().utf8ToString()); + assertEquals("aaa", termEnum.term().bocu1ToString()); assertEquals(200, termEnum.docFreq()); // go to the second term (bbb) termEnum.next(); // assert that term is 'bbb' - assertEquals("bbb", 
termEnum.term().utf8ToString()); + assertEquals("bbb", termEnum.term().bocu1ToString()); assertEquals(100, termEnum.docFreq()); @@ -103,12 +103,12 @@ // including 'aaa' termEnum.seek(new BytesRef("aaa")); // assert that term is 'aaa' - assertEquals("aaa", termEnum.term().utf8ToString()); + assertEquals("aaa", termEnum.term().bocu1ToString()); assertEquals(200, termEnum.docFreq()); // go to term 'bbb' termEnum.next(); // assert that term is 'bbb' - assertEquals("bbb", termEnum.term().utf8ToString()); + assertEquals("bbb", termEnum.term().bocu1ToString()); assertEquals(100, termEnum.docFreq()); } Index: lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (revision 966583) +++ lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (working copy) @@ -472,7 +472,7 @@ // now compare for (int i=0; i it = seenTerms.iterator(); while(it.hasNext()) { BytesRef tr = new BytesRef(it.next()); - assertEquals("seek failed for term=" + termDesc(tr.utf8ToString()), + assertEquals("seek failed for term=" + termDesc(tr.bocu1ToString()), TermsEnum.SeekStatus.FOUND, terms.seek(tr)); } Index: lucene/src/test/org/apache/lucene/index/TestMultiFields.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestMultiFields.java (revision 966583) +++ lucene/src/test/org/apache/lucene/index/TestMultiFields.java (working copy) @@ -52,7 +52,7 @@ // re-use existing term BytesRef term = terms.get(r.nextInt(terms.size())); docs.get(term).add(i); - f.setValue(term.utf8ToString()); + f.setValue(term.bocu1ToString()); } else { String s = _TestUtil.randomUnicodeString(r, 10); BytesRef term = new BytesRef(s); Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- 
lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 966583) +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -292,7 +292,7 @@ for(int i=0;i 0); Index: lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (revision 966583) +++ lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (working copy) @@ -645,7 +645,7 @@ assertNotNull(t); // content field only has term aaa: - assertEquals("aaa", t.utf8ToString()); + assertEquals("aaa", t.bocu1ToString()); assertNull(terms.next()); BytesRef aaaTerm = new BytesRef("aaa"); Index: lucene/src/test/org/apache/lucene/util/TestNumericUtils.java =================================================================== --- lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (revision 966583) +++ lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (working copy) @@ -31,7 +31,7 @@ NumericUtils.longToPrefixCoded(l, 0, act); if (last!=null) { // test if smaller - assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 ); + assertTrue("actual bigger than last (BytesRef)", BytesRef.getBOCU1SortedAsUnicodeComparator().compare(last, act) < 0 ); assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); } // test is back and forward conversion works @@ -49,7 +49,7 @@ NumericUtils.intToPrefixCoded(i, 0, act); if (last!=null) { // test if smaller - assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 ); + assertTrue("actual bigger than last (BytesRef)", BytesRef.getBOCU1SortedAsUnicodeComparator().compare(last, act) < 0 ); assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); } // test is 
back and forward conversion works @@ -85,7 +85,7 @@ // check sort order (prefixVals should be ascending) for (int i=1; i= 0 - : collator.compare(term.utf8ToString(), lowerTermText) > 0) + ? collator.compare(term.bocu1ToString(), lowerTermText) >= 0 + : collator.compare(term.bocu1ToString(), lowerTermText) > 0) && (upperTermText == null || (includeUpper - ? collator.compare(term.utf8ToString(), upperTermText) <= 0 - : collator.compare(term.utf8ToString(), upperTermText) < 0))) { + ? collator.compare(term.bocu1ToString(), upperTermText) <= 0 + : collator.compare(term.bocu1ToString(), upperTermText) < 0))) { return AcceptStatus.YES; } return AcceptStatus.NO; Index: lucene/src/java/org/apache/lucene/search/FieldComparator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldComparator.java (revision 966583) +++ lucene/src/java/org/apache/lucene/search/FieldComparator.java (working copy) @@ -656,7 +656,7 @@ @Override public int compareBottom(int doc) { - final String val2 = currentDocTerms.getTerm(doc, tempBR).utf8ToString(); + final String val2 = currentDocTerms.getTerm(doc, tempBR).bocu1ToString(); if (bottom == null) { if (val2 == null) { return 0; @@ -674,7 +674,7 @@ if (br == null) { values[slot] = null; } else { - values[slot] = br.utf8ToString(); + values[slot] = br.bocu1ToString(); } } Index: lucene/src/java/org/apache/lucene/search/QueryTermVector.java =================================================================== --- lucene/src/java/org/apache/lucene/search/QueryTermVector.java (revision 966583) +++ lucene/src/java/org/apache/lucene/search/QueryTermVector.java (working copy) @@ -116,7 +116,7 @@ sb.append('{'); for (int i=0; i0) sb.append(", "); - sb.append(terms[i].utf8ToString()).append('/').append(termFreqs[i]); + sb.append(terms[i].bocu1ToString()).append('/').append(termFreqs[i]); } sb.append('}'); return sb.toString(); Index: 
lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 966583) +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy) @@ -298,7 +298,7 @@ if (matchers[i].run(term.bytes, 0, term.length)) { // this sucks, we convert just to score based on length. if (codePointCount == -1) { - codePointCount = UnicodeUtil.codePointCount(term); + codePointCount = UnicodeUtil.codePointCountBOCU1(term); } final float similarity = 1.0f - ((float) i / (float) (Math.min(codePointCount, termLength))); @@ -384,7 +384,7 @@ @Override protected final AcceptStatus accept(BytesRef term) { if (term.startsWith(prefixBytesRef)) { - UnicodeUtil.UTF8toUTF32(term, utf32); + UnicodeUtil.BOCU1toUTF32(term, utf32); final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength); if (similarity > minSimilarity) { boostAtt.setBoost((float)((similarity - minSimilarity) * scale_factor)); Index: lucene/src/java/org/apache/lucene/index/Term.java =================================================================== --- lucene/src/java/org/apache/lucene/index/Term.java (revision 966583) +++ lucene/src/java/org/apache/lucene/index/Term.java (working copy) @@ -89,7 +89,7 @@ /** Returns the text of this term. In the case of words, this is simply the text of the word. In the case of dates and other types, this is an encoding of the object as a string. */ - public final String text() { return bytes.utf8ToString(); } + public final String text() { return bytes.bocu1ToString(); } /** Returns the bytes of this term. 
*/ public final BytesRef bytes() { return bytes; } @@ -165,7 +165,7 @@ @Deprecated private static final Comparator legacyComparator = - BytesRef.getUTF8SortedAsUTF16Comparator(); + BytesRef.getBOCU1SortedAsUTF16Comparator(); /** * @deprecated For internal backwards compatibility use only @@ -198,7 +198,7 @@ } @Override - public final String toString() { return field + ":" + bytes.utf8ToString(); } + public final String toString() { return field + ":" + bytes.bocu1ToString(); } private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException Index: lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 966583) +++ lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy) @@ -130,7 +130,7 @@ // TODO: we may want to make this sort in same order // as Codec's terms dict? 
- final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator()); + final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getBOCU1SortedAsUnicodeComparator()); tvf.writeVInt(numPostings); byte bits = 0x0; Index: lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (revision 966583) +++ lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (working copy) @@ -80,7 +80,7 @@ // Terms dict success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getBOCU1SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -111,7 +111,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator()); + BytesRef.getBOCU1SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -126,7 +126,7 @@ state.dir, state.fieldInfos, state.segmentInfo.name, pulsingReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), + BytesRef.getBOCU1SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (revision 966583) +++ lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (working copy) @@ -63,7 +63,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, 
postingsWriter, BytesRef.getBOCU1SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -95,7 +95,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator()); + BytesRef.getBOCU1SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -111,7 +111,7 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), + BytesRef.getBOCU1SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (revision 966583) +++ lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (working copy) @@ -58,7 +58,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getBOCU1SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -85,7 +85,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator()); + BytesRef.getBOCU1SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -101,7 +101,7 @@ state.segmentInfo.name, postings, state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), + BytesRef.getBOCU1SortedAsUnicodeComparator(), TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (revision 966583) +++ 
lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (working copy) @@ -67,7 +67,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getBOCU1SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -95,7 +95,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator()); + BytesRef.getBOCU1SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -111,7 +111,7 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), + BytesRef.getBOCU1SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 966583) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy) @@ -228,7 +228,7 @@ public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - return BytesRef.getUTF8SortedAsUnicodeComparator(); + return BytesRef.getBOCU1SortedAsUnicodeComparator(); } } @@ -468,7 +468,7 @@ public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - return BytesRef.getUTF8SortedAsUnicodeComparator(); + return BytesRef.getBOCU1SortedAsUnicodeComparator(); } @Override @@ -484,7 +484,7 @@ @Override public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { if (DEBUG_SURROGATES) { - System.out.println("TE.seek() term=" + term.utf8ToString()); + System.out.println("TE.seek() 
term=" + new String(term.bytes, term.offset, term.length, "UTF-8")); } skipNext = false; final TermInfosReader tis = getTermsDict(); Index: lucene/src/java/org/apache/lucene/util/NumericUtils.java =================================================================== --- lucene/src/java/org/apache/lucene/util/NumericUtils.java (revision 966583) +++ lucene/src/java/org/apache/lucene/util/NumericUtils.java (working copy) @@ -54,7 +54,7 @@ * for the same data types. * *

This class can also be used, to generate lexicographically sortable (according to - * {@link BytesRef#getUTF8SortedAsUTF16Comparator()}) representations of numeric data + * {@link BytesRef#getBOCU1SortedAsUTF16Comparator()}) representations of numeric data * types for other usages (e.g. sorting). * * @lucene.internal Index: lucene/src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 966583) +++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -79,7 +79,7 @@ } /** - * @param text Initialize the byte[] from the UTF8 bytes + * @param text Initialize the byte[] from the BOCU1 bytes * for the provided array. This must be well-formed * unicode text, with no unpaired surrogates or U+FFFF. */ @@ -107,23 +107,23 @@ */ /** - * Copies the UTF8 bytes for this string. + * Copies the BOCU1 bytes for this string. * * @param text Must be well-formed unicode text, with no * unpaired surrogates or invalid UTF16 code units. */ public void copy(CharSequence text) { - UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this); + UnicodeUtil.UTF16toBOCU1(text, 0, text.length(), this); } /** - * Copies the UTF8 bytes for this string. + * Copies the BOCU1 bytes for this string. * * @param text Must be well-formed unicode text, with no * unpaired surrogates or invalid UTF16 code units. 
*/ public void copy(char text[], int offset, int length) { - UnicodeUtil.UTF16toUTF8(text, offset, length, this); + UnicodeUtil.UTF16toBOCU1(text, offset, length, this); } public boolean bytesEquals(BytesRef other) { if (length == other.length) { @@ -198,13 +198,19 @@ /** Interprets stored bytes as UTF8 bytes, returning the * resulting string */ + public String bocu1ToString() { + UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result(); + UnicodeUtil.BOCU1toUTF16(bytes, offset, length, result); + return result.toString(); + } + + /** @deprecated do your string/byte conversion some other way */ + @Deprecated public String utf8ToString() { try { return new String(bytes, offset, length, "UTF-8"); - } catch (UnsupportedEncodingException uee) { - // should not happen -- UTF8 is presumably supported - // by all JREs - throw new RuntimeException(uee); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); } } @@ -258,15 +264,15 @@ return this.length - other.length; } - private final static Comparator utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator(); + private final static Comparator bocu1SortedAsUnicodeSortOrder = new BOCU1SortedAsUnicodeComparator(); - public static Comparator getUTF8SortedAsUnicodeComparator() { - return utf8SortedAsUnicodeSortOrder; + public static Comparator getBOCU1SortedAsUnicodeComparator() { + return bocu1SortedAsUnicodeSortOrder; } - private static class UTF8SortedAsUnicodeComparator implements Comparator { + private static class BOCU1SortedAsUnicodeComparator implements Comparator { // Only singleton - private UTF8SortedAsUnicodeComparator() {}; + private BOCU1SortedAsUnicodeComparator() {}; public int compare(BytesRef a, BytesRef b) { final byte[] aBytes = a.bytes; @@ -296,55 +302,18 @@ } } - private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); + private final static Comparator bocu1SortedAsUTF16SortOrder = new BOCU1SortedAsUTF16Comparator(); - public 
static Comparator getUTF8SortedAsUTF16Comparator() { - return utf8SortedAsUTF16SortOrder; + public static Comparator getBOCU1SortedAsUTF16Comparator() { + return bocu1SortedAsUTF16SortOrder; } - private static class UTF8SortedAsUTF16Comparator implements Comparator { + private static class BOCU1SortedAsUTF16Comparator implements Comparator { // Only singleton - private UTF8SortedAsUTF16Comparator() {}; + private BOCU1SortedAsUTF16Comparator() {}; public int compare(BytesRef a, BytesRef b) { - - final byte[] aBytes = a.bytes; - int aUpto = a.offset; - final byte[] bBytes = b.bytes; - int bUpto = b.offset; - - final int aStop; - if (a.length < b.length) { - aStop = aUpto + a.length; - } else { - aStop = aUpto + b.length; - } - - while(aUpto < aStop) { - int aByte = aBytes[aUpto++] & 0xff; - int bByte = bBytes[bUpto++] & 0xff; - - if (aByte != bByte) { - - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - } - - // One is a prefix of the other, or, they are equal: - return a.length - b.length; + return a.bocu1ToString().compareTo(b.bocu1ToString()); } public boolean equals(Object other) { Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 966583) +++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -1,5 +1,14 @@ package org.apache.lucene.util; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import 
java.nio.charset.CharsetEncoder; + +import com.ibm.icu.charset.CharsetICU; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -132,6 +141,10 @@ other.getChars(0, otherLength, result, 0); length = otherLength; } + + public String toString() { + return new String(result, 0, length); + } } /** Encode characters from a char[] source, starting at @@ -701,4 +714,93 @@ } return sb.toString(); } + + /** Encode characters from a char[] source, starting at + * offset for length chars. Returns a hash of the resulting bytes. After encoding, result.offset will always be 0. */ + public static int UTF16toBOCU1WithHash(final char[] source, final int offset, final int length, BytesRef result) { + encode(bocu1, CharBuffer.wrap(source, offset, length), result); + return result.hashCode(); + } + + /** Encode characters from a char[] source, starting at + * offset for length chars. After encoding, result.offset will always be 0. + */ + public static void UTF16toBOCU1(final char[] source, final int offset, final int length, BytesRef result) { + encode(bocu1, CharBuffer.wrap(source, offset, length), result); + } + + + /** Encode characters from this String, starting at offset + * for length characters. After encoding, result.offset will always be 0. + */ + public static void UTF16toBOCU1(final CharSequence source, final int offset, final int length, BytesRef result) { + encode(bocu1, CharBuffer.wrap(source, offset, length), result); + } + + /** Convert BOCU1 bytes into UTF16 characters. If offset + * is non-zero, conversion starts at that starting point + * in utf8, re-using the results from the previous call + * up until offset. */ + public static void BOCU1toUTF16(final byte[] source, final int offset, final int length, final UTF16Result result) { + decode(bocu1, ByteBuffer.wrap(source, offset, length), result); + } + + /** Returns the number of code points in this bocu1 + * sequence. 
Behavior is undefined if the bocu1 sequence + * is invalid.*/ + public static int codePointCountBOCU1(BytesRef source) { + UTF16Result result = new UTF16Result(); + decode(bocu1, ByteBuffer.wrap(source.bytes, source.offset, source.length), result); + String s = result.toString(); + return s.codePointCount(0, s.length()); + } + + public static void BOCU1toUTF32(final BytesRef source, final IntsRef utf32) { + UTF16Result result = new UTF16Result(); + decode(bocu1, ByteBuffer.wrap(source.bytes, source.offset, source.length), result); + String s = result.toString(); + int codePointCount = s.codePointCount(0, s.length()); + utf32.grow(codePointCount); + utf32.offset = 0; + utf32.length = codePointCount; + for (int i = 0, j = 0; i < s.length();) { + int ch = s.codePointAt(i); + utf32.ints[j++] = ch; + i += Character.charCount(ch); + } + } + + /** crappy impl below */ + + private static final Charset bocu1 = CharsetICU.forNameICU("BOCU-1"); + + private static void decode(Charset charset, ByteBuffer bb, UTF16Result result) { + CharsetDecoder decoder = charset.newDecoder(); + CharBuffer buffer; + try { + buffer = decoder.decode(bb); + } catch (CharacterCodingException e) { + throw new RuntimeException(e); + } + result.copyText(buffer.toString()); + } + + private static void encode(Charset charset, CharBuffer cb, BytesRef result) { + CharsetEncoder encoder = charset.newEncoder(); + ByteBuffer buffer; + try { + buffer = encoder.encode(cb); + } catch (CharacterCodingException e) { + throw new RuntimeException(e); + } + if (buffer.hasArray()) { + result.bytes = buffer.array(); + result.offset = buffer.arrayOffset() + buffer.position(); + result.length = buffer.remaining(); + } else { + final int remaining = buffer.remaining(); + result.grow(remaining); + buffer.get(result.bytes, result.offset = 0, result.length = remaining); + } + } } Index: lucene/src/java/com/ibm/icu/charset/CharsetDecoderICU.java =================================================================== --- 
lucene/src/java/com/ibm/icu/charset/CharsetDecoderICU.java (revision 0) +++ lucene/src/java/com/ibm/icu/charset/CharsetDecoderICU.java (revision 0) @@ -0,0 +1,723 @@ +/** +******************************************************************************* +* Copyright (C) 2006-2010, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; + +/** + * An abstract class that provides framework methods of decoding operations for concrete + * subclasses. + * In the future this class will contain API that will implement converter sematics of ICU4C. + * @stable ICU 3.6 + */ +public abstract class CharsetDecoderICU extends CharsetDecoder{ + + int toUnicodeStatus; + byte[] toUBytesArray = new byte[128]; + int toUBytesBegin = 0; + int toULength; + char[] charErrorBufferArray = new char[128]; + int charErrorBufferLength; + int charErrorBufferBegin; + char[] invalidCharBuffer = new char[128]; + int invalidCharLength; + + /** + * Maximum number of indexed bytes + * @internal + * @deprecated This API is ICU internal only. 
+ */ + protected static final int EXT_MAX_BYTES = 0x1f; + + /* store previous UChars/chars to continue partial matches */ + byte[] preToUArray = new byte[EXT_MAX_BYTES]; + int preToUBegin; + int preToULength; /* negative: replay */ + int preToUFirstLength; /* length of first character */ + int mode; + + Object toUContext = null; + private CharsetCallback.Decoder onUnmappableCharacter = CharsetCallback.TO_U_CALLBACK_STOP; + private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP; + CharsetCallback.Decoder toCharErrorBehaviour = new CharsetCallback.Decoder() { + public CoderResult call(CharsetDecoderICU decoder, Object context, ByteBuffer source, + CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr) { + if (cr.isUnmappable()) { + return onUnmappableCharacter.call(decoder, context, source, target, offsets, buffer, + length, cr); + } else /* if (cr.isMalformed()) */ { + return onMalformedInput.call(decoder, context, source, target, offsets, buffer, + length, cr); + } + // return CharsetCallback.TO_U_CALLBACK_STOP.call(decoder, context, source, target, offsets, buffer, length, cr); + } + }; + + // exist to keep implOnMalformedInput and implOnUnmappableInput from being too recursive + private boolean malformedInputCalled = false; + private boolean unmappableCharacterCalled = false; + + /* + * Construct a CharsetDecorderICU based on the information provided from a CharsetICU object. + * + * @param cs The CharsetICU object containing information about how to charset to decode. + */ + CharsetDecoderICU(CharsetICU cs) { + super(cs, (1/cs.maxCharsPerByte), cs.maxCharsPerByte); + } + + /* + * Is this Decoder allowed to use fallbacks? A fallback mapping is a mapping + * that will convert a byte sequence to a Unicode codepoint sequence, but + * the encoded Unicode codepoint sequence will round trip convert to a different + * byte sequence. In ICU, this is can be called a reverse fallback. 
+ * @return A boolean + */ + final boolean isFallbackUsed() { + return true; + } + + /** + * Fallback is currently always used by icu4j decoders. + */ + static final boolean isToUUseFallback() { + return isToUUseFallback(true); + } + + /** + * Fallback is currently always used by icu4j decoders. + */ + static final boolean isToUUseFallback(boolean iUseFallback) { + return true; + } + + /** + * Sets the action to be taken if an illegal sequence is encountered + * + * @param newAction action to be taken + * @exception IllegalArgumentException + * @stable ICU 3.6 + */ + protected final void implOnMalformedInput(CodingErrorAction newAction) { + // don't run infinitely + if (malformedInputCalled) + return; + + // if we get a replace, do not let the nio replace + if (newAction == CodingErrorAction.REPLACE) { + malformedInputCalled = true; + super.onMalformedInput(CodingErrorAction.IGNORE); + malformedInputCalled = false; + } + + onMalformedInput = getCallback(newAction); + } + + /** + * Sets the action to be taken if an illegal sequence is encountered + * + * @param newAction action to be taken + * @exception IllegalArgumentException + * @stable ICU 3.6 + */ + protected final void implOnUnmappableCharacter(CodingErrorAction newAction) { + // dont run infinitely + if (unmappableCharacterCalled) + return; + + // if we get a replace, do not let the nio replace + if (newAction == CodingErrorAction.REPLACE) { + unmappableCharacterCalled = true; + super.onUnmappableCharacter(CodingErrorAction.IGNORE); + unmappableCharacterCalled = false; + } + + onUnmappableCharacter = getCallback(newAction); + } + + /** + * Sets the callback encoder method and context to be used if an illegal sequence is encounterd. + * You would normally call this twice to set both the malform and unmappable error. In this case, + * newContext should remain the same since using a different newContext each time will negate the last + * one used. 
+ * @param err CoderResult + * @param newCallback CharsetCallback.Encoder + * @param newContext Object + * @stable ICU 4.0 + */ + public final void setToUCallback(CoderResult err, CharsetCallback.Decoder newCallback, Object newContext) { + if (err.isMalformed()) { + onMalformedInput = newCallback; + } else if (err.isUnmappable()) { + onUnmappableCharacter = newCallback; + } else { + /* Error: Only malformed and unmappable are handled. */ + } + + if (toUContext == null || !toUContext.equals(newContext)) { + toUContext = newContext; + } + } + + private static CharsetCallback.Decoder getCallback(CodingErrorAction action){ + if(action==CodingErrorAction.REPLACE){ + return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE; + }else if(action==CodingErrorAction.IGNORE){ + return CharsetCallback.TO_U_CALLBACK_SKIP; + }else /* if(action==CodingErrorAction.REPORT) */ { + return CharsetCallback.TO_U_CALLBACK_STOP; + } + } + private final ByteBuffer EMPTY = ByteBuffer.allocate(0); + /** + * Flushes any characters saved in the converter's internal buffer and + * resets the converter. + * @param out action to be taken + * @return result of flushing action and completes the decoding all input. + * Returns CoderResult.UNDERFLOW if the action succeeds. + * @stable ICU 3.6 + */ + protected final CoderResult implFlush(CharBuffer out) { + return decode(EMPTY, out, null, true); + } + + /** + * Resets the to Unicode mode of converter + * @stable ICU 3.6 + */ + protected void implReset() { + toUnicodeStatus = 0 ; + toULength = 0; + charErrorBufferLength = 0; + charErrorBufferBegin = 0; + + /* store previous UChars/chars to continue partial matches */ + preToUBegin = 0; + preToULength = 0; /* negative: replay */ + preToUFirstLength = 0; + + mode = 0; + } + + /** + * Decodes one or more bytes. The default behaviour of the converter + * is stop and report if an error in input stream is encountered. 
+ * To set different behaviour use @see CharsetDecoder.onMalformedInput() + * This method allows a buffer by buffer conversion of a data stream. + * The state of the conversion is saved between calls to convert. + * Among other things, this means multibyte input sequences can be + * split between calls. If a call to convert results in an Error, the + * conversion may be continued by calling convert again with suitably + * modified parameters.All conversions should be finished with a call to + * the flush method. + * @param in buffer to decode + * @param out buffer to populate with decoded result + * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding + * action succeeds or more input is needed for completing the decoding action. + * @stable ICU 3.6 + */ + protected CoderResult decodeLoop(ByteBuffer in,CharBuffer out){ + if(in.remaining() < toUCountPending()){ + return CoderResult.UNDERFLOW; + } +// if (!in.hasRemaining()) { +// toULength = 0; +// return CoderResult.UNDERFLOW; +// } + + in.position(in.position() + toUCountPending()); + + /* do the conversion */ + CoderResult ret = decode(in, out, null, false); + + // ok was there input held in the previous invocation of decodeLoop + // that resulted in output in this invocation? + in.position(in.position() - toUCountPending()); + + return ret; + } + + /* + * Implements the ICU semantic for decode operation + * @param in The input byte buffer + * @param out The output character buffer + * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding + * action succeeds or more input is needed for completing the decoding action. 
+ */ + abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets, boolean flush); + + /* + * Implements the ICU semantic for decode operation + * @param source The input byte buffer + * @param target The output character buffer + * @param offsets + * @param flush true if, and only if, the invoker can provide no + * additional input bytes beyond those in the given buffer. + * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding + * action succeeds or more input is needed for completing the decoding action. + */ + final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { + + /* check parameters */ + if (target == null || source == null) { + throw new IllegalArgumentException(); + } + + /* + * Make sure that the buffer sizes do not exceed the number range for + * int32_t because some functions use the size (in units or bytes) + * rather than comparing pointers, and because offsets are int32_t values. + * + * size_t is guaranteed to be unsigned and large enough for the job. + * + * Return with an error instead of adjusting the limits because we would + * not be able to maintain the semantics that either the source must be + * consumed or the target filled (unless an error occurs). + * An adjustment would be sourceLimit=t+0x7fffffff; for example. 
+ */ + /*agljport:fix + if( + ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) || + ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t) + ) { + *err=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + */ + + /* flush the target overflow buffer */ + if (charErrorBufferLength > 0) { + int i = 0; + do { + if (!target.hasRemaining()) { + /* the overflow buffer contains too much, keep the rest */ + int j = 0; + + do { + charErrorBufferArray[j++] = charErrorBufferArray[i++]; + } while (i < charErrorBufferLength); + + charErrorBufferLength = (byte) j; + return CoderResult.OVERFLOW; + } + + /* copy the overflow contents to the target */ + target.put(charErrorBufferArray[i++]); + if (offsets != null) { + offsets.put(-1); /* no source index available for old output */ + } + } while (i < charErrorBufferLength); + + /* the overflow buffer is completely copied to the target */ + charErrorBufferLength = 0; + } + + if (!flush && !source.hasRemaining() && toULength == 0 && preToULength >= 0) { + /* the overflow buffer is emptied and there is no new input: we are done */ + return CoderResult.UNDERFLOW; + } + + /* + * Do not simply return with a buffer overflow error if + * !flush && t==targetLimit + * because it is possible that the source will not generate any output. + * For example, the skip callback may be called; + * it does not output anything. + */ + + return toUnicodeWithCallback(source, target, offsets, flush); + } + + /* Currently, we are not using offsets in ICU4J. 
*/ + /* private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) { + int limit; + int delta, offset; + + if(sourceIndex>=0) { + /* + * adjust each offset by adding the previous sourceIndex + * minus the length of the input sequence that caused an + * error, if any + */ + /* delta=sourceIndex-errorInputLength; + } else { + /* + * set each offset to -1 because this conversion function + * does not handle offsets + */ + /* delta=-1; + } + limit=offsets.position()+length; + if(delta==0) { + /* most common case, nothing to do */ + /* } else if(delta>0) { + /* add the delta to each offset (but not if the offset is <0) */ + /* while(offsets.position()=0) { + offsets.put(offset+delta); + } + //FIXME: ++offsets; + } + } else /* delta<0 */ /* { + /* + * set each offset to -1 because this conversion function + * does not handle offsets + * or the error input sequence started in a previous buffer + */ + /* while(offsets.position()=0) { + /* normal mode */ + } else { + /* + * Previous m:n conversion stored source units from a partial match + * and failed to consume all of them. + * We need to "replay" them from a temporary buffer and convert them first. 
+ */ + realSource=source; + realFlush=flush; + realSourceIndex=sourceIndex; + //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength); + replayArray.put(preToUArray,0, -preToULength); + source=replayArray; + source.position(0); + source.limit(replayArrayIndex-preToULength); + flush=false; + sourceIndex=-1; + preToULength=0; + } + + /* + * loop for conversion and error handling + * + * loop { + * convert + * loop { + * update offsets + * handle end of input + * handle errors/call callback + * } + * } + */ + for(;;) { + + /* convert */ + cr = decodeLoop(source, target, offsets, flush); + + /* + * set a flag for whether the converter + * successfully processed the end of the input + * + * need not check cnv->preToULength==0 because a replay (<0) will cause + * s0) { + updateOffsets(offsets, length, sourceIndex, errorInputLength); + + + /* + * if a converter handles offsets and updates the offsets + * pointer at the end, then pArgs->offset should not change + * here; + * however, some converters do not handle offsets at all + * (sourceIndex<0) or may not update the offsets pointer + */ + //TODO: pArgs->offsets=offsets+=length; + /* } + + if(sourceIndex>=0) { + sourceIndex+=(source.position()-s); + } + + } */ + + if(preToULength<0) { + /* + * switch the source to new replay units (cannot occur while replaying) + * after offset handling and before end-of-input and callback handling + */ + if(realSource==null) + { + realSource=source; + realFlush=flush; + realSourceIndex=sourceIndex; + + //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength); + replayArray.put(preToUArray,0, -preToULength); + // reset position + replayArray.position(0); + + source=replayArray; + source.limit(replayArrayIndex-preToULength); + flush=false; + if((sourceIndex+=preToULength)<0) { + sourceIndex=-1; + } + + preToULength=0; + } else { + /* see implementation note before _fromUnicodeWithCallback() */ + 
//agljport:todo U_ASSERT(realSource==NULL); + assert (realSource==null); + } + } + + /* update pointers */ + s=source.position(); + //t=target.position(); + + if(cr.isUnderflow()) { + if(s0) { + /* + * the entire input stream is consumed + * and there is a partial, truncated input sequence left + */ + + /* inject an error and continue with callback handling */ + cr = CoderResult.malformedForLength(toULength); + calledCallback=false; /* new error condition */ + } else { + /* input consumed */ + if(flush) { + /* + * return to the conversion loop once more if the flush + * flag is set and the conversion function has not + * successfully processed the end of the input yet + * + * (continue converting by breaking out of only the inner loop) + */ + if(!converterSawEndOfInput) { + break; + } + + /* reset the converter without calling the callback function */ + implReset(); + } + + /* done successfully */ + return cr; + } + } + + /* U_FAILURE(*err) */ + { + + if( calledCallback || cr.isOverflow() || + (cr.isMalformed() && cr.isUnmappable()) + ) { + /* + * the callback did not or cannot resolve the error: + * set output pointers and return + * + * the check for buffer overflow is redundant but it is + * a high-runner case and hopefully documents the intent + * well + * + * if we were replaying, then the replay buffer must be + * copied back into the UConverter + * and the real arguments must be restored + */ + if(realSource!=null) { + int length; + assert (preToULength==0); + length = source.limit() - source.position(); + if(length>0) { + //UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length); + source.get(preToUArray, preToUBegin, length); + preToULength=(byte)-length; + } + + source=realSource; + flush=realFlush; + } + return cr; + } + } + + /* copy toUBytes[] to invalidCharBuffer[] */ + errorInputLength=invalidCharLength=toULength; + if(errorInputLength>0) { + copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength); 
+ } + + /* set the converter state to deal with the next character */ + toULength=0; + + /* call the callback function */ + cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr); + /* + * loop back to the offset handling + * + * this flag will indicate after offset handling + * that a callback was called; + * if the callback did not resolve the error, then we return + */ + calledCallback=true; + } + } + } + + /* + * Returns the number of chars held in the converter's internal state + * because more input is needed for completing the conversion. This function is + * useful for mapping semantics of ICU's converter interface to those of iconv, + * and this information is not needed for normal conversion. + * @return The number of chars in the state. -1 if an error is encountered. + */ + /*public*/ int toUCountPending() { + if(preToULength > 0){ + return preToULength ; + } else if(preToULength < 0){ + return -preToULength; + } else if(toULength > 0){ + return toULength; + } else { + return 0; + } + } + + + private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) { + for(int i=srcOffset; i0 && target.hasRemaining()) { + target.put(ucharsArray[ucharsBegin++]); + --length; + } + + } else { + /* output with offsets */ + while(length>0 && target.hasRemaining()) { + target.put(ucharsArray[ucharsBegin++]); + offsets.put(sourceIndex); + --length; + } + } + /* write overflow */ + if(length>0) { + cnv.charErrorBufferLength= 0; + cr = CoderResult.OVERFLOW; + do { + cnv.charErrorBufferArray[cnv.charErrorBufferLength++]=ucharsArray[ucharsBegin++]; + } while(--length>0); + } + return cr; + } + /* + * This function will write out the Unicode substitution character to the + * target character buffer. 
+ * Sub classes to override this method if required + * @param decoder + * @param source + * @param target + * @param offsets + * @return A CoderResult object that contains the error result when an error occurs. + */ + /* Note: Currently, this method is not being used because the callback method calls toUWriteUChars with + * the substitution characters. Will leave in here for the time being. To be removed later. (4.0) + */ + /*CoderResult cbToUWriteSub(CharsetDecoderICU decoder, + ByteBuffer source, CharBuffer target, + IntBuffer offsets){ + String sub = decoder.replacement(); + CharsetICU cs = (CharsetICU) decoder.charset(); + if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) { + char[] subArr = new char[] { 0x1a }; + return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0, sub + .length(), target, offsets, source.position()); + } else { + return CharsetDecoderICU.toUWriteUChars(decoder, sub.toCharArray(), + 0, sub.length(), target, offsets, source.position()); + + } + }*/ +} Property changes on: lucene\src\java\com\ibm\icu\charset\CharsetDecoderICU.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/com/ibm/icu/charset/CharsetBOCU1.java =================================================================== --- lucene/src/java/com/ibm/icu/charset/CharsetBOCU1.java (revision 0) +++ lucene/src/java/com/ibm/icu/charset/CharsetBOCU1.java (revision 0) @@ -0,0 +1,1054 @@ +/* + ******************************************************************************* + * Copyright (C) 2008-2010, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +package com.ibm.icu.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * @author krajwade + * + */ +class CharsetBOCU1 extends CharsetICU { + /* BOCU constants and macros */ + + /* initial value for "prev": middle of the ASCII range */ + private static final byte BOCU1_ASCII_PREV = 0x40; + + /* bounding byte values for differences */ + private static final int BOCU1_MIN = 0x21; + private static final int BOCU1_MIDDLE = 0x90; + //private static final int BOCU1_MAX_LEAD = 0xfe; + private static final int BOCU1_MAX_TRAIL = 0xff; + private static final int BOCU1_RESET = 0xff; + + /* number of lead bytes */ + //private static final int BOCU1_COUNT = (BOCU1_MAX_LEAD-BOCU1_MIN+1); + + /* adjust trail byte counts for the use of some C0 control byte values */ + private static final int BOCU1_TRAIL_CONTROLS_COUNT = 20; + private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT); + + /* number of trail bytes */ + private static final int BOCU1_TRAIL_COUNT =((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT); + + /* + * number of positive and negative single-byte codes + * (counting 0==BOCU1_MIDDLE among the positive ones) + */ + private static final int BOCU1_SINGLE = 64; + + /* number of lead bytes for positive and negative 2/3/4-byte sequences */ + private static final int BOCU1_LEAD_2 = 43; + private static final int BOCU1_LEAD_3 = 3; + //private static final int BOCU1_LEAD_4 = 1; + + /* The difference value range for single-byters. */ + private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE-1); + private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE); + + /* The difference value range for double-byters. 
*/ + private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT); + private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT); + + /* The difference value range for 3-byters. */ + private static final int BOCU1_REACH_POS_3 = + (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); + + private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); + + /* The lead byte start values. */ + private static final int BOCU1_START_POS_2 = (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1); + private static final int BOCU1_START_POS_3 = (BOCU1_START_POS_2+BOCU1_LEAD_2); + private static final int BOCU1_START_POS_4 = (BOCU1_START_POS_3+BOCU1_LEAD_3); + /* ==BOCU1_MAX_LEAD */ + + private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE+BOCU1_REACH_NEG_1); + private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2-BOCU1_LEAD_2); + //private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3-BOCU1_LEAD_3); + /* ==BOCU1_MIN+1 */ + + /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ + /* private static int BOCU1_LENGTH_FROM_LEAD(int lead) { + return ((BOCU1_START_NEG_2<=(lead) && (lead)>24 : 4); + } + + /* + * Byte value map for control codes, + * from external byte values 0x00..0x20 + * to trail byte values 0..19 (0..0x13) as used in the difference calculation. + * External byte values that are illegal as trail bytes are mapped to -1. 
+ */ + private static final int[] + bocu1ByteToTrail={ + /* 0 1 2 3 4 5 6 7 */ + -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, + + /* 8 9 a b c d e f */ + -1, -1, -1, -1, -1, -1, -1, -1, + + /* 10 11 12 13 14 15 16 17 */ + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + + /* 18 19 1a 1b 1c 1d 1e 1f */ + 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, + + /* 20 */ + -1 + }; + + /* + * Byte value map for control codes, + * from trail byte values 0..19 (0..0x13) as used in the difference calculation + * to external byte values 0x00..0x20. + */ + private static final int[] + bocu1TrailToByte = { + /* 0 1 2 3 4 5 6 7 */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, + + /* 8 9 a b c d e f */ + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + + /* 10 11 12 13 */ + 0x1c, 0x1d, 0x1e, 0x1f + }; + + + /* + * 12 commonly used C0 control codes (and space) are only used to encode + * themselves directly, + * which makes BOCU-1 MIME-usable and reasonably safe for + * ASCII-oriented software. + * + * These controls are + * 0 NUL + * + * 7 BEL + * 8 BS + * + * 9 TAB + * a LF + * b VT + * c FF + * d CR + * + * e SO + * f SI + * + * 1a SUB + * 1b ESC + * + * The other 20 C0 controls are also encoded directly (to preserve order) + * but are also used as trail bytes in difference encoding + * (for better compression). + */ + private static int BOCU1_TRAIL_TO_BYTE(int trail) { + return ((trail)>=BOCU1_TRAIL_CONTROLS_COUNT ? (trail)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]); + } + + /* BOCU-1 implementation functions ------------------------------------------ */ + private static int BOCU1_SIMPLE_PREV(int c){ + return (((c)&~0x7f)+BOCU1_ASCII_PREV); + } + + /** + * Compute the next "previous" value for differencing + * from the current code point. 
+ * + * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) + * @return "previous code point" state value + */ + private static int bocu1Prev(int c) { + /* compute new prev */ + if(/* 0x3040<=c && */ c<=0x309f) { + /* Hiragana is not 128-aligned */ + return 0x3070; + } else if(0x4e00<=c && c<=0x9fa5) { + /* CJK Unihan */ + return 0x4e00-BOCU1_REACH_NEG_2; + } else if(0xac00<=c /* && c<=0xd7a3 */) { + /* Korean Hangul */ + return (0xd7a3+0xac00)/2; + } else { + /* mostly small scripts */ + return BOCU1_SIMPLE_PREV(c); + } + } + + /** Fast version of bocu1Prev() for most scripts. */ + private static int BOCU1_PREV(int c) { + return ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)); + } + + protected byte[] fromUSubstitution = new byte[]{(byte)0x1A}; + + /* Faster versions of packDiff() for single-byte-encoded diff values. */ + + /** Is a diff value encodable in a single byte? */ + private static boolean DIFF_IS_SINGLE(int diff){ + return (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1); + } + + /** Encode a diff value in a single byte. */ + private static int PACK_SINGLE_DIFF(int diff){ + return (BOCU1_MIDDLE+(diff)); + } + + /** Is a diff value encodable in two bytes? 
*/ + private static boolean DIFF_IS_DOUBLE(int diff){ + return (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2); + } + + public CharsetBOCU1(String icuCanonicalName, String javaCanonicalName, String[] aliases){ + super(icuCanonicalName, javaCanonicalName, aliases); + maxBytesPerChar = 4; + minBytesPerChar = 1; + maxCharsPerByte = 1; + } + + class CharsetEncoderBOCU extends CharsetEncoderICU { + public CharsetEncoderBOCU(CharsetICU cs) { + super(cs,fromUSubstitution); + } + + int sourceIndex, nextSourceIndex; + int prev, c , diff; + boolean checkNegative; + boolean LoopAfterTrail; + int targetCapacity; + CoderResult cr; + + /* label values for supporting behavior similar to goto in C */ + private static final int fastSingle=0; + private static final int getTrail=1; + private static final int regularLoop=2; + + private boolean LabelLoop; //used to break the while loop + private int labelType = fastSingle; //labeType is set to fastSingle to start the code from fastSingle: + + /** + * Integer division and modulo with negative numerators + * yields negative modulo results and quotients that are one more than + * what we need here. + * This macro adjust the results so that the modulo-value m is always >=0. + * + * For positive n, the if() condition is always FALSE. + * + * @param n Number to be split into quotient and rest. + * Will be modified to contain the quotient. + * @param d Divisor. + * @param m Output variable for the rest (modulo result). + */ + private int NEGDIVMOD(int n, int d, int m) { + diff = n; + (m)=(diff)%(d); + (diff)/=(d); + if((m)<0) { + --(diff); + (m)+=(d); + } + return m; + } + + /** + * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes + * and return a packed integer with them. + * + * The encoding favors small absolute differences with short encodings + * to compress runs of same-script characters. + * + * Optimized version with unrolled loops and fewer floating-point operations + * than the standard packDiff(). 
+ * + * @param diff difference value -0x10ffff..0x10ffff + * @return + * 0x010000zz for 1-byte sequence zz + * 0x0200yyzz for 2-byte sequence yy zz + * 0x03xxyyzz for 3-byte sequence xx yy zz + * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) + */ + private int packDiff(int n) { + int result, m = 0; + diff = n; + + if(diff>=BOCU1_REACH_NEG_1) { + /* mostly positive differences, and single-byte negative ones */ + if(diff<=BOCU1_REACH_POS_2) { + /* two bytes */ + diff-=BOCU1_REACH_POS_1+1; + result=0x02000000; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m); + + result|=(BOCU1_START_POS_2+diff)<<8; + } else if(diff<=BOCU1_REACH_POS_3) { + /* three bytes */ + diff-=BOCU1_REACH_POS_2+1; + result=0x03000000; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m); + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + result|=(BOCU1_START_POS_3+diff)<<16; + } else { + /* four bytes */ + diff-=BOCU1_REACH_POS_3+1; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result=BOCU1_TRAIL_TO_BYTE(m); + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + /* + * We know that / and % would deliver quotient 0 and rest=diff. + * Avoid division and modulo for performance. 
+ */ + result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; + + result|=((BOCU1_START_POS_4&UConverterConstants.UNSIGNED_INT_MASK))<<24; + } + } else { + /* two- to four-byte negative differences */ + if(diff>=BOCU1_REACH_NEG_2) { + /* two bytes */ + diff-=BOCU1_REACH_NEG_1; + result=0x02000000; + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m); + + result|=(BOCU1_START_NEG_2+diff)<<8; + } else if(diff>=BOCU1_REACH_NEG_3) { + /* three bytes */ + diff-=BOCU1_REACH_NEG_2; + result=0x03000000; + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m); + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + result|=(BOCU1_START_NEG_3+diff)<<16; + } else { + /* four bytes */ + diff-=BOCU1_REACH_NEG_3; + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result=BOCU1_TRAIL_TO_BYTE(m); + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + /* + * We know that NEGDIVMOD would deliver + * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. + * Avoid division and modulo for performance. + */ + m=diff+BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<16; + + result|=BOCU1_MIN<<24; + } + } + return result; + } + + protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ + cr = CoderResult.UNDERFLOW; + + LabelLoop = true; //used to break the while loop + checkNegative = false; // its value is set to true to get out of while loop when c = -c + LoopAfterTrail = false; // its value is set to true to ignore code before getTrail: + + /*set up the local pointers*/ + targetCapacity = target.limit() - target.position(); + c = fromUChar32; + prev = fromUnicodeStatus; + + if(prev==0){ + prev = BOCU1_ASCII_PREV; + } + + /*sourceIndex ==-1 if the current characte began in the previous buffer*/ + sourceIndex = c == 0 ? 
0: -1; + nextSourceIndex = 0; + + /*conversion loop*/ + if(c!=0 && targetCapacity>0){ + labelType = getTrail; + } + + while(LabelLoop){ + switch(labelType){ + case fastSingle: + labelType = fastSingle(source, target, offsets); + break; + case getTrail: + labelType = getTrail(source, target, offsets); + break; + case regularLoop: + labelType = regularLoop(source, target, offsets); + break; + } + } + + return cr; + } + + private int fastSingle(CharBuffer source, ByteBuffer target, IntBuffer offsets){ +//fastSingle: + /*fast loop for single-byte differences*/ + /*use only one loop counter variable , targetCapacity, not also source*/ + diff = source.limit() - source.position(); + if(targetCapacity>diff){ + targetCapacity = diff; + } + while(targetCapacity>0 && (c=source.get(source.position()))<0x3000){ + if(c<=0x20){ + if(c!=0x20){ + prev = BOCU1_ASCII_PREV; + } + target.put((byte)c); + if(offsets!=null){ + offsets.put(nextSourceIndex++); + } + source.position(source.position()+1); + --targetCapacity; + }else { + diff = c-prev; + if(DIFF_IS_SINGLE(diff)){ + prev = BOCU1_SIMPLE_PREV(c); + target.put((byte)PACK_SINGLE_DIFF(diff)); + if(offsets!=null){ + offsets.put(nextSourceIndex++); + } + source.position(source.position()+1); + --targetCapacity; + }else { + break; + } + } + } + return regularLoop; + } + + private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + if(source.hasRemaining()){ + /*test the following code unit*/ + char trail = source.get(source.position()); + if(Character.isLowSurrogate(trail)){ + source.position(source.position()+1); + ++nextSourceIndex; + c=Character.toCodePoint((char)c, trail); + } + } else { + /*no more input*/ + c = -c; /*negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else*/ + checkNegative = true; + } + LoopAfterTrail = true; + return regularLoop; + } + + @SuppressWarnings("fallthrough") + private int regularLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + 
if(!LoopAfterTrail){ + /*restore real values*/ + targetCapacity = target.limit()-target.position(); + sourceIndex = nextSourceIndex; /*wrong if offsets==null but does not matter*/ + } + /*regular loop for all classes*/ + while(LoopAfterTrail || source.hasRemaining()){ + if(LoopAfterTrail || targetCapacity>0){ + + if(!LoopAfterTrail){ + c = source.get(); + ++nextSourceIndex; + + if(c<=0x20){ + /* + * ISO C0 control & space: + * Encode directly for MIME compatibility, + * and reset state except for space, to not disrupt compression. + */ + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + target.put((byte)c); + if(offsets != null){ + offsets.put(sourceIndex++); + } + --targetCapacity; + + sourceIndex=nextSourceIndex; + continue; + } + + if(Character.isHighSurrogate((char)c)){ + getTrail(source, target, offsets); + if(checkNegative){ + break; + } + } + } + + if(LoopAfterTrail){ + LoopAfterTrail = false; + } + + /* + * all other Unicode code points c==U+0021..U+10ffff + * are encoded with the difference c-prev + * + * a new prev is computed from c, + * placed in the middle of a 0x80-block (for most small scripts) or + * in the middle of the Unihan and Hangul blocks + * to statistically minimize the following difference + */ + diff = c- prev; + prev = BOCU1_PREV(c); + if(DIFF_IS_SINGLE(diff)){ + target.put((byte)PACK_SINGLE_DIFF(diff)); + if(offsets!=null){ + offsets.put(sourceIndex++); + } + --targetCapacity; + sourceIndex=nextSourceIndex; + if(c<0x3000){ + labelType = fastSingle; + return labelType; + } + } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity){ + /*optimize 2 byte case*/ + int m = 0; + if(diff>=0){ + diff -= BOCU1_REACH_POS_1 +1; + m = diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + diff+=BOCU1_START_POS_2; + } else { + diff -= BOCU1_REACH_NEG_1; + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + diff+=BOCU1_START_NEG_2; + } + target.put((byte)diff); + target.put((byte)BOCU1_TRAIL_TO_BYTE(m)); + if(offsets!=null){ + offsets.put(sourceIndex); + 
offsets.put(sourceIndex); + } + targetCapacity -= 2; + sourceIndex = nextSourceIndex; + } else { + int length; /*will be 2..4*/ + diff = packDiff(diff); + length = BOCU1_LENGTH_FROM_PACKED(diff); + + /*write the output character bytes from diff and length*/ + /*from the first if in the loop we know that targetCapacity>0*/ + if(length<=targetCapacity){ + switch(length){ + /*each branch falls through the next one*/ + case 4: + target.put((byte)(diff>>24)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + case 3: + target.put((byte)(diff>>16)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + case 2: + target.put((byte)(diff>>8)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + /*case 1 handled above*/ + target.put((byte)diff); + if(offsets!= null){ + offsets.put(sourceIndex); + } + default: + /*will never occur*/ + break; + } + targetCapacity -= length; + sourceIndex = nextSourceIndex; + } else { + ByteBuffer error = ByteBuffer.wrap(errorBuffer); + /* + * We actually do this backwards here: + * In order to save an intermediate variable, we output + * first to the overflow buffer what does not fit into the + * regular target. 
+ */ + /* we know that 1<=targetCapacity>16)); + case 2: + error.put((byte)(diff>>8)); + case 1: + error.put((byte)diff); + default: + /* will never occur */ + break; + } + errorBufferLength = length; + + /* now output what fits into the regular target */ + diff>>=8*length; /* length was reduced by targetCapacity */ + switch(targetCapacity) { + /* each branch falls through to the next one */ + case 3: + target.put((byte)(diff>>16)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + case 2: + target.put((byte)(diff>>8)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + case 1: + target.put((byte)diff); + if(offsets!= null){ + offsets.put(sourceIndex); + } + default: + /* will never occur */ + break; + } + + /* target overflow */ + targetCapacity=0; + cr = CoderResult.OVERFLOW; + break; + } + } + } else{ + /*target is full*/ + cr = CoderResult.OVERFLOW; + break; + } + + } + /*set the converter state back into UConverter*/ + fromUChar32 = c<0 ? -c :0; + fromUnicodeStatus = prev; + LabelLoop = false; + labelType = fastSingle; + return labelType; + } + + } + + class CharsetDecoderBOCU extends CharsetDecoderICU{ + public CharsetDecoderBOCU(CharsetICU cs) { + super(cs); + } + + int byteIndex; + int sourceIndex, nextSourceIndex; + int prev, c , diff, count; + byte[] bytes; + int targetCapacity; + CoderResult cr; + + /* label values for supporting behavior similar to goto in C */ + private static final int fastSingle=0; + private static final int getTrail=1; + private static final int regularLoop=2; + private static final int endLoop=3; + + private boolean LabelLoop;//used to break the while loop + private boolean afterTrail; // its value is set to true to ignore code after getTrail: + private int labelType; + /* + * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. 
+ * The UConverter fields are used as follows: + * + * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) + * + * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) + * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) + */ + + /* BOCU-1-from-Unicode conversion functions --------------------------------- */ + + + + /** + * Function for BOCU-1 decoder; handles multi-byte lead bytes. + * + * @param b lead byte; + * BOCU1_MIN<=b= BOCU1_START_NEG_2) { + /* positive difference */ + if(b < BOCU1_START_POS_3) { + /* two bytes */ + diffValue = (b - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1+1; + countValue = 1; + } else if(b < BOCU1_START_POS_4) { + /* three bytes */ + diffValue = (b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; + countValue = 2; + } else { + /* four bytes */ + diffValue = BOCU1_REACH_POS_3+1; + countValue = 3; + } + } else { + /* negative difference */ + if(b >= BOCU1_START_NEG_3) { + /* two bytes */ + diffValue=(b -BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1; + countValue=1; + } else if(b>BOCU1_MIN) { + /* three bytes */ + diffValue=(b - BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2; + countValue = 2; + } else { + /* four bytes */ + diffValue=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; + countValue=3; + } + } + + /* return the state for decoding the trail byte(s) */ + return (diffValue<<2)|countValue; + } + + /** + * Function for BOCU-1 decoder; handles multi-byte trail bytes. 
+ * + * @param count number of remaining trail bytes including this one + * @param b trail byte + * @return new delta for diff including b - <0 indicates an error + * + * @see decodeBocu1 + */ + private int decodeBocu1TrailByte(int countValue, int b) { + b = b&UConverterConstants.UNSIGNED_BYTE_MASK; + if((b)<=0x20) { + /* skip some C0 controls and make the trail byte range contiguous */ + b = bocu1ByteToTrail[b]; + /* b<0 for an illegal trail byte value will result in return<0 below */ + } else { + //b-= BOCU1_TRAIL_BYTE_OFFSET; + b = b - BOCU1_TRAIL_BYTE_OFFSET; + } + + /* add trail byte into difference and decrement count */ + if(countValue==1) { + return b; + } else if(countValue==2) { + return b*BOCU1_TRAIL_COUNT; + } else /* count==3 */ { + return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); + } + } + + protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, + boolean flush){ + cr = CoderResult.UNDERFLOW; + + LabelLoop = true; + afterTrail = false; + labelType = fastSingle; // labelType is set to fastSingle so t + + /*get the converter state*/ + prev = toUnicodeStatus; + + if(prev==0){ + prev = BOCU1_ASCII_PREV; + } + diff = mode; + count = diff&3; + diff>>=2; + + byteIndex = toULength; + bytes = toUBytesArray; + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex=byteIndex==0 ? 
0 : -1; + nextSourceIndex=0; + + /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ + if(count>0 && byteIndex>0 && target.position()diff) { + count = diff; + } + while(count>0) { + if(BOCU1_START_NEG_2 <=(c=source.get(source.position())&UConverterConstants.UNSIGNED_BYTE_MASK) && c< BOCU1_START_POS_2) { + c = prev + (c-BOCU1_MIDDLE); + if(c<0x3000) { + target.put((char)c); + if(offsets!=null){ + offsets.put(nextSourceIndex++); + } + prev = BOCU1_SIMPLE_PREV(c); + } else { + break; + } + } else if((c&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0x20) { + if((c&UConverterConstants.UNSIGNED_BYTE_MASK) != 0x20) { + prev = BOCU1_ASCII_PREV; + } + target.put((char)c); + if(offsets!=null){ + offsets.put(nextSourceIndex++); + } + } else { + break; + } + source.position(source.position()+1); + --count; + } + sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ + return labelType; + } + + private int getTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){ + labelType = regularLoop; + for(;;) { + if(source.position() >= source.limit()) { + labelType = endLoop; + return labelType; + } + ++nextSourceIndex; + c = bytes[byteIndex++] = source.get(); + + /* trail byte in any position */ + c = decodeBocu1TrailByte(count, c); + if(c<0) { + cr = CoderResult.malformedForLength(1); + labelType = endLoop; + return labelType; + } + + diff+=c; + if(--count==0) { + /* final trail byte, deliver a code point */ + byteIndex=0; + c = prev + diff; + if(c > 0x10ffff) { + cr = CoderResult.malformedForLength(1); + labelType = endLoop; + return labelType; + } + break; + } + } + afterTrail = true; + return labelType; + + } + + private int afterGetTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){ + /* decode a sequence of single and lead bytes */ + while(afterTrail || source.hasRemaining()) { + if(!afterTrail){ + if(target.position() >= target.limit()) { + /* target is full */ + cr = CoderResult.OVERFLOW; + break; + } + + ++nextSourceIndex; + 
c = source.get()&UConverterConstants.UNSIGNED_BYTE_MASK; + if(BOCU1_START_NEG_2 <= c && c < BOCU1_START_POS_2) { + /* Write a code point directly from a single-byte difference. */ + c = prev + (c-BOCU1_MIDDLE); + if(c<0x3000) { + target.put((char)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + prev = BOCU1_SIMPLE_PREV(c); + sourceIndex = nextSourceIndex; + labelType = fastSingle; + return labelType; + } + } else if(c <= 0x20) { + /* + * Direct-encoded C0 control code or space. + * Reset prev for C0 control codes but not for space. + */ + if(c != 0x20) { + prev=BOCU1_ASCII_PREV; + } + target.put((char)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + sourceIndex=nextSourceIndex; + continue; + } else if(BOCU1_START_NEG_3 <= c && c < BOCU1_START_POS_3 && source.hasRemaining()) { + /* Optimize two-byte case. */ + if(c >= BOCU1_MIDDLE) { + diff=(c - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1; + } else { + diff=(c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1; + } + + /* trail byte */ + ++nextSourceIndex; + c = decodeBocu1TrailByte(1, source.get()); + if(c<0 || ((c = prev + diff + c)&UConverterConstants.UNSIGNED_INT_MASK)>0x10ffff) { + bytes[0]= source.get(source.position()-2); + bytes[1]= source.get(source.position()-1); + byteIndex = 2; + cr = CoderResult.malformedForLength(2); + break; + } + } else if(c == BOCU1_RESET) { + /* only reset the state, no code point */ + prev=BOCU1_ASCII_PREV; + sourceIndex=nextSourceIndex; + continue; + } else { + /* + * For multi-byte difference lead bytes, set the decoder state + * with the partial difference value from the lead byte and + * with the number of trail bytes. 
+ */ + bytes[0]= (byte)c; + byteIndex = 1; + + diff = decodeBocu1LeadByte(c); + count = diff&3; + diff>>=2; + getTrail(source, target, offsets); + if(labelType != regularLoop){ + return labelType; + } + } + } + + if(afterTrail){ + afterTrail = false; + } + + /* calculate the next prev and output c */ + prev = BOCU1_PREV(c); + if(c<=0xffff) { + target.put((char)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + } else { + /* output surrogate pair */ + target.put(UTF16.getLeadSurrogate(c)); + if(target.hasRemaining()) { + target.put(UTF16.getTrailSurrogate(c)); + if(offsets!=null){ + offsets.put(sourceIndex); + offsets.put(sourceIndex); + } + } else { + /* target overflow */ + if(offsets!=null){ + offsets.put(sourceIndex); + } + charErrorBufferArray[0] = UTF16.getTrailSurrogate(c); + charErrorBufferLength = 1; + cr = CoderResult.OVERFLOW; + break; + } + } + sourceIndex=nextSourceIndex; + } + labelType = endLoop; + return labelType; + } + + private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){ + if(cr.isMalformed()) { + /* set the converter state in UConverter to deal with the next character */ + toUnicodeStatus = BOCU1_ASCII_PREV; + mode = 0; + } else { + /* set the converter state back into UConverter */ + toUnicodeStatus=prev; + mode=(diff<<2)|count; + } + toULength=byteIndex; + LabelLoop = false; + } + + } + + + public CharsetDecoder newDecoder() { + return new CharsetDecoderBOCU(this); + } + + public CharsetEncoder newEncoder() { + return new CharsetEncoderBOCU(this); + } +} Property changes on: lucene\src\java\com\ibm\icu\charset\CharsetBOCU1.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/com/ibm/icu/charset/UTF16.java =================================================================== --- lucene/src/java/com/ibm/icu/charset/UTF16.java (revision 0) +++ lucene/src/java/com/ibm/icu/charset/UTF16.java (revision 0) @@ -0,0 +1,2105 @@ +/** + 
******************************************************************************* + * Copyright (C) 1996-2010, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ + +package com.ibm.icu.charset; + +/** + *

+ * Standalone utility class providing UTF16 character conversions and indexing conversions. + *

+ *

+ * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap, + * so searching for strings is a safe operation. Similarly, concatenation is always safe. + * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the + * values for start and end are on those boundaries, since they arose from operations like + * searching. If not, the nearest UTF-32 boundaries can be determined using bounds(). + *

+ * Examples: + *

+ * The following examples illustrate use of some of these methods. + * + *

+ * // iteration forwards: Original
+ * for (int i = 0; i < s.length(); ++i) {
+ *     char ch = s.charAt(i);
+ *     doSomethingWith(ch);
+ * }
+ * 
+ * // iteration forwards: Changes for UTF-32
+ * int ch;
+ * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
+ *     ch = UTF16.charAt(s, i);
+ *     doSomethingWith(ch);
+ * }
+ * 
+ * // iteration backwards: Original
+ * for (int i = s.length() - 1; i >= 0; --i) {
+ *     char ch = s.charAt(i);
+ *     doSomethingWith(ch);
+ * }
+ * 
+ * // iteration backwards: Changes for UTF-32
+ * int ch;
+ * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
+ *     ch = UTF16.charAt(s, i);
+ *     doSomethingWith(ch);
+ * }
+ * 
+ * + * Notes: + *
    + *
  • Naming: For clarity, High and Low surrogates are called Lead + * and Trail in the API, which gives a better sense of their ordering in a string. + * offset16 and offset32 are used to distinguish offsets to UTF-16 + * boundaries vs offsets to UTF-32 boundaries. int char32 is used to contain UTF-32 + * characters, as opposed to char16, which is a UTF-16 code unit.
  • + *
  • Roundtripping Offsets: You can always roundtrip from a UTF-32 offset to a + * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16 + * offset to a UTF-32 offset and back if and only if bounds(string, offset16) != TRAIL. + *
  • + *
  • Exceptions: The error checking will throw an exception if indices are out + * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates + * or out-of-bounds UTF-32 values are present. UCharacter.isLegal() can be used to + * check for validity if desired.
  • + *
  • Unmatched Surrogates: If the string contains unmatched surrogates, then + * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It + * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4, + * 5.5).
  • + *
  • Optimization: The method implementations may need optimization if the + * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small + * percentage of all the text in the world, the singleton case should always be optimized for.
  • + *
+ * + * @author Mark Davis, with help from Markus Scherer + * @stable ICU 2.1 + */ + +public final class UTF16 { + // public variables --------------------------------------------------- + + /** + * Value returned in + * bounds(). + * These values are chosen specifically so that it actually represents the position of the + * character [offset16 - (value >> 2), offset16 + (value & 3)] + * + * @stable ICU 2.1 + */ + public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2, + TRAIL_SURROGATE_BOUNDARY = 5; + + /** + * The lowest Unicode code point value. + * + * @stable ICU 2.1 + */ + public static final int CODEPOINT_MIN_VALUE = 0; + + /** + * The highest Unicode code point value (scalar value) according to the Unicode Standard. + * + * @stable ICU 2.1 + */ + public static final int CODEPOINT_MAX_VALUE = 0x10ffff; + + /** + * The minimum value for Supplementary code points + * + * @stable ICU 2.1 + */ + public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; + + /** + * Lead surrogate minimum value + * + * @stable ICU 2.1 + */ + public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; + + /** + * Trail surrogate minimum value + * + * @stable ICU 2.1 + */ + public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; + + /** + * Lead surrogate maximum value + * + * @stable ICU 2.1 + */ + public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; + + /** + * Trail surrogate maximum value + * + * @stable ICU 2.1 + */ + public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; + + /** + * Surrogate minimum value + * + * @stable ICU 2.1 + */ + public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; + + /** + * Maximum surrogate value + * + * @stable ICU 2.1 + */ + public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE; + + /** + * Lead surrogate bitmask + */ + private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; + + /** + * Trail surrogate bitmask + */ + private static final int TRAIL_SURROGATE_BITMASK = 
0xFFFFFC00; + + /** + * Surrogate bitmask + */ + private static final int SURROGATE_BITMASK = 0xFFFFF800; + + /** + * Lead surrogate bits + */ + private static final int LEAD_SURROGATE_BITS = 0xD800; + + /** + * Trail surrogate bits + */ + private static final int TRAIL_SURROGATE_BITS = 0xDC00; + + /** + * Surrogate bits + */ + private static final int SURROGATE_BITS = 0xD800; + + // constructor -------------------------------------------------------- + + // /CLOVER:OFF + /** + * Prevent instance from being created. + */ + private UTF16() { + } + + // /CLOVER:ON + // public method ------------------------------------------------------ + + /** + * Determines how many chars this char32 requires. If a validity check is required, use + * isLegal() + * on char32 before calling. + * + * @param char32 The input codepoint. + * @return 2 if is in supplementary space, otherwise 1. + * @stable ICU 2.1 + */ + public static int getCharCount(int char32) { + if (char32 < SUPPLEMENTARY_MIN_VALUE) { + return 1; + } + return 2; + } + + /** + * Returns the type of the boundaries around the char at offset16. Used for random access. + * + * @param source Text to analyse + * @param offset16 UTF-16 offset + * @return + *
    + *
  • SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1] + *
  • LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds + * are [offset16, offset16 + 2] + *
  • TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the + * bounds are [offset16 - 1, offset16 + 1] + *
+ * For bit-twiddlers, the return values for these are chosen so that the boundaries + * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. + * @exception IndexOutOfBoundsException If offset16 is out of bounds. + * @stable ICU 2.1 + */ + public static int bounds(String source, int offset16) { + char ch = source.charAt(offset16); + if (isSurrogate(ch)) { + if (isLeadSurrogate(ch)) { + if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { + return LEAD_SURROGATE_BOUNDARY; + } + } else { + // isTrailSurrogate(ch), so + --offset16; + if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { + return TRAIL_SURROGATE_BOUNDARY; + } + } + } + return SINGLE_CHAR_BOUNDARY; + } + + /** + * Returns the type of the boundaries around the char at offset16. Used for random access. + * + * @param source String buffer to analyse + * @param offset16 UTF16 offset + * @return + *
    + *
  • SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1] + *
  • LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds + * are [offset16, offset16 + 2] + *
  • TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the + * bounds are [offset16 - 1, offset16 + 1] + *
+ * For bit-twiddlers, the return values for these are chosen so that the boundaries + * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. + * @exception IndexOutOfBoundsException If offset16 is out of bounds. + * @stable ICU 2.1 + */ + public static int bounds(StringBuffer source, int offset16) { + char ch = source.charAt(offset16); + if (isSurrogate(ch)) { + if (isLeadSurrogate(ch)) { + if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { + return LEAD_SURROGATE_BOUNDARY; + } + } else { + // isTrailSurrogate(ch), so + --offset16; + if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { + return TRAIL_SURROGATE_BOUNDARY; + } + } + } + return SINGLE_CHAR_BOUNDARY; + } + + /** + * Returns the type of the boundaries around the char at offset16. Used for random access. Note + * that the boundaries are determined with respect to the subarray, hence the char array + * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1. + * + * @param source Char array to analyse + * @param start Offset to substring in the source array for analyzing + * @param limit Offset to substring in the source array for analyzing + * @param offset16 UTF16 offset relative to start + * @return + *
    + *
  • SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1] + *
  • LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds + * are [offset16, offset16 + 2] + *
  • TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the + * bounds are [offset16 - 1, offset16 + 1] + *
+ * For bit-twiddlers, the boundary values for these are chosen so that the boundaries + * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)]. + * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. + * @stable ICU 2.1 + */ + public static int bounds(char source[], int start, int limit, int offset16) { + offset16 += start; + if (offset16 < start || offset16 >= limit) { + throw new ArrayIndexOutOfBoundsException(offset16); + } + char ch = source[offset16]; + if (isSurrogate(ch)) { + if (isLeadSurrogate(ch)) { + ++offset16; + if (offset16 < limit && isTrailSurrogate(source[offset16])) { + return LEAD_SURROGATE_BOUNDARY; + } + } else { // isTrailSurrogate(ch), so + --offset16; + if (offset16 >= start && isLeadSurrogate(source[offset16])) { + return TRAIL_SURROGATE_BOUNDARY; + } + } + } + return SINGLE_CHAR_BOUNDARY; + } + + /** + * Determines whether the code value is a surrogate. + * + * @param char16 The input character. + * @return true If the input character is a surrogate. + * @stable ICU 2.1 + */ + public static boolean isSurrogate(char char16) { + return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; + } + + /** + * Determines whether the character is a trail surrogate. + * + * @param char16 The input character. + * @return true If the input character is a trail surrogate. + * @stable ICU 2.1 + */ + public static boolean isTrailSurrogate(char char16) { + return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; + } + + /** + * Determines whether the character is a lead surrogate. + * + * @param char16 The input character. + * @return true If the input character is a lead surrogate + * @stable ICU 2.1 + */ + public static boolean isLeadSurrogate(char char16) { + return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; + } + + /** + * Returns the lead surrogate. If a validity check is required, use + * isLegal() on char32 + * before calling. 
+ * + * @param char32 The input character. + * @return lead surrogate if the getCharCount(ch) is 2;
+ * and 0 otherwise (note: 0 is not a valid lead surrogate). + * @stable ICU 2.1 + */ + public static char getLeadSurrogate(int char32) { + if (char32 >= SUPPLEMENTARY_MIN_VALUE) { + return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_)); + } + return 0; + } + + /** + * Returns the trail surrogate. If a validity check is required, use + * isLegal() on char32 + * before calling. + * + * @param char32 The input character. + * @return the trail surrogate if the getCharCount(ch) is 2;
+ * otherwise the character itself + * @stable ICU 2.1 + */ + public static char getTrailSurrogate(int char32) { + if (char32 >= SUPPLEMENTARY_MIN_VALUE) { + return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_)); + } + return (char) char32; + } + + /** + * Convenience method corresponding to String.valueOf(char). Returns a one or two char string + * containing the UTF-32 value in UTF16 format. If a validity check is required, use isLegal() on char32 before calling. + * + * @param char32 The input character. + * @return string value of char32 in UTF16 format + * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint. + * @stable ICU 2.1 + */ + public static String valueOf(int char32) { + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Illegal codepoint"); + } + return toString(char32); + } + + /** + * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or + * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate + * character, the whole supplementary codepoint will be returned. If a validity check is + * required, use isLegal() on the + * codepoint at offset16 before calling. The result returned will be a newly created String + * obtained by calling source.substring(..) with the appropriate indexes. + * + * @param source The input string. 
+ * @param offset16 The UTF16 index to the codepoint in source + * @return string value of char32 in UTF16 format + * @stable ICU 2.1 + */ + public static String valueOf(String source, int offset16) { + switch (bounds(source, offset16)) { + case LEAD_SURROGATE_BOUNDARY: + return source.substring(offset16, offset16 + 2); + case TRAIL_SURROGATE_BOUNDARY: + return source.substring(offset16 - 1, offset16 + 1); + default: + return source.substring(offset16, offset16 + 1); + } + } + + /** + * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a + * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a + * surrogate character, the whole supplementary codepoint will be returned. If a validity check + * is required, use isLegal() on + * the codepoint at offset16 before calling. The result returned will be a newly created String + * obtained by calling source.substring(..) with the appropriate indexes. + * + * @param source The input string buffer. + * @param offset16 The UTF16 index to the codepoint in source + * @return string value of char32 in UTF16 format + * @stable ICU 2.1 + */ + public static String valueOf(StringBuffer source, int offset16) { + switch (bounds(source, offset16)) { + case LEAD_SURROGATE_BOUNDARY: + return source.substring(offset16, offset16 + 2); + case TRAIL_SURROGATE_BOUNDARY: + return source.substring(offset16 - 1, offset16 + 1); + default: + return source.substring(offset16, offset16 + 1); + } + } + + /** + * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16 + * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be + * returned, except when either the leading or trailing surrogate character lies out of the + * specified subarray. In the latter case, only the surrogate character within bounds will be + * returned. 
If a validity check is required, use isLegal() on the codepoint at + * offset16 before calling. The result returned will be a newly created String containing the + * relevant characters. + * + * @param source The input char array. + * @param start Start index of the subarray + * @param limit End index of the subarray + * @param offset16 The UTF16 index to the codepoint in source relative to start + * @return string value of char32 in UTF16 format + * @stable ICU 2.1 + */ + public static String valueOf(char source[], int start, int limit, int offset16) { + switch (bounds(source, start, limit, offset16)) { + case LEAD_SURROGATE_BOUNDARY: + return new String(source, start + offset16, 2); + case TRAIL_SURROGATE_BOUNDARY: + return new String(source, start + offset16 - 1, 2); + } + return new String(source, start + offset16, 1); + } + + /** + * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See + * the class description for notes on roundtripping. + * + * @param source The UTF-16 string + * @param offset32 UTF-32 offset + * @return UTF-16 offset + * @exception IndexOutOfBoundsException If offset32 is out of bounds. + * @stable ICU 2.1 + */ + public static int findOffsetFromCodePoint(String source, int offset32) { + char ch; + int size = source.length(), result = 0, count = offset32; + if (offset32 < 0 || offset32 > size) { + throw new StringIndexOutOfBoundsException(offset32); + } + while (result < size && count > 0) { + ch = source.charAt(result); + if (isLeadSurrogate(ch) && ((result + 1) < size) + && isTrailSurrogate(source.charAt(result + 1))) { + result++; + } + + count--; + result++; + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(offset32); + } + return result; + } + + /** + * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See + * the class description for notes on roundtripping. 
+ * + * @param source The UTF-16 string buffer + * @param offset32 UTF-32 offset + * @return UTF-16 offset + * @exception IndexOutOfBoundsException If offset32 is out of bounds. + * @stable ICU 2.1 + */ + public static int findOffsetFromCodePoint(StringBuffer source, int offset32) { + char ch; + int size = source.length(), result = 0, count = offset32; + if (offset32 < 0 || offset32 > size) { + throw new StringIndexOutOfBoundsException(offset32); + } + while (result < size && count > 0) { + ch = source.charAt(result); + if (isLeadSurrogate(ch) && ((result + 1) < size) + && isTrailSurrogate(source.charAt(result + 1))) { + result++; + } + + count--; + result++; + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(offset32); + } + return result; + } + + /** + * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See + * the class description for notes on roundtripping. + * + * @param source The UTF-16 char array whose substring is to be analysed + * @param start Offset of the substring to be analysed + * @param limit Offset of the substring to be analysed + * @param offset32 UTF-32 offset relative to start + * @return UTF-16 offset relative to start + * @exception IndexOutOfBoundsException If offset32 is out of bounds. + * @stable ICU 2.1 + */ + public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) { + char ch; + int result = start, count = offset32; + if (offset32 > limit - start) { + throw new ArrayIndexOutOfBoundsException(offset32); + } + while (result < limit && count > 0) { + ch = source[result]; + if (isLeadSurrogate(ch) && ((result + 1) < limit) + && isTrailSurrogate(source[result + 1])) { + result++; + } + + count--; + result++; + } + if (count != 0) { + throw new ArrayIndexOutOfBoundsException(offset32); + } + return result - start; + } + + /** + * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given + * UTF-16 offset. 
Used for random access. See the class description for + * notes on roundtripping.
+ * Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset + * of the lead of the pair is returned. + *

+ * To find the UTF-32 length of a string, use: + * + *

+     * len32 = countCodePoint(source);
+     * 
+ * + *

+ *

+ * + * @param source Text to analyse + * @param offset16 UTF-16 offset < source text length. + * @return UTF-32 offset + * @exception IndexOutOfBoundsException If offset16 is out of bounds. + * @stable ICU 2.1 + */ + public static int findCodePointOffset(String source, int offset16) { + if (offset16 < 0 || offset16 > source.length()) { + throw new StringIndexOutOfBoundsException(offset16); + } + + int result = 0; + char ch; + boolean hadLeadSurrogate = false; + + for (int i = 0; i < offset16; ++i) { + ch = source.charAt(i); + if (hadLeadSurrogate && isTrailSurrogate(ch)) { + hadLeadSurrogate = false; // count valid trail as zero + } else { + hadLeadSurrogate = isLeadSurrogate(ch); + ++result; // count others as 1 + } + } + + if (offset16 == source.length()) { + return result; + } + + // end of source being the less significant surrogate character + // shift result back to the start of the supplementary character + if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { + result--; + } + + return result; + } + + /** + * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 + * offset. Used for random access. See the class description for notes on + * roundtripping.
+ * Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset + * of the lead of the pair is returned. + *

+ * To find the UTF-32 length of a string, use: + * + *

+     * len32 = countCodePoint(source);
+     * 
+ * + *

+ *

+ * + * @param source Text to analyse + * @param offset16 UTF-16 offset < source text length. + * @return UTF-32 offset + * @exception IndexOutOfBoundsException If offset16 is out of bounds. + * @stable ICU 2.1 + */ + public static int findCodePointOffset(StringBuffer source, int offset16) { + if (offset16 < 0 || offset16 > source.length()) { + throw new StringIndexOutOfBoundsException(offset16); + } + + int result = 0; + char ch; + boolean hadLeadSurrogate = false; + + for (int i = 0; i < offset16; ++i) { + ch = source.charAt(i); + if (hadLeadSurrogate && isTrailSurrogate(ch)) { + hadLeadSurrogate = false; // count valid trail as zero + } else { + hadLeadSurrogate = isLeadSurrogate(ch); + ++result; // count others as 1 + } + } + + if (offset16 == source.length()) { + return result; + } + + // end of source being the less significant surrogate character + // shift result back to the start of the supplementary character + if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { + result--; + } + + return result; + } + + /** + * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 + * offset. Used for random access. See the class description for notes on + * roundtripping.
+ * Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset + * of the lead of the pair is returned. + *

+ * To find the UTF-32 length of a substring, use: + * + *

+     * len32 = countCodePoint(source, start, limit);
+     * 
+ * + *

+ *

+ * + * @param source Text to analyse + * @param start Offset of the substring + * @param limit Offset of the substring + * @param offset16 UTF-16 relative to start + * @return UTF-32 offset relative to start + * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. + * @stable ICU 2.1 + */ + public static int findCodePointOffset(char source[], int start, int limit, int offset16) { + offset16 += start; + if (offset16 > limit) { + throw new StringIndexOutOfBoundsException(offset16); + } + + int result = 0; + char ch; + boolean hadLeadSurrogate = false; + + for (int i = start; i < offset16; ++i) { + ch = source[i]; + if (hadLeadSurrogate && isTrailSurrogate(ch)) { + hadLeadSurrogate = false; // count valid trail as zero + } else { + hadLeadSurrogate = isLeadSurrogate(ch); + ++result; // count others as 1 + } + } + + if (offset16 == limit) { + return result; + } + + // end of source being the less significant surrogate character + // shift result back to the start of the supplementary character + if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) { + result--; + } + + return result; + } + + /** + * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required, + * use isLegal() on char32 before + * calling. + * + * @param target The buffer to append to + * @param char32 Value to append. 
+ * @return the updated StringBuffer + * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints + * @stable ICU 2.1 + */ + public static StringBuffer append(StringBuffer target, int char32) { + // Check for irregular values + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); + } + + // Write the UTF-16 values + if (char32 >= SUPPLEMENTARY_MIN_VALUE) { + target.append(getLeadSurrogate(char32)); + target.append(getTrailSurrogate(char32)); + } else { + target.append((char) char32); + } + return target; + } + + /** + * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a + * convenience. + * + * @param target The buffer to append to + * @param cp The code point to append + * @return the updated StringBuffer + * @throws IllegalArgumentException If cp is not a valid code point + * @stable ICU 3.0 + */ + public static StringBuffer appendCodePoint(StringBuffer target, int cp) { + return append(target, cp); + } + + /** + * Adds a codepoint to offset16 position of the argument char array. + * + * @param target Char array to be append with the new code point + * @param limit UTF16 offset which the codepoint will be appended. + * @param char32 Code point to be appended + * @return offset after char32 in the array. + * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not + * lie within the range of the Unicode codepoints. 
+ * @stable ICU 2.1 + */ + public static int append(char[] target, int limit, int char32) { + // Check for irregular values + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Illegal codepoint"); + } + // Write the UTF-16 values + if (char32 >= SUPPLEMENTARY_MIN_VALUE) { + target[limit++] = getLeadSurrogate(char32); + target[limit++] = getTrailSurrogate(char32); + } else { + target[limit++] = (char) char32; + } + return limit; + } + + /** + * Number of codepoints in a UTF16 String + * + * @param source UTF16 string + * @return number of codepoint in string + * @stable ICU 2.1 + */ + public static int countCodePoint(String source) { + if (source == null || source.length() == 0) { + return 0; + } + return findCodePointOffset(source, source.length()); + } + + /** + * Number of codepoints in a UTF16 String buffer + * + * @param source UTF16 string buffer + * @return number of codepoint in string + * @stable ICU 2.1 + */ + public static int countCodePoint(StringBuffer source) { + if (source == null || source.length() == 0) { + return 0; + } + return findCodePointOffset(source, source.length()); + } + + /** + * Number of codepoints in a UTF16 char array substring + * + * @param source UTF16 char array + * @param start Offset of the substring + * @param limit Offset of the substring + * @return number of codepoint in the substring + * @exception IndexOutOfBoundsException If start and limit are not valid. + * @stable ICU 2.1 + */ + public static int countCodePoint(char source[], int start, int limit) { + if (source == null || source.length == 0) { + return 0; + } + return findCodePointOffset(source, start, limit, limit - start); + } + + /** + * Set a code point into a UTF16 position. Adjusts target according if we are replacing a + * non-supplementary codepoint with a supplementary and vice versa. 
+ * + * @param target Stringbuffer + * @param offset16 UTF16 position to insert into + * @param char32 Code point + * @stable ICU 2.1 + */ + public static void setCharAt(StringBuffer target, int offset16, int char32) { + int count = 1; + char single = target.charAt(offset16); + + if (isSurrogate(single)) { + // pairs of the surrogate with offset16 at the lead char found + if (isLeadSurrogate(single) && (target.length() > offset16 + 1) + && isTrailSurrogate(target.charAt(offset16 + 1))) { + count++; + } else { + // pairs of the surrogate with offset16 at the trail char + // found + if (isTrailSurrogate(single) && (offset16 > 0) + && isLeadSurrogate(target.charAt(offset16 - 1))) { + offset16--; + count++; + } + } + } + target.replace(offset16, offset16 + count, valueOf(char32)); + } + + /** + * Set a code point into a UTF16 position in a char array. Adjusts target according if we are + * replacing a non-supplementary codepoint with a supplementary and vice versa. + * + * @param target char array + * @param limit numbers of valid chars in target, different from target.length. limit counts the + * number of chars in target that represents a string, not the size of array target. 
+ * @param offset16 UTF16 position to insert into + * @param char32 code point + * @return new number of chars in target that represents a string + * @exception IndexOutOfBoundsException if offset16 is out of range + * @stable ICU 2.1 + */ + public static int setCharAt(char target[], int limit, int offset16, int char32) { + if (offset16 >= limit) { + throw new ArrayIndexOutOfBoundsException(offset16); + } + int count = 1; + char single = target[offset16]; + + if (isSurrogate(single)) { + // pairs of the surrogate with offset16 at the lead char found + if (isLeadSurrogate(single) && (target.length > offset16 + 1) + && isTrailSurrogate(target[offset16 + 1])) { + count++; + } else { + // pairs of the surrogate with offset16 at the trail char + // found + if (isTrailSurrogate(single) && (offset16 > 0) + && isLeadSurrogate(target[offset16 - 1])) { + offset16--; + count++; + } + } + } + + String str = valueOf(char32); + int result = limit; + int strlength = str.length(); + target[offset16] = str.charAt(0); + if (count == strlength) { + if (count == 2) { + target[offset16 + 1] = str.charAt(1); + } + } else { + // this is not exact match in space, we'll have to do some + // shifting + System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit + - (offset16 + count)); + if (count < strlength) { + // char32 is a supplementary character trying to squeeze into + // a non-supplementary space + target[offset16 + 1] = str.charAt(1); + result++; + if (result < target.length) { + target[result] = 0; + } + } else { + // char32 is a non-supplementary character trying to fill + // into a supplementary space + result--; + target[result] = 0; + } + } + return result; + } + + /** + * Shifts offset16 by the argument number of codepoints + * + * @param source string + * @param offset16 UTF16 position to shift + * @param shift32 number of codepoints to shift + * @return new shifted offset16 + * @exception IndexOutOfBoundsException if the new offset16 is out of bounds. 
+ * @stable ICU 2.1 + */ + public static int moveCodePointOffset(String source, int offset16, int shift32) { + int result = offset16; + int size = source.length(); + int count; + char ch; + if (offset16 < 0 || offset16 > size) { + throw new StringIndexOutOfBoundsException(offset16); + } + if (shift32 > 0) { + if (shift32 + offset16 > size) { + throw new StringIndexOutOfBoundsException(offset16); + } + count = shift32; + while (result < size && count > 0) { + ch = source.charAt(result); + if (isLeadSurrogate(ch) && ((result + 1) < size) + && isTrailSurrogate(source.charAt(result + 1))) { + result++; + } + count--; + result++; + } + } else { + if (offset16 + shift32 < 0) { + throw new StringIndexOutOfBoundsException(offset16); + } + for (count = -shift32; count > 0; count--) { + result--; + if (result < 0) { + break; + } + ch = source.charAt(result); + if (isTrailSurrogate(ch) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + result--; + } + } + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(shift32); + } + return result; + } + + /** + * Shifts offset16 by the argument number of codepoints + * + * @param source String buffer + * @param offset16 UTF16 position to shift + * @param shift32 Number of codepoints to shift + * @return new shifted offset16 + * @exception IndexOutOfBoundsException If the new offset16 is out of bounds. 
+ * @stable ICU 2.1 + */ + public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) { + int result = offset16; + int size = source.length(); + int count; + char ch; + if (offset16 < 0 || offset16 > size) { + throw new StringIndexOutOfBoundsException(offset16); + } + if (shift32 > 0) { + if (shift32 + offset16 > size) { + throw new StringIndexOutOfBoundsException(offset16); + } + count = shift32; + while (result < size && count > 0) { + ch = source.charAt(result); + if (isLeadSurrogate(ch) && ((result + 1) < size) + && isTrailSurrogate(source.charAt(result + 1))) { + result++; + } + count--; + result++; + } + } else { + if (offset16 + shift32 < 0) { + throw new StringIndexOutOfBoundsException(offset16); + } + for (count = -shift32; count > 0; count--) { + result--; + if (result < 0) { + break; + } + ch = source.charAt(result); + if (isTrailSurrogate(ch) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + result--; + } + } + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(shift32); + } + return result; + } + + /** + * Shifts offset16 by the argument number of codepoints within a subarray. + * + * @param source Char array + * @param start Position of the subarray to be performed on + * @param limit Position of the subarray to be performed on + * @param offset16 UTF16 position to shift relative to start + * @param shift32 Number of codepoints to shift + * @return new shifted offset16 relative to start + * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the + * subarray bounds are out of range. 
+ * @stable ICU 2.1 + */ + public static int moveCodePointOffset(char source[], int start, int limit, int offset16, + int shift32) { + int size = source.length; + int count; + char ch; + int result = offset16 + start; + if (start < 0 || limit < start) { + throw new StringIndexOutOfBoundsException(start); + } + if (limit > size) { + throw new StringIndexOutOfBoundsException(limit); + } + if (offset16 < 0 || result > limit) { + throw new StringIndexOutOfBoundsException(offset16); + } + if (shift32 > 0) { + if (shift32 + result > size) { + throw new StringIndexOutOfBoundsException(result); + } + count = shift32; + while (result < limit && count > 0) { + ch = source[result]; + if (isLeadSurrogate(ch) && (result + 1 < limit) + && isTrailSurrogate(source[result + 1])) { + result++; + } + count--; + result++; + } + } else { + if (result + shift32 < start) { + throw new StringIndexOutOfBoundsException(result); + } + for (count = -shift32; count > 0; count--) { + result--; + if (result < start) { + break; + } + ch = source[result]; + if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { + result--; + } + } + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(shift32); + } + result -= start; + return result; + } + + /** + * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the + * middle of a supplementary codepoint, char32 will be inserted after the supplementary + * codepoint. The length of target increases by one if codepoint is non-supplementary, 2 + * otherwise. + *

+ * The overall effect is exactly as if the argument were converted to a string by the method + * valueOf(char) and the characters in that string were then inserted into target at the + * position indicated by offset16. + *

+ *

+ * The offset argument must be greater than or equal to 0, and less than or equal to the length + * of source. + * + * @param target String buffer to insert to + * @param offset16 Offset which char32 will be inserted in + * @param char32 Codepoint to be inserted + * @return a reference to target + * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. + * @stable ICU 2.1 + */ + public static StringBuffer insert(StringBuffer target, int offset16, int char32) { + String str = valueOf(char32); + if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) { + offset16++; + } + target.insert(offset16, str); + return target; + } + + /** + * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the + * middle of a supplementary codepoint, char32 will be inserted after the supplementary + * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise. + *

+ * The overall effect is exactly as if the argument were converted to a string by the method + * valueOf(char) and the characters in that string were then inserted into target at the + * position indicated by offset16. + *

+ *

+ * The offset argument must be greater than or equal to 0, and less than or equal to the limit. + * + * @param target Char array to insert to + * @param limit End index of the char array, limit <= target.length + * @param offset16 Offset which char32 will be inserted in + * @param char32 Codepoint to be inserted + * @return new limit size + * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. + * @stable ICU 2.1 + */ + public static int insert(char target[], int limit, int offset16, int char32) { + String str = valueOf(char32); + if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) { + offset16++; + } + int size = str.length(); + if (limit + size > target.length) { + throw new ArrayIndexOutOfBoundsException(offset16 + size); + } + System.arraycopy(target, offset16, target, offset16 + size, limit - offset16); + target[offset16] = str.charAt(0); + if (size == 2) { + target[offset16 + 1] = str.charAt(1); + } + return limit + size; + } + + /** + * Removes the codepoint at the specified position in this target (shortening target by 1 + * character if the codepoint is a non-supplementary, 2 otherwise). + * + * @param target String buffer to remove codepoint from + * @param offset16 Offset which the codepoint will be removed + * @return a reference to target + * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. + * @stable ICU 2.1 + */ + public static StringBuffer delete(StringBuffer target, int offset16) { + int count = 1; + switch (bounds(target, offset16)) { + case LEAD_SURROGATE_BOUNDARY: + count++; + break; + case TRAIL_SURROGATE_BOUNDARY: + count++; + offset16--; + break; + } + target.delete(offset16, offset16 + count); + return target; + } + + /** + * Removes the codepoint at the specified position in this target (shortening target by 1 + * character if the codepoint is a non-supplementary, 2 otherwise). 
+ * + * @param target String buffer to remove codepoint from + * @param limit End index of the char array, limit <= target.length + * @param offset16 Offset which the codepoint will be removed + * @return a new limit size + * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. + * @stable ICU 2.1 + */ + public static int delete(char target[], int limit, int offset16) { + int count = 1; + switch (bounds(target, 0, limit, offset16)) { + case LEAD_SURROGATE_BOUNDARY: + count++; + break; + case TRAIL_SURROGATE_BOUNDARY: + count++; + offset16--; + break; + } + System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count)); + target[limit - count] = 0; + return limit - count; + } + + /** + * Returns the index within the argument UTF16 format Unicode string of the first occurrence of + * the argument codepoint. I.e., the smallest index i such that + * UTF16.charAt(source, i) == + * char32 is true. + *

+ * If no such character occurs in this string, then -1 is returned. + *

+ *

+ * Examples:
+ * UTF16.indexOf("abc", 'a') returns 0
+ * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3
+ * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1
+ *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param char32 Codepoint to search for + * @return the index of the first occurrence of the codepoint in the argument Unicode string, or + * -1 if the codepoint does not occur. + * @stable ICU 2.6 + */ + public static int indexOf(String source, int char32) { + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); + } + // non-surrogate bmp + if (char32 < LEAD_SURROGATE_MIN_VALUE + || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { + return source.indexOf((char) char32); + } + // surrogate + if (char32 < SUPPLEMENTARY_MIN_VALUE) { + int result = source.indexOf((char) char32); + if (result >= 0) { + if (isLeadSurrogate((char) char32) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + 1))) { + return indexOf(source, char32, result + 1); + } + // trail surrogate + if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { + return indexOf(source, char32, result + 1); + } + } + return result; + } + // supplementary + String char32str = toString(char32); + return source.indexOf(char32str); + } + + /** + * Returns the index within the argument UTF16 format Unicode string of the first occurrence of + * the argument string str. This method is implemented based on codepoints, hence a "lead + * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str + * starts with trail surrogate character at index 0, a source with a leading a surrogate + * character before str found at in source will not have a valid match. Vice versa for lead + * surrogates that ends str. See example below. + *

+ * If no such string str occurs in this source, then -1 is returned. + *

+ *

+ * Examples:
+ * UTF16.indexOf("abc", "ab") returns 0
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1
+ *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param str UTF16 format Unicode string to search for + * @return the index of the first occurrence of the codepoint in the argument Unicode string, or + * -1 if the codepoint does not occur. + * @stable ICU 2.6 + */ + public static int indexOf(String source, String str) { + int strLength = str.length(); + // non-surrogate ends + if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { + return source.indexOf(str); + } + + int result = source.indexOf(str); + int resultEnd = result + strLength; + if (result >= 0) { + // check last character + if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(resultEnd + 1))) { + return indexOf(source, str, resultEnd + 1); + } + // check first character which is a trail surrogate + if (isTrailSurrogate(str.charAt(0)) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + return indexOf(source, str, resultEnd + 1); + } + } + return result; + } + + /** + * Returns the index within the argument UTF16 format Unicode string of the first occurrence of + * the argument codepoint. I.e., the smallest index i such that:
+ * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true. + *

+ * If no such character occurs in this string, then -1 is returned. + *

+ *

+ * Examples:
+ * UTF16.indexOf("abc", 'a', 1) returns -1
+ * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3
+ * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1
+ *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param char32 Codepoint to search for + * @param fromIndex The index to start the search from. + * @return the index of the first occurrence of the codepoint in the argument Unicode string at + * or after fromIndex, or -1 if the codepoint does not occur. + * @stable ICU 2.6 + */ + public static int indexOf(String source, int char32, int fromIndex) { + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); + } + // non-surrogate bmp + if (char32 < LEAD_SURROGATE_MIN_VALUE + || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { + return source.indexOf((char) char32, fromIndex); + } + // surrogate + if (char32 < SUPPLEMENTARY_MIN_VALUE) { + int result = source.indexOf((char) char32, fromIndex); + if (result >= 0) { + if (isLeadSurrogate((char) char32) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + 1))) { + return indexOf(source, char32, result + 1); + } + // trail surrogate + if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { + return indexOf(source, char32, result + 1); + } + } + return result; + } + // supplementary + String char32str = toString(char32); + return source.indexOf(char32str, fromIndex); + } + + /** + * Returns the index within the argument UTF16 format Unicode string of the first occurrence of + * the argument string str. This method is implemented based on codepoints, hence a "lead + * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str + * starts with trail surrogate character at index 0, a source with a leading a surrogate + * character before str found at in source will not have a valid match. Vice versa for lead + * surrogates that ends str. 
See example below. + *

+ * If no such string str occurs in this source, then -1 is returned. + *

+ *

+ * Examples:
+ * UTF16.indexOf("abc", "ab", 0) returns 0
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3
+ * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1
+ *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param str UTF16 format Unicode string to search for + * @param fromIndex The index to start the search from. + * @return the index of the first occurrence of the codepoint in the argument Unicode string, or + * -1 if the codepoint does not occur. + * @stable ICU 2.6 + */ + public static int indexOf(String source, String str, int fromIndex) { + int strLength = str.length(); + // non-surrogate ends + if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { + return source.indexOf(str, fromIndex); + } + + int result = source.indexOf(str, fromIndex); + int resultEnd = result + strLength; + if (result >= 0) { + // check last character + if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(resultEnd))) { + return indexOf(source, str, resultEnd + 1); + } + // check first character which is a trail surrogate + if (isTrailSurrogate(str.charAt(0)) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + return indexOf(source, str, resultEnd + 1); + } + } + return result; + } + + /** + * Returns the index within the argument UTF16 format Unicode string of the last occurrence of + * the argument codepoint. I.e., the index returned is the largest value i such that: + * UTF16.charAt(source, i) == char32 is true. + *

+ * Examples:
+ * UTF16.lastIndexOf("abc", 'a') returns 0
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1
+ *

+ *

+ * source is searched backwards starting at the last character. + *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param char32 Codepoint to search for + * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint + * does not occur. + * @stable ICU 2.6 + */ + public static int lastIndexOf(String source, int char32) { + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); + } + // non-surrogate bmp + if (char32 < LEAD_SURROGATE_MIN_VALUE + || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { + return source.lastIndexOf((char) char32); + } + // surrogate + if (char32 < SUPPLEMENTARY_MIN_VALUE) { + int result = source.lastIndexOf((char) char32); + if (result >= 0) { + if (isLeadSurrogate((char) char32) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + 1))) { + return lastIndexOf(source, char32, result - 1); + } + // trail surrogate + if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { + return lastIndexOf(source, char32, result - 1); + } + } + return result; + } + // supplementary + String char32str = toString(char32); + return source.lastIndexOf(char32str); + } + + /** + * Returns the index within the argument UTF16 format Unicode string of the last occurrence of + * the argument string str. This method is implemented based on codepoints, hence a "lead + * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str + * starts with trail surrogate character at index 0, a source with a leading a surrogate + * character before str found at in source will not have a valid match. Vice versa for lead + * surrogates that ends str. See example below. + *

+ * Examples:
+ * UTF16.lastIndexOf("abc", "a") returns 0
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1
+ *

+ *

+ * source is searched backwards starting at the last character. + *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param str UTF16 format Unicode string to search for + * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint + * does not occur. + * @stable ICU 2.6 + */ + public static int lastIndexOf(String source, String str) { + int strLength = str.length(); + // non-surrogate ends + if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { + return source.lastIndexOf(str); + } + + int result = source.lastIndexOf(str); + if (result >= 0) { + // check last character + if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + strLength + 1))) { + return lastIndexOf(source, str, result - 1); + } + // check first character which is a trail surrogate + if (isTrailSurrogate(str.charAt(0)) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + return lastIndexOf(source, str, result - 1); + } + } + return result; + } + + /** + *

+ * Returns the index within the argument UTF16 format Unicode string of the last occurrence of + * the argument codepoint, where the result is less than or equals to fromIndex. + *

+ *

+ * This method is implemented based on codepoints, hence a single surrogate character will not + * match a supplementary character. + *

+ *

+ * source is searched backwards starting at the last character starting at the specified index. + *

+ *

+ * Examples:
+ * UTF16.lastIndexOf("abc", 'c', 2) returns 2
+ * UTF16.lastIndexOf("abc", 'c', 1) returns -1
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1
+ *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param char32 Codepoint to search for + * @param fromIndex the index to start the search from. There is no restriction on the value of + * fromIndex. If it is greater than or equal to the length of this string, it has the + * same effect as if it were equal to one less than the length of this string: this + * entire string may be searched. If it is negative, it has the same effect as if it + * were -1: -1 is returned. + * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint + * does not occur. + * @stable ICU 2.6 + */ + public static int lastIndexOf(String source, int char32, int fromIndex) { + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); + } + // non-surrogate bmp + if (char32 < LEAD_SURROGATE_MIN_VALUE + || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { + return source.lastIndexOf((char) char32, fromIndex); + } + // surrogate + if (char32 < SUPPLEMENTARY_MIN_VALUE) { + int result = source.lastIndexOf((char) char32, fromIndex); + if (result >= 0) { + if (isLeadSurrogate((char) char32) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + 1))) { + return lastIndexOf(source, char32, result - 1); + } + // trail surrogate + if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { + return lastIndexOf(source, char32, result - 1); + } + } + return result; + } + // supplementary + String char32str = toString(char32); + return source.lastIndexOf(char32str, fromIndex); + } + + /** + *

+ * Returns the index within the argument UTF16 format Unicode string of the last occurrence of + * the argument string str, where the result is less than or equals to fromIndex. + *

+ *

+ * This method is implemented based on codepoints, hence a "lead surrogate character + trail + * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate + * character at index 0, a source with a leading a surrogate character before str found at in + * source will not have a valid match. Vice versa for lead surrogates that ends str. + *

+ * See example below. + *

+ * Examples:
+ * UTF16.lastIndexOf("abc", "c", 2) returns 2
+ * UTF16.lastIndexOf("abc", "c", 1) returns -1
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3
+ * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1
+ *

+ *

+ * source is searched backwards starting at the last character. + *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string that will be searched + * @param str UTF16 format Unicode string to search for + * @param fromIndex the index to start the search from. There is no restriction on the value of + * fromIndex. If it is greater than or equal to the length of this string, it has the + * same effect as if it were equal to one less than the length of this string: this + * entire string may be searched. If it is negative, it has the same effect as if it + * were -1: -1 is returned. + * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint + * does not occur. + * @stable ICU 2.6 + */ + public static int lastIndexOf(String source, String str, int fromIndex) { + int strLength = str.length(); + // non-surrogate ends + if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { + return source.lastIndexOf(str, fromIndex); + } + + int result = source.lastIndexOf(str, fromIndex); + if (result >= 0) { + // check last character + if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) + && isTrailSurrogate(source.charAt(result + strLength))) { + return lastIndexOf(source, str, result - 1); + } + // check first character which is a trail surrogate + if (isTrailSurrogate(str.charAt(0)) && result > 0 + && isLeadSurrogate(source.charAt(result - 1))) { + return lastIndexOf(source, str, result - 1); + } + } + return result; + } + + /** + * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of + * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16 + * format Unicode string source, then source will be returned. 
Otherwise, a new String object is + * created that represents a codepoint sequence identical to the codepoint sequence represented + * by source, except that every occurrence of oldChar32 is replaced by an occurrence of + * newChar32. + *

+ * Examples:
+ * UTF16.replace("mesquite in your cellar", 'e', 'o');
+ * returns "mosquito in your collar"
+ * UTF16.replace("JonL", 'q', 'x');
+ * returns "JonL" (no change)
+ * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!');
+ * returns "Supplementary character !"
+ * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!');
+ * returns "Supplementary character \ud800\udc00"
+ *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string which the codepoint replacements will be based on. + * @param oldChar32 Non-zero old codepoint to be replaced. + * @param newChar32 The new codepoint to replace oldChar32 + * @return new String derived from source by replacing every occurrence of oldChar32 with + * newChar32, unless when no oldChar32 is found in source then source will be returned. + * @stable ICU 2.6 + */ + public static String replace(String source, int oldChar32, int newChar32) { + if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint"); + } + if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint"); + } + + int index = indexOf(source, oldChar32); + if (index == -1) { + return source; + } + String newChar32Str = toString(newChar32); + int oldChar32Size = 1; + int newChar32Size = newChar32Str.length(); + StringBuffer result = new StringBuffer(source); + int resultIndex = index; + + if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) { + oldChar32Size = 2; + } + + while (index != -1) { + int endResultIndex = resultIndex + oldChar32Size; + result.replace(resultIndex, endResultIndex, newChar32Str); + int lastEndIndex = index + oldChar32Size; + index = indexOf(source, oldChar32, lastEndIndex); + resultIndex += newChar32Size + index - lastEndIndex; + } + return result.toString(); + } + + /** + * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr + * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string + * source, then source will be returned. 
Otherwise, a new String object is created that + * represents a codepoint sequence identical to the codepoint sequence represented by source, + * except that every occurrence of oldStr is replaced by an occurrence of newStr. + *

+ * Examples:
+ * UTF16.replace("mesquite in your cellar", "e", "o");
+ * returns "mosquito in your collar"
+ * UTF16.replace("mesquite in your cellar", "mesquite", "cat");
+ * returns "cat in your cellar"
+ * UTF16.replace("JonL", "q", "x");
+ * returns "JonL" (no change)
+ * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!');
+ * returns "Supplementary character !"
+ * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!');
+ * returns "Supplementary character \ud800\udc00"
+ *

+ * Note this method is provided as support to jdk 1.3, which does not support supplementary + * characters to its fullest. + * + * @param source UTF16 format Unicode string which the replacements will be based on. + * @param oldStr Non-zero-length string to be replaced. + * @param newStr The new string to replace oldStr + * @return new String derived from source by replacing every occurrence of oldStr with newStr. + * When no oldStr is found in source, then source will be returned. + * @stable ICU 2.6 + */ + public static String replace(String source, String oldStr, String newStr) { + int index = indexOf(source, oldStr); + if (index == -1) { + return source; + } + int oldStrSize = oldStr.length(); + int newStrSize = newStr.length(); + StringBuffer result = new StringBuffer(source); + int resultIndex = index; + + while (index != -1) { + int endResultIndex = resultIndex + oldStrSize; + result.replace(resultIndex, endResultIndex, newStr); + int lastEndIndex = index + oldStrSize; + index = indexOf(source, oldStr, lastEndIndex); + resultIndex += newStrSize + index - lastEndIndex; + } + return result.toString(); + } + + /** + * Reverses a UTF16 format Unicode string and replaces source's content with it. This method + * will reverse surrogate characters correctly, instead of blindly reversing every character. + *

+ * Examples:
+ * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))
+ * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS". + * + * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed + * @return a modified source with reversed UTF16 format Unicode string. + * @stable ICU 2.6 + */ + public static StringBuffer reverse(StringBuffer source) { + int length = source.length(); + StringBuffer result = new StringBuffer(length); + for (int i = length; i-- > 0;) { + char ch = source.charAt(i); + if (isTrailSurrogate(ch) && i > 0) { + char ch2 = source.charAt(i - 1); + if (isLeadSurrogate(ch2)) { + result.append(ch2); + result.append(ch); + --i; + continue; + } + } + result.append(ch); + } + return result; + } + + /** + * Check if the string contains more Unicode code points than a certain number. This is more + * efficient than counting all code points in the entire string and comparing that number with a + * threshold. This function may not need to scan the string at all if the length is within a + * certain range, and never needs to count more than 'number + 1' code points. Logically + * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two + * code units. + * + * @param source The input string. + * @param number The number of code points in the string is compared against the 'number' + * parameter. + * @return boolean value for whether the string contains more Unicode code points than 'number'. 
+ * @stable ICU 2.4 + */ + public static boolean hasMoreCodePointsThan(String source, int number) { + if (number < 0) { + return true; + } + if (source == null) { + return false; + } + int length = source.length(); + + // length >= 0 known + // source contains at least (length + 1) / 2 code points: <= 2 + // chars per cp + if (((length + 1) >> 1) > number) { + return true; + } + + // check if source does not even contain enough chars + int maxsupplementary = length - number; + if (maxsupplementary <= 0) { + return false; + } + + // there are maxsupplementary = length - number more chars than + // asked-for code points + + // count code points until they exceed and also check that there are + // no more than maxsupplementary supplementary code points (char pairs) + int start = 0; + while (true) { + if (length == 0) { + return false; + } + if (number == 0) { + return true; + } + if (isLeadSurrogate(source.charAt(start++)) && start != length + && isTrailSurrogate(source.charAt(start))) { + start++; + if (--maxsupplementary <= 0) { + // too many pairs - too few code points + return false; + } + } + --number; + } + } + + /** + * Check if the sub-range of char array, from argument start to limit, contains more Unicode + * code points than a certain number. This is more efficient than counting all code points in + * the entire char array range and comparing that number with a threshold. This function may not + * need to scan the char array at all if start and limit is within a certain range, and never + * needs to count more than 'number + 1' code points. Logically equivalent to + * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one + * or two code units. 
+     *
+     * @param source Array of UTF-16 chars
+     * @param start Offset to substring in the source array for analyzing
+     * @param limit Offset to substring in the source array for analyzing
+     * @param number The number of code points in the string is compared against the 'number'
+     *            parameter.
+     * @return boolean value for whether the string contains more Unicode code points than 'number'.
+     * @exception IndexOutOfBoundsException Thrown when start or limit is negative, or when
+     *                limit < start
+     * @stable ICU 2.4
+     */
+    public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
+        final int length = limit - start;
+        // The range is validated first, before 'number' or 'source' are looked at.
+        if (length < 0 || start < 0 || limit < 0) {
+            throw new IndexOutOfBoundsException(
+                    "Start and limit indexes should be non-negative and start <= limit");
+        }
+        // Any range, even of a null array, has more than a negative number of code points.
+        if (number < 0) {
+            return true;
+        }
+        if (source == null) {
+            return false;
+        }
+
+        // A code point occupies at most two chars, so the range holds at
+        // least (length + 1) / 2 code points.
+        if (((length + 1) >> 1) > number) {
+            return true;
+        }
+
+        // Without more chars than 'number' there cannot be more code points.
+        int pairBudget = length - number;
+        if (pairBudget <= 0) {
+            return false;
+        }
+
+        // pairBudget is the surplus of chars over the asked-for code points.
+        // Each surrogate pair uses up one surplus char; once the surplus is
+        // gone the range cannot contain more than 'number' code points.
+        int index = start;
+        for (;;) {
+            if (length == 0) {
+                return false;
+            }
+            if (number == 0) {
+                return true;
+            }
+            final char unit = source[index++];
+            if (isLeadSurrogate(unit) && index != limit
+                    && isTrailSurrogate(source[index])) {
+                // skip the trail surrogate that completes this code point
+                index++;
+                if (--pairBudget <= 0) {
+                    // too many pairs - too few code points
+                    return false;
+                }
+            }
+            --number;
+        }
+    }
+
+    /**
+     * Check if the string buffer contains more Unicode code points than a certain number. This is
+     * more efficient than counting all code points in the entire string buffer and comparing that
+     * number with a threshold.
This function may not need to scan the string buffer at all if the
+     * length is within a certain range, and never needs to count more than 'number + 1' code
+     * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy
+     * either one or two code units.
+     *
+     * @param source The input string buffer.
+     * @param number The number of code points in the string buffer is compared against the 'number'
+     *            parameter.
+     * @return boolean value for whether the string buffer contains more Unicode code points than
+     *         'number'.
+     * @stable ICU 2.4
+     */
+    public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
+        // Any buffer, even a null one, has more than a negative number of code points.
+        if (number < 0) {
+            return true;
+        }
+        if (source == null) {
+            return false;
+        }
+        final int length = source.length();
+
+        // A code point occupies at most two chars, so the buffer holds at
+        // least (length + 1) / 2 code points.
+        if (((length + 1) >> 1) > number) {
+            return true;
+        }
+
+        // Without more chars than 'number' there cannot be more code points.
+        int pairBudget = length - number;
+        if (pairBudget <= 0) {
+            return false;
+        }
+
+        // pairBudget is the surplus of chars over the asked-for code points.
+        // Each surrogate pair uses up one surplus char; once the surplus is
+        // gone the buffer cannot contain more than 'number' code points.
+        int index = 0;
+        for (;;) {
+            if (length == 0) {
+                return false;
+            }
+            if (number == 0) {
+                return true;
+            }
+            final char unit = source.charAt(index++);
+            if (isLeadSurrogate(unit) && index != length
+                    && isTrailSurrogate(source.charAt(index))) {
+                // skip the trail surrogate that completes this code point
+                index++;
+                if (--pairBudget <= 0) {
+                    // too many pairs - too few code points
+                    return false;
+                }
+            }
+            --number;
+        }
+    }
+
+    /**
+     * Cover JDK 1.5 API. Create a String from an array of codePoints.
+     *
+     * @param codePoints The code array
+     * @param offset The start of the text in the code point array
+     * @param count The number of code points
+     * @return a String representing the code points between offset and count
+     * @throws IllegalArgumentException If count is negative or an invalid code point is encountered
+     * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
+     * @stable ICU 3.0
+     */
+    public static String newString(int[] codePoints, int offset, int count) {
+        if (count < 0) {
+            throw new IllegalArgumentException();
+        }
+        // Start with one char per code point; enlarged below only if a
+        // supplementary code point makes that too small.
+        char[] chars = new char[count];
+        int w = 0;
+        for (int r = offset, e = offset + count; r < e; ++r) {
+            int cp = codePoints[r];
+            if (cp < 0 || cp > 0x10ffff) {
+                throw new IllegalArgumentException();
+            }
+            final int needed = (cp < 0x010000) ? 1 : 2;
+            if (w + needed > chars.length) {
+                // Grow explicitly instead of catching IndexOutOfBoundsException.
+                // The (e - r) code points still to be written need at most two
+                // chars each, so this single resize is always sufficient.
+                long capacity = (long) w + 2L * (e - r);
+                char[] temp = new char[(int) Math.min(capacity, (long) Integer.MAX_VALUE)];
+                System.arraycopy(chars, 0, temp, 0, w);
+                chars = temp;
+            }
+            if (needed == 1) {
+                chars[w++] = (char) cp;
+            } else {
+                // supplementary code point: write the lead/trail surrogate pair
+                chars[w++] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
+                chars[w++] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
+            }
+        }
+        return new String(chars, 0, w);
+    }
+
+    // private data members -------------------------------------------------
+
+    /**
+     * Shift value for lead surrogate to form a supplementary character.
+     */
+    private static final int LEAD_SURROGATE_SHIFT_ = 10;
+
+    /**
+     * Mask to retrieve the significant value from a trail surrogate.
+     */
+    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
+
+    /**
+     * Value that all lead surrogate starts with
+     */
+    private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
+            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
+
+    // private methods ------------------------------------------------------
+
+    /**
+     *

+ * Converts argument code point and returns a String object representing the code point's value + * in UTF16 format. + *

+ *

+ * This method does not check for the validity of the codepoint, the results are not guaranteed + * if a invalid codepoint is passed as argument. + *

+ *

+ * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise. + *

+ * + * @param ch + * code point + * @return string representation of the code point + */ + private static String toString(int ch) { + if (ch < SUPPLEMENTARY_MIN_VALUE) { + return String.valueOf((char) ch); + } + + StringBuilder result = new StringBuilder(); + result.append(getLeadSurrogate(ch)); + result.append(getTrailSurrogate(ch)); + return result.toString(); + } +} +// eof Property changes on: lucene\src\java\com\ibm\icu\charset\UTF16.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/com/ibm/icu/charset/UConverterAlias.java =================================================================== --- lucene/src/java/com/ibm/icu/charset/UConverterAlias.java (revision 0) +++ lucene/src/java/com/ibm/icu/charset/UConverterAlias.java (revision 0) @@ -0,0 +1,769 @@ +/** +******************************************************************************* +* Copyright (C) 2006-2010, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.charset; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; + +final class UConverterAlias { + static final int UNNORMALIZED = 0; + + static final int STD_NORMALIZED = 1; + + static final int AMBIGUOUS_ALIAS_MAP_BIT = 0x8000; + + static final int CONTAINS_OPTION_BIT = 0x4000; + + static final int CONVERTER_INDEX_MASK = 0xFFF; + + static final int NUM_RESERVED_TAGS = 2; + + static final int NUM_HIDDEN_TAGS = 1; + + static int[] gConverterList = null; + + static int[] gTagList = null; + + static int[] gAliasList = null; + + static int[] gUntaggedConvArray = null; + + static int[] gTaggedAliasArray = null; + + static int[] gTaggedAliasLists = null; + + static int[] gOptionTable = null; + + static byte[] gStringTable = null; + + static byte[] gNormalizedStringTable = null; + + static final String GET_STRING(int idx) { + return new String(gStringTable, 2 * idx, strlen(gStringTable, 2 * idx)); + } + + private static final String GET_NORMALIZED_STRING(int idx) { + return new String(gNormalizedStringTable, 2 * idx, strlen(gNormalizedStringTable, 2 * idx)); + } + + public static final int strlen(byte[] sArray, int sBegin) + { + int i = sBegin; + while(i < sArray.length && sArray[i++] != 0) {} + return i - sBegin - 1; + } + + /*private*/ static final int tocLengthIndex = 0; + + private static final int converterListIndex = 1; + + private static final int tagListIndex = 2; + + private static final int aliasListIndex = 3; + + private static final int untaggedConvArrayIndex = 4; + + private static final int taggedAliasArrayIndex = 5; + + private static final int taggedAliasListsIndex = 6; + + private static final int optionTableIndex = 7; + + private static final int stringTableIndex = 8; + + private static final 
int normalizedStringTableIndex = 9; + + private static final int minTocLength = 9; /* + * min. tocLength in the file, + * does not count the + * tocLengthIndex! + */ + + private static final int offsetsCount = minTocLength + 1; /* + * length of the + * swapper's + * temporary + * offsets[] + */ + + static ByteBuffer gAliasData = null; + + private static final boolean isAlias(String alias) { + if (alias == null) { + throw new IllegalArgumentException("Alias param is null!"); + } + return (alias.length() != 0); + } + + private static final String CNVALIAS_DATA_FILE_NAME = "/cnvalias.icu"; + + /** + * Default buffer size of datafile + */ + private static final int CNVALIAS_DATA_BUFFER_SIZE = 25000; + + private static final synchronized boolean haveAliasData() + throws IOException{ + return true; + } + + // U_CFUNC const char * io_getConverterName(const char *alias, UErrorCode + // *pErrorCode) +// public static final String io_getConverterName(String alias) +// throws IOException{ +// if (haveAliasData() && isAlias(alias)) { +// boolean[] isAmbigous = new boolean[1]; +// int convNum = findConverter(alias, isAmbigous); +// if (convNum < gConverterList.length) { +// return GET_STRING(gConverterList[(int) convNum]); +// } +// /* else converter not found */ +// } +// return null; +// } + + /* + * search for an alias return the converter number index for gConverterList + */ + // static U_INLINE uint32_t findConverter(const char *alias, UErrorCode + // *pErrorCode) + private static final int findConverter(String alias, boolean[] isAmbigous) { + int mid, start, limit; + int lastMid; + int result; + StringBuilder strippedName = new StringBuilder(); + String aliasToCompare; + + stripForCompare(strippedName, alias); + alias = strippedName.toString(); + + /* do a binary search for the alias */ + start = 0; + limit = gUntaggedConvArray.length; + mid = limit; + lastMid = Integer.MAX_VALUE; + + for (;;) { + mid = (start + limit) / 2; + if (lastMid == mid) { /* Have we moved? 
*/ + break; /* We haven't moved, and it wasn't found. */ + } + lastMid = mid; + aliasToCompare = GET_NORMALIZED_STRING(gAliasList[mid]); + result = alias.compareTo(aliasToCompare); + + if (result < 0) { + limit = mid; + } else if (result > 0) { + start = mid; + } else { + /* + * Since the gencnval tool folds duplicates into one entry, this + * alias in gAliasList is unique, but different standards may + * map an alias to different converters. + */ + if ((gUntaggedConvArray[mid] & AMBIGUOUS_ALIAS_MAP_BIT) != 0) { + isAmbigous[0]=true; + } + /* State whether the canonical converter name contains an option. + This information is contained in this list in order to maintain backward & forward compatibility. */ + /*if (containsOption) { + UBool containsCnvOptionInfo = (UBool)gMainTable.optionTable->containsCnvOptionInfo; + *containsOption = (UBool)((containsCnvOptionInfo + && ((gMainTable.untaggedConvArray[mid] & UCNV_CONTAINS_OPTION_BIT) != 0)) + || !containsCnvOptionInfo); + }*/ + return gUntaggedConvArray[mid] & CONVERTER_INDEX_MASK; + } + } + return Integer.MAX_VALUE; + } + + /** + * stripForCompare Remove the underscores, dashes and spaces from + * the name, and convert the name to lower case. + * + * @param dst The destination buffer, which is <= the buffer of name. + * @param name The alias to strip + * @return the destination buffer. 
+ */ + public static final StringBuilder stripForCompare(StringBuilder dst, String name) { + return io_stripASCIIForCompare(dst, name); + } + + // enum { + private static final byte IGNORE = 0; + private static final byte ZERO = 1; + private static final byte NONZERO = 2; + static final byte MINLETTER = 3; /* any values from here on are lowercase letter mappings */ + // } + + /* character types for ASCII 00..7F */ + static final byte asciiTypes[] = new byte[] { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0, + 0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0, + 0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0 + }; + + private static final char GET_CHAR_TYPE(char c) { + return (char)((c < asciiTypes.length) ? 
asciiTypes[c] : (char)IGNORE); + } + + /** @see UConverterAlias#compareNames */ + private static final StringBuilder io_stripASCIIForCompare(StringBuilder dst, String name) { + int nameIndex = 0; + char type, nextType; + char c1; + boolean afterDigit = false; + + while (nameIndex < name.length()) { + c1 = name.charAt(nameIndex++); + type = GET_CHAR_TYPE(c1); + switch (type) { + case IGNORE: + afterDigit = false; + continue; /* ignore all but letters and digits */ + case ZERO: + if (!afterDigit && nameIndex < name.length()) { + nextType = GET_CHAR_TYPE(name.charAt(nameIndex)); + if (nextType == ZERO || nextType == NONZERO) { + continue; /* ignore leading zero before another digit */ + } + } + break; + case NONZERO: + afterDigit = true; + break; + default: + c1 = type; /* lowercased letter */ + afterDigit = false; + break; + } + dst.append(c1); + } + return dst; + } + + /** + * Do a fuzzy compare of a two converter/alias names. The comparison is + * case-insensitive. It also ignores the characters '-', '_', and ' ' (dash, + * underscore, and space). Thus the strings "UTF-8", "utf_8", and "Utf 8" + * are exactly equivalent. + * + * This is a symmetrical (commutative) operation; order of arguments is + * insignificant. This is an important property for sorting the list (when + * the list is preprocessed into binary form) and for performing binary + * searches on it at run time. + * + * @param name1 + * a converter name or alias, zero-terminated + * @param name2 + * a converter name or alias, zero-terminated + * @return 0 if the names match, or a negative value if the name1 lexically + * precedes name2, or a positive value if the name1 lexically + * follows name2. 
+     *
+     * @see UConverterAlias#stripForCompare
+     */
+    static int compareNames(String name1, String name2){
+        int rc, name1Index = 0, name2Index = 0;
+        char type, nextType;
+        char c1, c2;
+        boolean afterDigit1 = false, afterDigit2 = false;
+
+        for (;;) {
+            /*
+             * Fetch the next significant character of name1. c1 stays 0 when
+             * name1 has none left; 0 is never delivered otherwise because
+             * GET_CHAR_TYPE classifies it as IGNORE.
+             */
+            c1 = 0;
+            while (name1Index < name1.length()) {
+                char ch = name1.charAt(name1Index++);
+                type = GET_CHAR_TYPE(ch);
+                switch (type) {
+                case IGNORE:
+                    afterDigit1 = false;
+                    continue; /* ignore all but letters and digits */
+                case ZERO:
+                    if (!afterDigit1 && name1Index < name1.length()) {
+                        nextType = GET_CHAR_TYPE(name1.charAt(name1Index));
+                        if (nextType == ZERO || nextType == NONZERO) {
+                            continue; /* ignore leading zero before another digit */
+                        }
+                    }
+                    break;
+                case NONZERO:
+                    afterDigit1 = true;
+                    break;
+                default:
+                    ch = type; /* lowercased letter */
+                    afterDigit1 = false;
+                    break;
+                }
+                c1 = ch;
+                break; /* deliver c1 */
+            }
+
+            /* Same for name2; c2 stays 0 when name2 is exhausted. */
+            c2 = 0;
+            while (name2Index < name2.length()) {
+                char ch = name2.charAt(name2Index++);
+                type = GET_CHAR_TYPE(ch);
+                switch (type) {
+                case IGNORE:
+                    afterDigit2 = false;
+                    continue; /* ignore all but letters and digits */
+                case ZERO:
+                    /* the look-ahead reads name2, so it must be bounded by name2's length */
+                    if (!afterDigit2 && name2Index < name2.length()) {
+                        nextType = GET_CHAR_TYPE(name2.charAt(name2Index));
+                        if (nextType == ZERO || nextType == NONZERO) {
+                            continue; /* ignore leading zero before another digit */
+                        }
+                    }
+                    break;
+                case NONZERO:
+                    afterDigit2 = true;
+                    break;
+                default:
+                    ch = type; /* lowercased letter */
+                    afterDigit2 = false;
+                    break;
+                }
+                c2 = ch;
+                break; /* deliver c2 */
+            }
+
+            /* If we reach the ends of both strings then they match */
+            if ((c1 | c2) == 0) {
+                return 0;
+            }
+
+            /*
+             * Case-insensitive comparison. The characters delivered last are
+             * compared too, and an exhausted name (0) sorts before a longer one.
+             */
+            rc = (int)c1 - (int)c2;
+            if (rc != 0) {
+                return rc;
+            }
+        }
+    }
+
+    static int io_countAliases(String alias)
+                        throws IOException{
+        if (haveAliasData() && isAlias(alias)) {
+            boolean[] isAmbigous = new boolean[1];
+            int convNum = findConverter(alias, isAmbigous);
+            if (convNum < gConverterList.length) {
+                /*
tagListNum - 1 is the ALL tag */ + int listOffset = gTaggedAliasArray[(gTagList.length - 1) + * gConverterList.length + convNum]; + + if (listOffset != 0) { + return gTaggedAliasLists[listOffset]; + } + /* else this shouldn't happen. internal program error */ + } + /* else converter not found */ + } + return 0; + } + + /** + * Return the number of all aliases (and converter names). + * + * @return the number of all aliases + */ + // U_CFUNC uint16_t io_countTotalAliases(UErrorCode *pErrorCode); +// static int io_countTotalAliases() throws IOException{ +// if (haveAliasData()) { +// return (int) gAliasList.length; +// } +// return 0; +// } + + // U_CFUNC const char * io_getAlias(const char *alias, uint16_t n, + // UErrorCode *pErrorCode) + static String io_getAlias(String alias, int n) throws IOException{ + if (haveAliasData() && isAlias(alias)) { + boolean[] isAmbigous = new boolean[1]; + int convNum = findConverter(alias,isAmbigous); + if (convNum < gConverterList.length) { + /* tagListNum - 1 is the ALL tag */ + int listOffset = gTaggedAliasArray[(gTagList.length - 1) + * gConverterList.length + convNum]; + + if (listOffset != 0) { + //int listCount = gTaggedAliasListsArray[listOffset]; + /* +1 to skip listCount */ + int[] currListArray = gTaggedAliasLists; + int currListArrayIndex = listOffset + 1; + + return GET_STRING(currListArray[currListArrayIndex + n]); + + } + /* else this shouldn't happen. 
internal program error */
+            }
+            /* else converter not found */
+        }
+        return null;
+    }
+
+    // U_CFUNC uint16_t io_countStandards(UErrorCode *pErrorCode) {
+//    static int io_countStandards() throws IOException{
+//        if (haveAliasData()) {
+//            return (int) (gTagList.length - NUM_HIDDEN_TAGS);
+//        }
+//        return 0;
+//    }
+
+    // U_CAPI const char * U_EXPORT2getStandard(uint16_t n, UErrorCode
+    // *pErrorCode)
+//    static String getStandard(int n) throws IOException{
+//        if (haveAliasData()) {
+//            return GET_STRING(gTagList[n]);
+//        }
+//        return null;
+//    }
+
+    // U_CAPI const char * U_EXPORT2 getStandardName(const char *alias, const
+    // char *standard, UErrorCode *pErrorCode)
+    /*
+     * Returns the first name in the tagged alias list that
+     * findTaggedAliasListsOffset() selects for the given alias and standard,
+     * or null when the alias is empty, no usable list offset is found, or
+     * that first entry is 0.
+     */
+    static final String getStandardName(String alias, String standard)throws IOException {
+        if (haveAliasData() && isAlias(alias)) {
+            int listOffset = findTaggedAliasListsOffset(alias, standard);
+
+            if (0 < listOffset && listOffset < gTaggedAliasLists.length) {
+                int[] currListArray = gTaggedAliasLists;
+                /* +1 to skip listCount */
+                int currListArrayIndex = listOffset + 1;
+                /*
+                 * Test the first entry of this list, not element 0 of the whole
+                 * table; findTaggedAliasListsOffset() tests the same slot.
+                 */
+                if (currListArrayIndex < currListArray.length
+                        && currListArray[currListArrayIndex] != 0) {
+                    return GET_STRING(currListArray[currListArrayIndex]);
+                }
+            }
+        }
+        return null;
+    }
+
+    // U_CAPI uint16_t U_EXPORT2 countAliases(const char *alias, UErrorCode
+    // *pErrorCode)
+    static int countAliases(String alias) throws IOException{
+        return io_countAliases(alias);
+    }
+
+    // U_CAPI const char* U_EXPORT2 getAlias(const char *alias, uint16_t n,
+    // UErrorCode *pErrorCode)
+    static String getAlias(String alias, int n) throws IOException{
+        return io_getAlias(alias, n);
+    }
+
+    // U_CFUNC uint16_t countStandards(void)
+//    static int countStandards()throws IOException{
+//        return io_countStandards();
+//    }
+
+    /*returns a single Name from the list, will return NULL if out of bounds
+     */
+    static String getAvailableName (int n){
+        try{
+          if (0 <= n && n <= 0xffff) {
+            String name = bld_getAvailableConverter(n);
+            return name;
+          }
+        }catch(IOException ex){
+            //throw away exception
+        }
+        return null;
+    }
+    // U_CAPI const char
* U_EXPORT2 getCanonicalName(const char *alias, const + // char *standard, UErrorCode *pErrorCode) { + static String getCanonicalName(String alias, String standard) throws IOException{ + if (haveAliasData() && isAlias(alias)) { + int convNum = findTaggedConverterNum(alias, standard); + + if (convNum < gConverterList.length) { + return GET_STRING(gConverterList[convNum]); + } + } + + return null; + } + static int countAvailable (){ + try{ + return bld_countAvailableConverters(); + }catch(IOException ex){ + //throw away exception + } + return -1; + } + + // U_CAPI UEnumeration * U_EXPORT2 openStandardNames(const char *convName, + // const char *standard, UErrorCode *pErrorCode) +/* static final UConverterAliasesEnumeration openStandardNames(String convName, String standard)throws IOException { + UConverterAliasesEnumeration aliasEnum = null; + if (haveAliasData() && isAlias(convName)) { + int listOffset = findTaggedAliasListsOffset(convName, standard); + + + * When listOffset == 0, we want to acknowledge that the converter + * name and standard are okay, but there is nothing to enumerate. 
+ + if (listOffset < gTaggedAliasLists.length) { + + UConverterAliasesEnumeration.UAliasContext context = new UConverterAliasesEnumeration.UAliasContext(listOffset, 0); + aliasEnum = new UConverterAliasesEnumeration(); + aliasEnum.setContext(context); + } + else converter or tag not found + } + return aliasEnum; + }*/ + + // static uint32_t getTagNumber(const char *tagname) + private static int getTagNumber(String tagName) { + if (gTagList != null) { + int tagNum; + for (tagNum = 0; tagNum < gTagList.length; tagNum++) { + if (tagName.equals(GET_STRING(gTagList[tagNum]))) { + return tagNum; + } + } + } + + return Integer.MAX_VALUE; + } + + // static uint32_t findTaggedAliasListsOffset(const char *alias, const char + // *standard, UErrorCode *pErrorCode) + private static int findTaggedAliasListsOffset(String alias, String standard) { + int idx; + int listOffset; + int convNum; + int tagNum = getTagNumber(standard); + boolean[] isAmbigous = new boolean[1]; + /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ + convNum = findConverter(alias, isAmbigous); + + if (tagNum < (gTagList.length - NUM_HIDDEN_TAGS) + && convNum < gConverterList.length) { + listOffset = gTaggedAliasArray[tagNum + * gConverterList.length + convNum]; + if (listOffset != 0 + && gTaggedAliasLists[listOffset + 1] != 0) { + return listOffset; + } + if (isAmbigous[0]==true) { + /* + * Uh Oh! They used an ambiguous alias. We have to search the + * whole swiss cheese starting at the highest standard affinity. + * This may take a while. 
+ */ + + for (idx = 0; idx < gTaggedAliasArray.length; idx++) { + listOffset = gTaggedAliasArray[idx]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + int currTagNum = idx / gConverterList.length; + int currConvNum = (idx - currTagNum + * gConverterList.length); + int tempListOffset = gTaggedAliasArray[tagNum + * gConverterList.length + currConvNum]; + if (tempListOffset != 0 + && gTaggedAliasLists[tempListOffset + 1] != 0) { + return tempListOffset; + } + /* + * else keep on looking We could speed this up by + * starting on the next row because an alias is unique + * per row, right now. This would change if alias + * versioning appears. + */ + } + } + /* The standard doesn't know about the alias */ + } + /* else no default name */ + return 0; + } + /* else converter or tag not found */ + + return Integer.MAX_VALUE; + } + + /* Return the canonical name */ + // static uint32_t findTaggedConverterNum(const char *alias, const char + // *standard, UErrorCode *pErrorCode) + private static int findTaggedConverterNum(String alias, String standard) { + int idx; + int listOffset; + int convNum; + int tagNum = getTagNumber(standard); + boolean[] isAmbigous = new boolean[1]; + + /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ + convNum = findConverter(alias, isAmbigous); + + if (tagNum < (gTagList.length - NUM_HIDDEN_TAGS) + && convNum < gConverterList.length) { + listOffset = gTaggedAliasArray[tagNum + * gConverterList.length + convNum]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + return convNum; + } + if (isAmbigous[0] == true) { + /* + * Uh Oh! They used an ambiguous alias. We have to search one + * slice of the swiss cheese. We search only in the requested + * tag, not the whole thing. This may take a while. 
+ */ + int convStart = (tagNum) * gConverterList.length; + int convLimit = (tagNum + 1) * gConverterList.length; + for (idx = convStart; idx < convLimit; idx++) { + listOffset = gTaggedAliasArray[idx]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + return idx - convStart; + } + } + /* The standard doesn't know about the alias */ + } + /* else no canonical name */ + } + /* else converter or tag not found */ + + return Integer.MAX_VALUE; + } + + // static U_INLINE UBool isAliasInList(const char *alias, uint32_t + // listOffset) + private static boolean isAliasInList(String alias, int listOffset) { + if (listOffset != 0) { + int currAlias; + int listCount = gTaggedAliasLists[listOffset]; + /* +1 to skip listCount */ + int[] currList = gTaggedAliasLists; + int currListArrayIndex = listOffset + 1; + for (currAlias = 0; currAlias < listCount; currAlias++) { + if (currList[currAlias + currListArrayIndex] != 0 + && compareNames( + alias, + GET_STRING(currList[currAlias + currListArrayIndex])) == 0) { + return true; + } + } + } + return false; + } + + // begin bld.c + static String[] gAvailableConverters = null; + + static int gAvailableConverterCount = 0; + + static byte[] gDefaultConverterNameBuffer; // [MAX_CONVERTER_NAME_LENGTH + + // 1]; /* +1 for NULL */ + + static String gDefaultConverterName = null; + + // static UBool haveAvailableConverterList(UErrorCode *pErrorCode) + static boolean haveAvailableConverterList() throws IOException{ + if (gAvailableConverters == null) { + int idx; + int localConverterCount; + String converterName; + String[] localConverterList; + + if (!haveAliasData()) { + return false; + } + + /* We can't have more than "*converterTable" converters to open */ + localConverterList = new String[gConverterList.length]; + + localConverterCount = 0; + + for (idx = 0; idx < gConverterList.length; idx++) { + converterName = GET_STRING(gConverterList[idx]); + //UConverter cnv = UConverter.open(converterName); + //TODO: Fix me + 
localConverterList[localConverterCount++] = converterName; + + } + + // agljport:todo umtx_lock(NULL); + if (gAvailableConverters == null) { + gAvailableConverters = localConverterList; + gAvailableConverterCount = localConverterCount; + /* haveData should have already registered the cleanup function */ + } else { + // agljport:todo free((char **)localConverterList); + } + // agljport:todo umtx_unlock(NULL); + } + return true; + } + + // U_CFUNC uint16_t bld_countAvailableConverters(UErrorCode *pErrorCode) + static int bld_countAvailableConverters() throws IOException{ + if (haveAvailableConverterList()) { + return gAvailableConverterCount; + } + return 0; + } + + // U_CFUNC const char * bld_getAvailableConverter(uint16_t n, UErrorCode + // *pErrorCode) + static String bld_getAvailableConverter(int n) throws IOException{ + if (haveAvailableConverterList()) { + if (n < gAvailableConverterCount) { + return gAvailableConverters[n]; + } + } + return null; + } + + /* default converter name --------------------------------------------------- */ + + /* + * In order to be really thread-safe, the get function would have to take + * a buffer parameter and copy the current string inside a mutex block. + * This implementation only tries to be really thread-safe while + * setting the name. + * It assumes that setting a pointer is atomic. 
+ */ + + // U_CFUNC const char * getDefaultName() +// static final synchronized String getDefaultName() { +// /* local variable to be thread-safe */ +// String name; +// +// //agljport:todo umtx_lock(null); +// name = gDefaultConverterName; +// //agljport:todo umtx_unlock(null); +// +// if (name == null) { +// //UConverter cnv = null; +// int length = 0; +// +// name = CharsetICU.getDefaultCharsetName(); +// +// /* if the name is there, test it out and get the canonical name with options */ +// if (name != null) { +// // cnv = UConverter.open(name); +// // name = cnv.getName(cnv); +// // TODO: fix me +// } +// +// if (name == null || name.length() == 0 ||/* cnv == null ||*/ +// length >= gDefaultConverterNameBuffer.length) { +// /* Panic time, let's use a fallback. */ +// name = new String("US-ASCII"); +// } +// +// //length=(int32_t)(strlen(name)); +// +// /* Copy the name before we close the converter. */ +// name = gDefaultConverterName; +// } +// +// return name; +// } + + //end bld.c +} \ No newline at end of file Property changes on: lucene\src\java\com\ibm\icu\charset\UConverterAlias.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/com/ibm/icu/charset/CharsetCallback.java =================================================================== --- lucene/src/java/com/ibm/icu/charset/CharsetCallback.java (revision 0) +++ lucene/src/java/com/ibm/icu/charset/CharsetCallback.java (revision 0) @@ -0,0 +1,408 @@ +/** +******************************************************************************* +* Copyright (C) 2006-2010, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CoderResult; + +/** + *

Callback API for CharsetICU API

+ *
+ * CharsetCallback class defines some error behaviour functions called
+ * by CharsetDecoderICU and CharsetEncoderICU. The class also provides
+ * the facility by which clients can write their own callbacks.
+ *
+ * These functions, although public, should NEVER be called directly.
+ * They should be used as parameters to the onUnmappableCharacter() and
+ * onMalformedInput() methods, to set the behaviour of a converter
+ * when it encounters UNMAPPED/INVALID sequences.
+ * Currently the only way to set callbacks is by using CodingErrorAction.
+ * In the future we will provide set methods on CharsetEncoder and CharsetDecoder
+ * that will accept CharsetCallback fields.
+ *
+ * @stable ICU 3.6
+ */
+
+public class CharsetCallback {
+    /*
+     * FROM_U, TO_U context options for sub callback
+     */
+    private static final String SUB_STOP_ON_ILLEGAL = "i";
+
+//    /*
+//     * FROM_U, TO_U context options for skip callback
+//     */
+//    private static final String SKIP_STOP_ON_ILLEGAL = "i";
+
+//    /*
+//     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
+//     */
+//    private static final String ESCAPE_ICU = null;
+
+    /*
+     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
+     */
+    private static final String ESCAPE_JAVA = "J";
+
+    /*
+     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
+     * TO_U_CALLBACK_ESCAPE option to escape the character value according to C (\\xXXXX)
+     */
+    private static final String ESCAPE_C = "C";
+
+    /*
+     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
+     * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
+     */
+    private static final String ESCAPE_XML_DEC = "D";
+
+    /*
+     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
+     * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
+     */
+    private static final String ESCAPE_XML_HEX = "X";
+
+    /*
+     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode ({U+XXXX})
+     */
+    private static final String ESCAPE_UNICODE = "U";
+
+    /*
+     * FROM_U_CALLBACK_ESCAPE context option to escape the code point CSS2-style:
+     * a backslash, the hex digits of the code point, and a terminating space
+     */
+    private static final String ESCAPE_CSS2 = "S";
+
+    /**
+     * Decoder Callback interface
+     * @stable ICU 3.6
+     */
+    public interface Decoder {
+        /**
+         * This function is called when the bytes in the source cannot be handled,
+         * and this function is meant to handle or fix the error if possible.
+         *
+         * @return Result of decoding action. This returned object is set to an error
+         *  if this function could not handle the conversion.
+         * @stable ICU 3.6
+         */
+        public CoderResult call(CharsetDecoderICU decoder, Object context,
+                                ByteBuffer source, CharBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, CoderResult cr);
+    }
+    /**
+     * Encoder Callback interface
+     * @stable ICU 3.6
+     */
+    public interface Encoder {
+        /**
+         * This function is called when the Unicode characters in the source cannot be handled,
+         * and this function is meant to handle or fix the error if possible.
+         * @return Result of encoding action. This returned object is set to an error
+         *  if this function could not handle the conversion.
+         * @stable ICU 3.6
+         */
+        public CoderResult call(CharsetEncoderICU encoder, Object context,
+                                CharBuffer source, ByteBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, int cp, CoderResult cr);
+    }
+    /**
+     * Skip callback: with a null context every error is dropped (UNDERFLOW is returned);
+     * with the "i" context only unmappable input is dropped and other errors are returned as is.
+     * @stable ICU 3.6
+     */
+    public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
+        public CoderResult call(CharsetEncoderICU encoder, Object context,
+                                CharBuffer source, ByteBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, int cp, CoderResult cr){
+            if(context==null){
+                return CoderResult.UNDERFLOW;
+            }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
+                if(!cr.isUnmappable()){
+                    return cr;
+                }else{
+                    return CoderResult.UNDERFLOW;
+                }
+            }
+            return cr;
+        }
+    };
+    /**
+     * Skip callback: with a null context every error is dropped (UNDERFLOW is returned);
+     * with the "i" context only unmappable input is dropped and other errors are returned as is.
+     * @stable ICU 3.6
+     */
+    public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
+        public CoderResult call(CharsetDecoderICU decoder, Object context,
+                                ByteBuffer source, CharBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, CoderResult cr){
+            if(context==null){
+                return CoderResult.UNDERFLOW;
+            }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
+                if(!cr.isUnmappable()){
+                    return cr;
+                }else{
+                    return CoderResult.UNDERFLOW;
+                }
+            }
+            return cr;
+        }
+    };
+    /**
+     * Write substitute callback: delegates to the encoder's cbFromUWriteSub(); with the "i"
+     * context this is done only for unmappable input, other errors are returned as is.
+     * @stable ICU 3.6
+     */
+    public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
+        public CoderResult call(CharsetEncoderICU encoder, Object context,
+                                CharBuffer source, ByteBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, int cp, CoderResult cr){
+            if(context==null){
+                return encoder.cbFromUWriteSub(encoder, source, target, offsets);
+            }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
+                if(!cr.isUnmappable()){
+                    return cr;
+                }else{
+                    return encoder.cbFromUWriteSub(encoder, source, target, offsets);
+                }
+            }
+            return cr;
+        }
+    };
+    /* U+001A, written when a single invalid unit is replaced and the charset defines subChar1 */
+    private static final char[] kSubstituteChar1 = new char[]{0x1A};
+    /* U+FFFD, written in every other substitution case */
+    private static final char[] kSubstituteChar = new char[] {0xFFFD};
+    /**
+     * Write substitute callback
+     * @stable ICU 3.6
+     */
+    public static final Decoder TO_U_CALLBACK_SUBSTITUTE = new Decoder() {
+        public CoderResult call(CharsetDecoderICU decoder, Object context,
+                                ByteBuffer source, CharBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, CoderResult cr){
+
+            CharsetICU cs = (CharsetICU) decoder.charset();
+            /* could optimize this case, just one uchar */
+            if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) {
+                return CharsetDecoderICU.toUWriteUChars(decoder, kSubstituteChar1, 0, 1, target, offsets, source.position());
+            } else {
+                return CharsetDecoderICU.toUWriteUChars(decoder, kSubstituteChar, 0, 1, target, offsets, source.position());
+            }
+        }
+    };
+    /**
+     * Stop callback: returns the incoming result unchanged
+     * @stable ICU 3.6
+     */
+    public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
+        public CoderResult call(CharsetEncoderICU encoder, Object context,
+                                CharBuffer source, ByteBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, int cp, CoderResult cr){
+            return cr;
+        }
+    };
+    /**
+     * Stop callback: returns the incoming result unchanged
+     * @stable ICU 3.6
+     */
+    public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
+        public CoderResult call(CharsetDecoderICU decoder, Object context,
+                                ByteBuffer source, CharBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, CoderResult cr){
+            return cr;
+        }
+    };
+    /* capacity of the scratch buffer the escape callbacks build their output in */
+    private static final int VALUE_STRING_LENGTH = 32;
+    private static final char UNICODE_PERCENT_SIGN_CODEPOINT    = 0x0025;
+    private static final char UNICODE_U_CODEPOINT               = 0x0055;
+    private static final char UNICODE_X_CODEPOINT               = 0x0058;
+    private static final char UNICODE_RS_CODEPOINT              = 0x005C;
+    private static final char UNICODE_U_LOW_CODEPOINT           = 0x0075;
+    private static final char UNICODE_X_LOW_CODEPOINT           = 0x0078;
+    private static final char UNICODE_AMP_CODEPOINT             = 0x0026;
+    private static final char UNICODE_HASH_CODEPOINT            = 0x0023;
+    private static final char UNICODE_SEMICOLON_CODEPOINT       = 0x003B;
+    private static final char UNICODE_PLUS_CODEPOINT            = 0x002B;
+    private static final char UNICODE_LEFT_CURLY_CODEPOINT      = 0x007B;
+    private static final char UNICODE_RIGHT_CURLY_CODEPOINT     = 0x007D;
+    private static final char UNICODE_SPACE_CODEPOINT           = 0x0020;
+    /**
+     * Write escape callback: builds an escaped form of the offending code unit(s) / code point,
+     * chosen by the context string, and writes it through the encoder.
+     * @stable ICU 4.0
+     */
+    public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() {
+        public CoderResult call(CharsetEncoderICU encoder, Object context,
+                                CharBuffer source, ByteBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, int cp, CoderResult cr){
+            char[] valueString = new char[VALUE_STRING_LENGTH];
+            int valueStringLength = 0;
+            int i = 0;
+
+            cr = CoderResult.UNDERFLOW;
+
+            if (context == null || !(context instanceof String)) {
+                while (i < length) {
+                    valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
+                    valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
+                    valueStringLength += itou(valueString, valueStringLength, (int)buffer[i++] & UConverterConstants.UNSIGNED_SHORT_MASK, 16, 4);
+                }
+            } else {
+                if (((String)context).equals(ESCAPE_JAVA)) {
+                    while (i < length) {
+                        valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
+                        valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
+                        valueStringLength += itou(valueString, valueStringLength, (int)buffer[i++] & UConverterConstants.UNSIGNED_SHORT_MASK, 16, 4);
+                    }
+                } else if (((String)context).equals(ESCAPE_C)) {
+                    valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
+
+                    if (length == 2) {
+                        valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
+                        /* must accumulate (+=): plain assignment dropped the 2 chars already written and truncated the escape */
+                        valueStringLength += itou(valueString, valueStringLength, cp, 16, 8);
+                    } else {
+                        valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
+                        valueStringLength += itou(valueString, valueStringLength, (int)buffer[0] & UConverterConstants.UNSIGNED_SHORT_MASK, 16, 4);
+                    }
+                } else if (((String)context).equals(ESCAPE_XML_DEC)) {
+                    valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
+                    valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
+                    if (length == 2) {
+                        valueStringLength += itou(valueString, valueStringLength, cp, 10, 0);
+                    } else {
+                        valueStringLength += itou(valueString, valueStringLength, (int)buffer[0] & UConverterConstants.UNSIGNED_SHORT_MASK, 10, 0);
+                    }
+                    valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
+                } else if (((String)context).equals(ESCAPE_XML_HEX)) {
+                    valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
+                    valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
+                    valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
+                    if (length == 2) {
+                        valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
+                    } else {
+                        valueStringLength += itou(valueString, valueStringLength, (int)buffer[0] & UConverterConstants.UNSIGNED_SHORT_MASK, 16, 0);
+                    }
+                    valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
+                } else if (((String)context).equals(ESCAPE_UNICODE)) {
+                    valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
+                    valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
+                    valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT;          /* adding + */
+                    if (length == 2) {
+                        valueStringLength += itou(valueString, valueStringLength,cp, 16, 4);
+                    } else {
+                        valueStringLength += itou(valueString, valueStringLength, (int)buffer[0] & UConverterConstants.UNSIGNED_SHORT_MASK, 16, 4);
+                    }
+                    valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT;   /* adding } */
+                } else if (((String)context).equals(ESCAPE_CSS2)) {
+                    valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
+                    valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
+                    /* Always add space character, because the next character might be whitespace,
+                       which would erroneously be considered the termination of the escape sequence. */
+                    valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT;
+                } else {
+                    while (i < length) {
+                        valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
+                        valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
+                        valueStringLength += itou(valueString, valueStringLength, (int)buffer[i++] & UConverterConstants.UNSIGNED_SHORT_MASK, 16, 4);
+                    }
+                }
+            }
+
+            cr = encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
+            return cr;
+        }
+    };
+    /**
+     * Write escape callback: builds an escaped form of each offending byte value,
+     * chosen by the context string, and writes it through the decoder.
+     * @stable ICU 4.0
+     */
+    public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() {
+        public CoderResult call(CharsetDecoderICU decoder, Object context,
+                                ByteBuffer source, CharBuffer target, IntBuffer offsets,
+                                char[] buffer, int length, CoderResult cr){
+            char[] uniValueString = new char[VALUE_STRING_LENGTH];
+            int valueStringLength = 0;
+            int i = 0;
+
+            if (context == null || !(context instanceof String)) {
+                while (i < length) {
+                    uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
+                    uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT; /* adding X */
+                    valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
+                }
+            } else {
+                if (((String)context).equals(ESCAPE_XML_DEC)) {
+                    while (i < length) {
+                        uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
+                        uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
+                        valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0);
+                        uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
+                    }
+                } else if (((String)context).equals(ESCAPE_XML_HEX)) {
+                    while (i < length) {
+                        uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
+                        uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
+                        uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
+                        valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0);
+                        uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
+                    }
+                } else if (((String)context).equals(ESCAPE_C)) {
+                    while (i < length) {
+                        uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
+                        uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
+                        valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
+                    }
+                } else {
+                    while (i < length) {
+                        uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
+                        uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;            /* adding X */
+                        itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
+                        valueStringLength += 2;
+                    }
+                }
+            }
+
+            cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);
+
+            return cr;
+        }
+    };
+    /**
+     * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE.
+     * Fills in a char string with the radix-based representation of a number padded with zeroes
+     * to minwidth.
+     *
+     * @param buffer      destination array
+     * @param sourceIndex index in buffer at which the first (most significant) digit is stored
+     * @param i           the number to format
+     * @param radix       the base to format in; digits above 9 are written as 'A', 'B', ...
+     * @param minwidth    minimum number of digits; shorter results are left-padded with '0'
+     * @return the number of chars written starting at sourceIndex
+     */
+    private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) {
+        int length = 0;
+        int digit;
+        int j;
+        char temp;
+
+        /* digits are produced least significant first and reversed below */
+        do {
+            digit = i % radix;
+            buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7));
+            i = i/radix;
+        } while (i != 0 && (sourceIndex + length) < buffer.length);
+
+        while (length < minwidth) {
+            buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */
+        }
+        /* reverses the string */
+        for (j = 0; j < (length / 2); j++) {
+            temp = buffer[(sourceIndex + length - 1) - j];
+            buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j];
+            buffer[sourceIndex + j] = temp;
+        }
+
+        return length;
+    }
+
+    /*
+     * No need to create an instance
+     */
+    private CharsetCallback() {
+    }
+}

Property changes on: lucene\src\java\com\ibm\icu\charset\CharsetCallback.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: lucene/src/java/com/ibm/icu/charset/UConverterConstants.java
===================================================================
--- lucene/src/java/com/ibm/icu/charset/UConverterConstants.java (revision 0)
+++ lucene/src/java/com/ibm/icu/charset/UConverterConstants.java (revision 0)
@@ -0,0 +1,169 @@
+/*
+*******************************************************************************
+* Copyright (C) 2006-2008, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*/
+package com.ibm.icu.charset;
+
+interface UConverterConstants {
+
+    static final short UNSIGNED_BYTE_MASK = 0xff;
+    static final int UNSIGNED_SHORT_MASK = 0xffff;
+    static final long UNSIGNED_INT_MASK = 0xffffffffL;
+
+    static final int U_IS_BIG_ENDIAN = 0;
+
+    /**
+     * Useful constant for the maximum size of the whole locale ID
+     * (including the terminating NULL).
+     */
+    static final int ULOC_FULLNAME_CAPACITY = 56;
+
+    /**
+     * This value is intended for sentinel values for APIs that
+     * (take or) return single code points (UChar32).
+     * It is outside of the Unicode code point range 0..0x10ffff.
+     *
+     * For example, a "done" or "error" value in a new API
+     * could be indicated with U_SENTINEL.
+     *
+     * ICU APIs designed before ICU 2.4 usually define service-specific "done"
+     * values, mostly 0xffff.
+     * Those may need to be distinguished from
+     * actual U+ffff text contents by calling functions like
+     * CharacterIterator::hasNext() or UnicodeString::length().
+     */
+    static final int U_SENTINEL = -1;
+
+    //end utf.h
+
+    //begin ucnv.h
+    /**
+     * Character that separates converter names from options and options from each other.
+     * @see CharsetICU#forNameICU(String)
+     */
+    static final byte OPTION_SEP_CHAR = ',';
+
+    /** Maximum length of a converter name including the terminating NULL */
+    static final int MAX_CONVERTER_NAME_LENGTH = 60;
+    /** Maximum length of a converter name including path and terminating NULL */
+    static final int MAX_FULL_FILE_NAME_LENGTH = (600+MAX_CONVERTER_NAME_LENGTH);
+
+    /** Shift in for EBCDIC_STATEFUL and iso2022 states */
+    static final int SI = 0x0F;
+    /** Shift out for EBCDIC_STATEFUL and iso2022 states */
+    static final int SO = 0x0E;
+
+    //end ucnv.h
+
+    // begin bld.h
+    /* size of the overflow buffers in UConverter, enough for escaping callbacks */
+    //#define ERROR_BUFFER_LENGTH 32
+    static final int ERROR_BUFFER_LENGTH = 32;
+
+    /* at most 4 bytes per substitution character (part of .cnv file format! see UConverterStaticData) */
+    static final int MAX_SUBCHAR_LEN = 4;
+
+    /* at most 8 bytes per character in toUBytes[] (UTF-8 uses up to 6) */
+    static final int MAX_CHAR_LEN = 8;
+
+    /* converter options bits */
+    static final int OPTION_VERSION = 0xf;
+    static final int OPTION_SWAP_LFNL = 0x10;
+    static final int OPTION_MAC = 0x20; //agljport:comment added for Mac ISCII encodings
+
+    static final String OPTION_SWAP_LFNL_STRING = ",swaplfnl";
+
+    /** values for the unicodeMask */
+    static final int HAS_SUPPLEMENTARY = 1;
+    static final int HAS_SURROGATES = 2;
+    // end bld.h
+
+    // begin cnv.h
+    /* this is used in fromUnicode DBCS tables as an "unassigned" marker */
+    static final int missingCharMarker = 0xFFFF;
+    /**
+     * Selector values for which direction(s) of a converter a reset applies to.
+     * @author ram
+     */
+    static interface UConverterResetChoice {
+        static final int RESET_BOTH = 0;
+        static final int RESET_TO_UNICODE = RESET_BOTH + 1;
+        static final int RESET_FROM_UNICODE = RESET_TO_UNICODE + 1;
+    }
+
+    // begin utf16.h
+    /**
+     * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
+     */
+    static final int U16_MAX_LENGTH = 2;
+    // end utf16.h
+
+    // begin err.h
+    /**
+     * FROM_U, TO_U context options for sub callback
+     */
+    static byte[] SUB_STOP_ON_ILLEGAL = {'i'};
+
+    /**
+     * FROM_U, TO_U context options for skip callback
+     */
+    static byte[] SKIP_STOP_ON_ILLEGAL = {'i'};
+
+    /**
+     * The process condition code to be used with the callbacks.
+     * Codes which are greater than IRREGULAR should be
+     * passed on to any chained callbacks.
+     */
+    static interface UConverterCallbackReason {
+        static final int UNASSIGNED = 0; /**< The code point is unassigned.
+                                 The error code U_INVALID_CHAR_FOUND will be set. */
+        static final int ILLEGAL = 1; /**< The code point is illegal. For example,
+                                 \\x81\\x2E is illegal in SJIS because \\x2E
+                                 is not a valid trail byte for the \\x81
+                                 lead byte.
+                                 Also, starting with Unicode 3.0.1, non-shortest byte sequences
+                                 in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061)
+                                 are also illegal, not just irregular.
+                                 The error code U_ILLEGAL_CHAR_FOUND will be set. */
+        static final int IRREGULAR = 2; /**< The codepoint is not a regular sequence in
+                                 the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF
+                                 are irregular UTF-8 byte sequences for single surrogate
+                                 code points.
+                                 The error code U_INVALID_CHAR_FOUND will be set. */
+        static final int RESET = 3; /**< The callback is called with this reason when a
+                                 'reset' has occurred. Callback should reset all
+                                 state. */
+        static final int CLOSE = 4; /**< Called when the converter is closed. The
+                                 callback should release any allocated memory.*/
+        static final int CLONE = 5; /**< Called when safeClone() is called on the
+                                 converter. The pointer available as the
+                                 'context' is an alias to the original converters'
+                                 context pointer. If the context must be owned
+                                 by the new converter, the callback must clone
+                                 the data and call setFromUCallback
+                                 (or setToUCallback) with the correct pointer.
+                                 */
+    }
+    //end err.h
+
+
+    static final String DATA_TYPE = "cnv";
+    static final int CNV_DATA_BUFFER_SIZE = 25000;
+    static final int SIZE_OF_UCONVERTER_SHARED_DATA = 100;
+
+    static final int MAXIMUM_UCS2 = 0x0000FFFF;
+    static final int MAXIMUM_UTF = 0x0010FFFF;
+    //static final int MAXIMUM_UCS4 = 0x7FFFFFFF;
+    static final int HALF_SHIFT = 10;
+    static final int HALF_BASE = 0x0010000;
+    static final int HALF_MASK = 0x3FF;
+    static final int SURROGATE_HIGH_START = 0xD800;
+    static final int SURROGATE_HIGH_END = 0xDBFF;
+    static final int SURROGATE_LOW_START = 0xDC00;
+    static final int SURROGATE_LOW_END = 0xDFFF;
+
+    /* -SURROGATE_LOW_START + HALF_BASE, i.e. -0xDC00 + 0x10000 = 9216 */
+    static final int SURROGATE_LOW_BASE = 9216;
+}

Property changes on: lucene\src\java\com\ibm\icu\charset\UConverterConstants.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: lucene/src/java/com/ibm/icu/charset/CharsetICU.java
===================================================================
--- lucene/src/java/com/ibm/icu/charset/CharsetICU.java (revision 0)
+++ lucene/src/java/com/ibm/icu/charset/CharsetICU.java (revision 0)
@@ -0,0 +1,138 @@
+/**
+*******************************************************************************
+* Copyright (C) 2006-2009, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*
+*******************************************************************************
+*/
+
+package com.ibm.icu.charset;
+
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+
+/**
+ *

+ * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
+ * This API is used to convert codepage or character encoded data to and
+ * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that
+ * converter, you can get its properties, set options, convert your data.</p>
+ *
+ * <p>Since many software programs recognize different converter names for
+ * different types of converters, there are other functions in this API to
+ * iterate over the converter aliases.
+ *
+ * @stable ICU 3.6
+ */
+public abstract class CharsetICU extends Charset{
+
+    String icuCanonicalName;
+    String javaCanonicalName;
+    int options;
+
+    float maxCharsPerByte;
+
+    String name; /* +4: 60 internal name of the converter- invariant chars */
+
+    int codepage; /* +64: 4 codepage # (now IBM-$codepage) */
+
+    byte platform; /* +68: 1 platform of the converter (only IBM now) */
+    byte conversionType; /* +69: 1 conversion type */
+
+    int minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
+    int maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */
+
+    byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */
+    byte subCharLen; /* +76: 1 */
+
+    byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
+    byte hasFromUnicodeFallback; /* +78: 1 */
+    short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
+    byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
+    //byte reserved[/*19*/]; /* +81: 19 to round out the structure */
+
+
+    // typedef enum UConverterUnicodeSet {
+    /**
+     * Parameter that selects the set of roundtrippable Unicode code points.
+     * @stable ICU 4.0
+     */
+    public static final int ROUNDTRIP_SET=0;
+    /**
+     * Select the set of Unicode code points with roundtrip or fallback mappings.
+     * Not supported at this point.
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
+
+    //} UConverterUnicodeSet;
+
+    /**
+     * Creates a charset with the given ICU and Java canonical names.
+     *
+     * @param icuCanonicalName the ICU canonical name, stored as is
+     * @param canonicalName the Java canonical name passed to {@link Charset}; must not be empty
+     * @param aliases the aliases passed to {@link Charset}
+     * @throws IllegalCharsetNameException if canonicalName is empty
+     * @stable ICU 3.6
+     */
+    protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
+        super(canonicalName,aliases);
+        if(canonicalName.length() == 0){
+            throw new IllegalCharsetNameException(canonicalName);
+        }
+        this.javaCanonicalName = canonicalName;
+        this.icuCanonicalName = icuCanonicalName;
+    }
+
+    /**
+     * Ascertains if a charset is a sub set of this charset
+     * Implements the abstract method of super class.
+     * This implementation only reports true for a charset equal to this one.
+     * @param cs charset to test
+     * @return true if the given charset is a subset of this charset
+     * @stable ICU 3.6
+     */
+    public boolean contains(Charset cs){
+        if (null == cs) {
+            return false;
+        } else if (this.equals(cs)) {
+            return true;
+        }
+        return false;
+    }
+
+    /* true for code points in the surrogate range U+D800..U+DFFF */
+    static final boolean isSurrogate(int c){
+        return (((c)&0xfffff800)==0xd800);
+    }
+
+    /*
+     * Returns the default charset name
+     */
+//    static final String getDefaultCharsetName(){
+//        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
+//        return defaultEncoding;
+//    }
+
+    /**
+     * Returns a charset object for the named charset.
+     * This method guarantees that the ICU charset is returned when
+     * available. If the ICU charset provider does not support
+     * the specified charset, then try other charset providers
+     * including the standard Java charset provider.
+     *
+     * @param charsetName The name of the requested charset,
+     *                    may be either a canonical name or an alias
+     * @return A charset object for the named charset
+     * @throws IllegalCharsetNameException If the given charset name
+     *                                     is illegal
+     * @throws UnsupportedCharsetException If no support for the
+     *                                     named charset is available in this instance of the Java
+     *                                     virtual machine
+     * @stable ICU 3.6
+     */
+    public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
+        if ("BOCU-1".equals(charsetName))
+            return new CharsetBOCU1("BOCU-1", "BOCU-1", new String[] { "BOCU1" });
+        return Charset.forName(charsetName); /* documented fallback to the other providers; was "return null" */
+    }
+}

Property changes on: lucene\src\java\com\ibm\icu\charset\CharsetICU.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: lucene/src/java/com/ibm/icu/charset/CharsetEncoderICU.java
===================================================================
--- lucene/src/java/com/ibm/icu/charset/CharsetEncoderICU.java (revision 0)
+++ lucene/src/java/com/ibm/icu/charset/CharsetEncoderICU.java (revision 0)
@@ -0,0 +1,916 @@
+/**
+ *******************************************************************************
+ * Copyright (C) 2006-2009, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ *
+ *******************************************************************************
+ */
+
+package com.ibm.icu.charset;
+
+import java.nio.BufferOverflowException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.IntBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+
+/**
+ * An abstract class that provides framework methods of decoding operations for concrete
+ * subclasses.
+ * In the future this class will contain API that will implement converter semantics of ICU4C. + * @stable ICU 3.6 + */ +public abstract class CharsetEncoderICU extends CharsetEncoder { + + /* this is used in fromUnicode DBCS tables as an "unassigned" marker */ + static final char MISSING_CHAR_MARKER = '\uFFFF'; + + byte[] errorBuffer = new byte[30]; + + int errorBufferLength = 0; + + /** these are for encodeLoopICU */ + int fromUnicodeStatus; + + int fromUChar32; + + boolean useSubChar1; + + boolean useFallback; + + /* maximum number of indexed UChars */ + static final int EXT_MAX_UCHARS = 19; + + /* store previous UChars/chars to continue partial matches */ + int preFromUFirstCP; /* >=0: partial match */ + + char[] preFromUArray = new char[EXT_MAX_UCHARS]; + + int preFromUBegin; + + int preFromULength; /* negative: replay */ + + char[] invalidUCharBuffer = new char[2]; + + int invalidUCharLength; + + Object fromUContext; + + private CharsetCallback.Encoder onUnmappableInput = CharsetCallback.FROM_U_CALLBACK_STOP; + + private CharsetCallback.Encoder onMalformedInput = CharsetCallback.FROM_U_CALLBACK_STOP; + + CharsetCallback.Encoder fromCharErrorBehaviour = new CharsetCallback.Encoder() { + public CoderResult call(CharsetEncoderICU encoder, Object context, + CharBuffer source, ByteBuffer target, IntBuffer offsets, + char[] buffer, int length, int cp, CoderResult cr) { + if (cr.isUnmappable()) { + return onUnmappableInput.call(encoder, context, source, target, + offsets, buffer, length, cp, cr); + } else /* if (cr.isMalformed()) */ { + return onMalformedInput.call(encoder, context, source, target, + offsets, buffer, length, cp, cr); + } + // return CharsetCallback.FROM_U_CALLBACK_STOP.call(encoder, context, source, target, offsets, buffer, length, cp, cr); + + } + }; + + /* + * Construcs a new encoder for the given charset + * + * @param cs + * for which the decoder is created + * @param replacement + * the substitution bytes + */ + CharsetEncoderICU(CharsetICU 
cs, byte[] replacement) { + super(cs, (cs.minBytesPerChar + cs.maxBytesPerChar) / 2, + cs.maxBytesPerChar, replacement); + } + + /** + * Is this Encoder allowed to use fallbacks? A fallback mapping is a mapping + * that will convert a Unicode codepoint sequence to a byte sequence, but + * the encoded byte sequence will round trip convert to a different + * Unicode codepoint sequence. + * @return true if the converter uses fallback, false otherwise. + * @stable ICU 3.8 + */ + public boolean isFallbackUsed() { + return useFallback; + } + + /** + * Sets whether this Encoder can use fallbacks? + * @param usesFallback true if the user wants the converter to take + * advantage of the fallback mapping, false otherwise. + * @stable ICU 3.8 + */ + public void setFallbackUsed(boolean usesFallback) { + useFallback = usesFallback; + } + + /* + * Use fallbacks from Unicode to codepage when useFallback or for private-use code points + * @param c A codepoint + */ + final boolean isFromUUseFallback(int c) { + return (useFallback) + || (Character.getType(c) == Character.PRIVATE_USE); + } + + /** + * Use fallbacks from Unicode to codepage when useFallback or for private-use code points + */ + static final boolean isFromUUseFallback(boolean iUseFallback, int c) { + return (iUseFallback) + || (Character.getType(c) == Character.PRIVATE_USE); + } + + /** + * Sets the action to be taken if an illegal sequence is encountered + * + * @param newAction + * action to be taken + * @exception IllegalArgumentException + * @stable ICU 3.6 + */ + protected void implOnMalformedInput(CodingErrorAction newAction) { + onMalformedInput = getCallback(newAction); + } + + /** + * Sets the action to be taken if an illegal sequence is encountered + * + * @param newAction + * action to be taken + * @exception IllegalArgumentException + * @stable ICU 3.6 + */ + protected void implOnUnmappableCharacter(CodingErrorAction newAction) { + onUnmappableInput = getCallback(newAction); + } + + /** + * Sets the 
callback encoder method and context to be used if an illegal sequence is encountered. + * You would normally call this twice to set both the malform and unmappable error. In this case, + * newContext should remain the same since using a different newContext each time will negate the last + * one used. + * @param err CoderResult + * @param newCallback CharsetCallback.Encoder + * @param newContext Object + * @stable ICU 4.0 + */ + public final void setFromUCallback(CoderResult err, CharsetCallback.Encoder newCallback, Object newContext) { + if (err.isMalformed()) { + onMalformedInput = newCallback; + } else if (err.isUnmappable()) { + onUnmappableInput = newCallback; + } else { + /* Error: Only malformed and unmappable are handled. */ + } + + if (fromUContext == null || !fromUContext.equals(newContext)) { + setFromUContext(newContext); + } + } + + /** + * Sets fromUContext used in callbacks. + * + * @param newContext Object + * @exception IllegalArgumentException The object is an illegal argument for UContext. + * @stable ICU 4.0 + */ + public final void setFromUContext(Object newContext) { + fromUContext = newContext; + } + + private static CharsetCallback.Encoder getCallback(CodingErrorAction action) { + if (action == CodingErrorAction.REPLACE) { + return CharsetCallback.FROM_U_CALLBACK_SUBSTITUTE; + } else if (action == CodingErrorAction.IGNORE) { + return CharsetCallback.FROM_U_CALLBACK_SKIP; + } else /* if (action == CodingErrorAction.REPORT) */ { + return CharsetCallback.FROM_U_CALLBACK_STOP; + } + } + + private static final CharBuffer EMPTY = CharBuffer.allocate(0); + + /** + * Flushes any characters saved in the converter's internal buffer and + * resets the converter. + * @param out action to be taken + * @return result of flushing action and completes the decoding all input. + * Returns CoderResult.UNDERFLOW if the action succeeds. 
+ * @stable ICU 3.6 + */ + protected CoderResult implFlush(ByteBuffer out) { + return encode(EMPTY, out, null, true); + } + + /** + * Resets the from Unicode mode of converter + * @stable ICU 3.6 + */ + protected void implReset() { + errorBufferLength = 0; + fromUnicodeStatus = 0; + fromUChar32 = 0; + fromUnicodeReset(); + } + + private void fromUnicodeReset() { + preFromUBegin = 0; + preFromUFirstCP = UConverterConstants.U_SENTINEL; + preFromULength = 0; + } + + /** + * Encodes one or more chars. The default behaviour of the + * converter is stop and report if an error in input stream is encountered. + * To set different behaviour use @see CharsetEncoder.onMalformedInput() + * @param in buffer to decode + * @param out buffer to populate with decoded result + * @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding + * action succeeds or more input is needed for completing the decoding action. + * @stable ICU 3.6 + */ + protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) { + if (!in.hasRemaining() && this.errorBufferLength == 0) { // make sure the errorBuffer is empty + // The Java framework should have already substituted what was left. + fromUChar32 = 0; + //fromUnicodeReset(); + return CoderResult.UNDERFLOW; + } + in.position(in.position() + fromUCountPending()); + /* do the conversion */ + CoderResult ret = encode(in, out, null, false); + setSourcePosition(in); + /* No need to reset to keep the proper state of the encoder. + if (ret.isUnderflow() && in.hasRemaining()) { + // The Java framework is going to substitute what is left. + //fromUnicodeReset(); + } */ + return ret; + } + + /* + * Implements ICU semantics of buffer management + * @param source + * @param target + * @param offsets + * @return A CoderResult object that contains the error result when an error occurs. 
+ */ + abstract CoderResult encodeLoop(CharBuffer source, ByteBuffer target, + IntBuffer offsets, boolean flush); + + /* + * Implements ICU semantics for encoding the buffer + * @param source The input character buffer + * @param target The output byte buffer + * @param offsets + * @param flush true if, and only if, the invoker can provide no + * additional input bytes beyond those in the given buffer. + * @return A CoderResult object that contains the error result when an error occurs. + */ + final CoderResult encode(CharBuffer source, ByteBuffer target, + IntBuffer offsets, boolean flush) { + + /* check parameters */ + if (target == null || source == null) { + throw new IllegalArgumentException(); + } + + /* + * Make sure that the buffer sizes do not exceed the number range for + * int32_t because some functions use the size (in units or bytes) + * rather than comparing pointers, and because offsets are int32_t values. + * + * size_t is guaranteed to be unsigned and large enough for the job. + * + * Return with an error instead of adjusting the limits because we would + * not be able to maintain the semantics that either the source must be + * consumed or the target filled (unless an error occurs). + * An adjustment would be targetLimit=t+0x7fffffff; for example. 
+ */ + + /* flush the target overflow buffer */ + if (errorBufferLength > 0) { + byte[] overflowArray; + int i, length; + + overflowArray = errorBuffer; + length = errorBufferLength; + i = 0; + do { + if (target.remaining() == 0) { + /* the overflow buffer contains too much, keep the rest */ + int j = 0; + + do { + overflowArray[j++] = overflowArray[i++]; + } while (i < length); + + errorBufferLength = (byte) j; + return CoderResult.OVERFLOW; + } + + /* copy the overflow contents to the target */ + target.put(overflowArray[i++]); + if (offsets != null) { + offsets.put(-1); /* no source index available for old output */ + } + } while (i < length); + + /* the overflow buffer is completely copied to the target */ + errorBufferLength = 0; + } + + if (!flush && source.remaining() == 0 && preFromULength >= 0) { + /* the overflow buffer is emptied and there is no new input: we are done */ + return CoderResult.UNDERFLOW; + } + + /* + * Do not simply return with a buffer overflow error if + * !flush && t==targetLimit + * because it is possible that the source will not generate any output. + * For example, the skip callback may be called; + * it does not output anything. + */ + + return fromUnicodeWithCallback(source, target, offsets, flush); + + } + + /* + * Implementation note for m:n conversions + * + * While collecting source units to find the longest match for m:n conversion, + * some source units may need to be stored for a partial match. + * When a second buffer does not yield a match on all of the previously stored + * source units, then they must be "replayed", i.e., fed back into the converter. + * + * The code relies on the fact that replaying will not nest - + * converting a replay buffer will not result in a replay. + * This is because a replay is necessary only after the _continuation_ of a + * partial match failed, but a replay buffer is converted as a whole. 
+ * It may result in some of its units being stored again for a partial match, + * but there will not be a continuation _during_ the replay which could fail. + * + * It is conceivable that a callback function could call the converter + * recursively in a way that causes another replay to be stored, but that + * would be an error in the callback function. + * Such violations will cause assertion failures in a debug build, + * and wrong output, but they will not cause a crash. + */ + final CoderResult fromUnicodeWithCallback(CharBuffer source, + ByteBuffer target, IntBuffer offsets, boolean flush) { + int sBufferIndex; + int sourceIndex; + int errorInputLength; + boolean converterSawEndOfInput, calledCallback; + + /* variables for m:n conversion */ + CharBuffer replayArray = CharBuffer.allocate(EXT_MAX_UCHARS); + int replayArrayIndex = 0; + CharBuffer realSource; + boolean realFlush; + + CoderResult cr = CoderResult.UNDERFLOW; + + /* get the converter implementation function */ + sourceIndex = 0; + + if (preFromULength >= 0) { + /* normal mode */ + realSource = null; + realFlush = false; + } else { + /* + * Previous m:n conversion stored source units from a partial match + * and failed to consume all of them. + * We need to "replay" them from a temporary buffer and convert them first. 
+ */ + realSource = source; + realFlush = flush; + + //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR); + replayArray.put(preFromUArray, 0, -preFromULength); + source = replayArray; + source.position(replayArrayIndex); + source.limit(replayArrayIndex - preFromULength); //preFromULength is negative, see declaration + flush = false; + + preFromULength = 0; + } + + /* + * loop for conversion and error handling + * + * loop { + * convert + * loop { + * update offsets + * handle end of input + * handle errors/call callback + * } + * } + */ + for (;;) { + /* convert */ + cr = encodeLoop(source, target, offsets, flush); + /* + * set a flag for whether the converter + * successfully processed the end of the input + * + * need not check cnv.preFromULength==0 because a replay (<0) will cause + * s 0) { + + /* + * if a converter handles offsets and updates the offsets + * pointer at the end, then offset should not change + * here; + * however, some converters do not handle offsets at all + * (sourceIndex<0) or may not update the offsets pointer + */ + /* offsets.position(offsets.position() + length); + } + + if (sourceIndex >= 0) { + sourceIndex += (int) (source.position()); + } + } */ + + if (preFromULength < 0) { + /* + * switch the source to new replay units (cannot occur while replaying) + * after offset handling and before end-of-input and callback handling + */ + if (realSource == null) { + realSource = source; + realFlush = flush; + + //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR); + replayArray.put(preFromUArray, 0, -preFromULength); + + source = replayArray; + source.position(replayArrayIndex); + source.limit(replayArrayIndex - preFromULength); + flush = false; + if ((sourceIndex += preFromULength) < 0) { + sourceIndex = -1; + } + + preFromULength = 0; + } else { + /* see implementation note before 
_fromUnicodeWithCallback() */ + //agljport:todo U_ASSERT(realSource==NULL); + assert realSource == null; + } + } + + /* update pointers */ + sBufferIndex = source.position(); + if (cr.isUnderflow()) { + if (sBufferIndex < source.limit()) { + /* + * continue with the conversion loop while there is still input left + * (continue converting by breaking out of only the inner loop) + */ + break; + } else if (realSource != null) { + /* switch back from replaying to the real source and continue */ + source = realSource; + flush = realFlush; + sourceIndex = source.position(); + realSource = null; + break; + } else if (flush && fromUChar32 != 0) { + /* + * the entire input stream is consumed + * and there is a partial, truncated input sequence left + */ + + /* inject an error and continue with callback handling */ + //err[0]=ErrorCode.U_TRUNCATED_CHAR_FOUND; + cr = CoderResult.malformedForLength(1); + calledCallback = false; /* new error condition */ + } else { + /* input consumed */ + if (flush) { + /* + * return to the conversion loop once more if the flush + * flag is set and the conversion function has not + * successfully processed the end of the input yet + * + * (continue converting by breaking out of only the inner loop) + */ + if (!converterSawEndOfInput) { + break; + } + + /* reset the converter without calling the callback function */ + implReset(); + } + + /* done successfully */ + return cr; + } + } + + /*U_FAILURE(*err) */ + { + + if (calledCallback || cr.isOverflow() + || (!cr.isMalformed() && !cr.isUnmappable())) { + /* + * the callback did not or cannot resolve the error: + * set output pointers and return + * + * the check for buffer overflow is redundant but it is + * a high-runner case and hopefully documents the intent + * well + * + * if we were replaying, then the replay buffer must be + * copied back into the UConverter + * and the real arguments must be restored + */ + if (realSource != null) { + int length; + + //agljport:todo 
U_ASSERT(cnv.preFromULength==0); + + length = source.remaining(); + if (length > 0) { + //UConverterUtility.uprv_memcpy(preFromUArray, 0, sourceArray, pArgs.sourceBegin, length*UMachine.U_SIZEOF_UCHAR); + source.get(preFromUArray, 0, length); + preFromULength = (byte) -length; + } + source = realSource; + flush = realFlush; + } + return cr; + } + } + + /* callback handling */ + { + int codePoint; + + /* get and write the code point */ + codePoint = fromUChar32; + errorInputLength = UTF16.append(invalidUCharBuffer, 0, + fromUChar32); + invalidUCharLength = errorInputLength; + + /* set the converter state to deal with the next character */ + fromUChar32 = 0; + + /* call the callback function */ + cr = fromCharErrorBehaviour.call(this, fromUContext, + source, target, offsets, invalidUCharBuffer, + invalidUCharLength, codePoint, cr); + } + + /* + * loop back to the offset handling + * + * this flag will indicate after offset handling + * that a callback was called; + * if the callback did not resolve the error, then we return + */ + calledCallback = true; + } + } + } + + /* + * Ascertains if a given Unicode code point (32bit value for handling surrogates) + * can be converted to the target encoding. If the caller wants to test if a + * surrogate pair can be converted to target encoding then the + * responsibility of assembling the int value lies with the caller. + * For assembling a code point the caller can use UTF16 class of ICU4J and do something like: + *

+     *  while(i
+     * or
+     * 
+     *  String src = new String(mySource);
+     *  int i,codepoint;
+     *  boolean passed = false;
+     *  while(i<src.length()){
+     *      codepoint = UTF16.charAt(src,i);
+     *      i+= (codepoint>0xfff)? 2:1;
+     *      if(!(CharsetEncoderICU) myConv).canEncode(codepoint)){
+     *          passed = false;
+     *      }
+     *  }
+     * 
+ * + * @param codepoint Unicode code point as int value + * @return true if a character can be converted + */ + /* TODO This is different from Java's canEncode(char) API. + * ICU's API should implement getUnicodeSet, + * and override canEncode(char) which queries getUnicodeSet. + * The getUnicodeSet should return a frozen UnicodeSet or use a fillin parameter, like ICU4C. + */ + /*public boolean canEncode(int codepoint) { + return true; + }*/ + /** + * Overrides super class method + * @stable ICU 3.6 + */ + public boolean isLegalReplacement(byte[] repl) { + return true; + } + + /* + * Writes out the specified output bytes to the target byte buffer or to converter internal buffers. + * @param cnv + * @param bytesArray + * @param bytesBegin + * @param bytesLength + * @param out + * @param offsets + * @param sourceIndex + * @return A CoderResult object that contains the error result when an error occurs. + */ + static final CoderResult fromUWriteBytes(CharsetEncoderICU cnv, + byte[] bytesArray, int bytesBegin, int bytesLength, ByteBuffer out, + IntBuffer offsets, int sourceIndex) { + + //write bytes + int obl = bytesLength; + CoderResult cr = CoderResult.UNDERFLOW; + int bytesLimit = bytesBegin + bytesLength; + try { + for (; bytesBegin < bytesLimit;) { + out.put(bytesArray[bytesBegin]); + bytesBegin++; + } + // success + bytesLength = 0; + } catch (BufferOverflowException ex) { + cr = CoderResult.OVERFLOW; + } + + if (offsets != null) { + while (obl > bytesLength) { + offsets.put(sourceIndex); + --obl; + } + } + //write overflow + cnv.errorBufferLength = bytesLimit - bytesBegin; + if (cnv.errorBufferLength > 0) { + int index = 0; + while (bytesBegin < bytesLimit) { + cnv.errorBuffer[index++] = bytesArray[bytesBegin++]; + } + cr = CoderResult.OVERFLOW; + } + return cr; + } + + /* + * Returns the number of chars held in the converter's internal state + * because more input is needed for completing the conversion. 
This function is + * useful for mapping semantics of ICU's converter interface to those of iconv, + * and this information is not needed for normal conversion. + * @return The number of chars in the state. -1 if an error is encountered. + */ + /*public*/int fromUCountPending() { + if (preFromULength > 0) { + return UTF16.getCharCount(preFromUFirstCP) + preFromULength; + } else if (preFromULength < 0) { + return -preFromULength; + } else if (fromUChar32 > 0) { + return 1; + } else if (preFromUFirstCP > 0) { + return UTF16.getCharCount(preFromUFirstCP); + } + return 0; + } + + /** + * + * @param source + */ + private final void setSourcePosition(CharBuffer source) { + + // ok was there input held in the previous invocation of encodeLoop + // that resulted in output in this invocation? + source.position(source.position() - fromUCountPending()); + } + + /* + * Write the codepage substitution character. + * Subclasses to override this method. + * For stateful converters, it is typically necessary to handle this + * specificially for the converter in order to properly maintain the state. + * @param source The input character buffer + * @param target The output byte buffer + * @param offsets + * @return A CoderResult object that contains the error result when an error occurs. + */ + CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source, + ByteBuffer target, IntBuffer offsets) { + CharsetICU cs = (CharsetICU) encoder.charset(); + byte[] sub = encoder.replacement(); + if (cs.subChar1 != 0 && encoder.invalidUCharBuffer[0] <= 0xff) { + return CharsetEncoderICU.fromUWriteBytes(encoder, + new byte[] { cs.subChar1 }, 0, 1, target, offsets, source + .position()); + } else { + return CharsetEncoderICU.fromUWriteBytes(encoder, sub, 0, + sub.length, target, offsets, source.position()); + } + } + + /* + * Write the characters to target. 
+ * @param source The input character buffer + * @param target The output byte buffer + * @param offsets + * @return A CoderResult object that contains the error result when an error occurs. + */ + CoderResult cbFromUWriteUChars(CharsetEncoderICU encoder, + CharBuffer source, ByteBuffer target, IntBuffer offsets) { + CoderResult cr = CoderResult.UNDERFLOW; + + /* This is a fun one. Recursion can occur - we're basically going to + * just retry shoving data through the same converter. Note, if you got + * here through some kind of invalid sequence, you maybe should emit a + * reset sequence of some kind. Since this IS an actual conversion, + * take care that you've changed the callback or the data, or you'll + * get an infinite loop. + */ + + int oldTargetPosition = target.position(); + int offsetIndex = source.position(); + + cr = encoder.encode(source, target, null, false); /* no offsets and no flush */ + + if (offsets != null) { + while (target.position() != oldTargetPosition) { + offsets.put(offsetIndex); + oldTargetPosition++; + } + } + + /* Note, if you did something like used a stop subcallback, things would get interesting. + * In fact, here's where we want to return the partially consumed in-source! + */ + if (cr.isOverflow()) { + /* Overflowed target. Now, we'll write into the charErrorBuffer. + * It's a fixed size. If we overflow it...Hm + */ + + /* start the new target at the first free slot in the error buffer */ + int errBuffLen = encoder.errorBufferLength; + ByteBuffer newTarget = ByteBuffer.wrap(encoder.errorBuffer); + newTarget.position(errBuffLen); /* set the position at the end of the error buffer */ + encoder.errorBufferLength = 0; + + encoder.encode(source, newTarget, null, false); + + encoder.errorBuffer = newTarget.array(); + encoder.errorBufferLength = newTarget.position(); + } + + return cr; + } + + /** + *

+ * Handles a common situation where a character has been read and it may be + * a lead surrogate followed by a trail surrogate. This method can change + * the source position and will modify fromUChar32. + *

+ * + *

+ * If null is returned, then there was success in reading a + * surrogate pair, the codepoint is stored in fromUChar32 and + * fromUChar32 should be reset (to 0) after being read. + *

+ * + * @param source + * The encoding source. + * @param lead + * A character that may be the first in a surrogate pair. + * @return CoderResult.malformedForLength(1) or + * CoderResult.UNDERFLOW if there is a problem, or + * null if there isn't. + * @see #handleSurrogates(CharBuffer, char) + * @see #handleSurrogates(CharBuffer, int, char) + * @see #handleSurrogates(char[], int, int, char) + */ + final CoderResult handleSurrogates(CharBuffer source, char lead) { + if (!Character.isHighSurrogate(lead)) { + fromUChar32 = lead; + return CoderResult.malformedForLength(1); + } + + if (!source.hasRemaining()) { + fromUChar32 = lead; + return CoderResult.UNDERFLOW; + } + + char trail = source.get(); + + if (!Character.isLowSurrogate(trail)) { + fromUChar32 = lead; + source.position(source.position() - 1); + return CoderResult.malformedForLength(1); + } + + fromUChar32 = Character.toCodePoint(lead, trail); + return null; + } + + /** + *

+ * Same as handleSurrogates(CharBuffer, char), but with arrays. As an added + * requirement, the calling method must also increment the index if this method returns + * null. + *

+ * + * + * @param source + * The encoding source. + * @param lead + * A character that may be the first in a surrogate pair. + * @return CoderResult.malformedForLength(1) or + * CoderResult.UNDERFLOW if there is a problem, or null if + * there isn't. + * @see #handleSurrogates(CharBuffer, char) + * @see #handleSurrogates(CharBuffer, int, char) + * @see #handleSurrogates(char[], int, int, char) + */ + final CoderResult handleSurrogates(char[] sourceArray, int sourceIndex, + int sourceLimit, char lead) { + if (!Character.isHighSurrogate(lead)) { + fromUChar32 = lead; + return CoderResult.malformedForLength(1); + } + + if (sourceIndex >= sourceLimit) { + fromUChar32 = lead; + return CoderResult.UNDERFLOW; + } + + char trail = sourceArray[sourceIndex]; + + if (!Character.isLowSurrogate(trail)) { + fromUChar32 = lead; + return CoderResult.malformedForLength(1); + } + + fromUChar32 = Character.toCodePoint(lead, trail); + return null; + } +} Property changes on: lucene\src\java\com\ibm\icu\charset\CharsetEncoderICU.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java =================================================================== --- lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java (revision 966583) +++ lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndTruncQuery.java (working copy) @@ -107,7 +107,7 @@ while(text != null) { if (text != null && text.startsWith(prefixRef)) { - String textString = text.utf8ToString(); + String textString = text.bocu1ToString(); matcher.reset(textString.substring(prefixLength)); if (matcher.matches()) { mtv.visitMatchingTerm(new Term(fieldName, textString)); Index: lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java 
=================================================================== --- lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java (revision 966583) +++ lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/surround/query/SrndPrefixQuery.java (working copy) @@ -68,7 +68,7 @@ mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName)); } else if (status == TermsEnum.SeekStatus.NOT_FOUND) { if (termsEnum.term().startsWith(prefixRef)) { - mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString())); + mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().bocu1ToString())); } else { skip = true; } @@ -81,7 +81,7 @@ while(true) { BytesRef text = termsEnum.next(); if (text != null && text.startsWith(prefixRef)) { - mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString())); + mtv.visitMatchingTerm(new Term(fieldName, text.bocu1ToString())); } else { break; } Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexHTML.java =================================================================== --- lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexHTML.java (revision 966583) +++ lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexHTML.java (working copy) @@ -122,7 +122,7 @@ if (deleting) { // delete rest of stale docs BytesRef text; while ((text=uidIter.next()) != null) { - String termText = text.utf8ToString(); + String termText = text.bocu1ToString(); System.out.println("deleting " + HTMLDocument.uid2url(termText)); reader.deleteDocuments(new Term("uid", termText)); @@ -153,7 +153,7 @@ BytesRef text; while((text = uidIter.next()) != null) { - String termText = text.utf8ToString(); + String termText = text.bocu1ToString(); if (termText.compareTo(uid) < 0) { if (deleting) { // delete stale docs System.out.println("deleting " + @@ -165,7 +165,7 @@ } } if (text != null && - text.utf8ToString().compareTo(uid) == 0) { + text.bocu1ToString().compareTo(uid) == 0) { uidIter.next(); // keep 
matching docs } else if (!deleting) { // add new docs Document doc = HTMLDocument.Document(file); Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (revision 966583) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (working copy) @@ -426,7 +426,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUnicodeComparator(); + return BytesRef.getBOCU1SortedAsUnicodeComparator(); } }; } Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java (revision 966583) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java (working copy) @@ -234,7 +234,7 @@ TermsEnum termsEnum = fieldsEnum.terms(); BytesRef text; while((text = termsEnum.next()) != null) { - String termText = text.utf8ToString(); + String termText = text.bocu1ToString(); InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText); getTermsByFieldAndText().get(field).put(termText, instantiatedTerm); instantiatedTerm.setTermIndex(terms.size()); @@ -290,7 +290,7 @@ TermPositionVector termPositionVector = (TermPositionVector) sourceIndexReader.getTermFreqVector(document.getDocumentNumber(), field.name()); if (termPositionVector != null) { for (int i = 0; i < termPositionVector.getTerms().length; i++) { - String token = termPositionVector.getTerms()[i].utf8ToString(); + String token = termPositionVector.getTerms()[i].bocu1ToString(); InstantiatedTerm term = findTerm(field.name(), token); InstantiatedTermDocumentInformation 
termDocumentInformation = term.getAssociatedDocument(document.getDocumentNumber()); termDocumentInformation.setTermOffsets(termPositionVector.getOffsets(i)); Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (revision 966583) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (working copy) @@ -123,7 +123,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUnicodeComparator(); + return BytesRef.getBOCU1SortedAsUnicodeComparator(); } } Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java =================================================================== --- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java (revision 966583) +++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java (working copy) @@ -95,7 +95,7 @@ while (termsEnum.next() != null) { int df = termsEnum.docFreq(); if (df(); } for (int tp = 0; tp < offsets.length; tp++) { - Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), offsets[tp] + Token token = new Token(terms[t].bocu1ToString(), offsets[tp].getStartOffset(), offsets[tp] .getEndOffset()); unsortedTokens.add(token); } @@ -221,7 +221,7 @@ // tokens stored with positions - can use this to index straight into // sorted array for (int tp = 0; tp < pos.length; tp++) { - Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), + Token token = new Token(terms[t].bocu1ToString(), offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); tokensInOriginalOrder[pos[tp]] = token; } Index: 
lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (revision 966583) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (working copy) @@ -82,14 +82,14 @@ if( termSet == null ) return; for( BytesRef term : tpv.getTerms() ){ - if( !termSet.contains( term.utf8ToString() ) ) continue; + if( !termSet.contains( term.bocu1ToString() ) ) continue; int index = tpv.indexOf( term ); TermVectorOffsetInfo[] tvois = tpv.getOffsets( index ); if( tvois == null ) return; // just return to make null snippets int[] poss = tpv.getTermPositions( index ); if( poss == null ) return; // just return to make null snippets for( int i = 0; i < tvois.length; i++ ) - termList.add( new TermInfo( term.utf8ToString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); + termList.add( new TermInfo( term.bocu1ToString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); } // sort by position Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java (revision 966583) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java (working copy) @@ -74,7 +74,7 @@ return null; } - String result = pendingTerm.utf8ToString(); + String result = pendingTerm.bocu1ToString(); try { pendingTerm = termsEnum.next(); Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java =================================================================== --- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 966583) +++ 
lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy) @@ -807,7 +807,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUnicodeComparator(); + return BytesRef.getBOCU1SortedAsUnicodeComparator(); } @Override @@ -901,7 +901,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUnicodeComparator(); + return BytesRef.getBOCU1SortedAsUnicodeComparator(); } } Index: lucene/contrib/lucli/src/java/lucli/LuceneMethods.java =================================================================== --- lucene/contrib/lucli/src/java/lucli/LuceneMethods.java (revision 966583) +++ lucene/contrib/lucli/src/java/lucli/LuceneMethods.java (working copy) @@ -356,7 +356,7 @@ //message(term.field() + ":" + term.text() + " freq:" + terms.docFreq()); //if we're either not looking by field or we're matching the specific field if ((field == null) || field.equals(curField)) { - termMap.put(curField + ":" + text.utf8ToString(), Integer.valueOf((terms.docFreq()))); + termMap.put(curField + ":" + text.bocu1ToString(), Integer.valueOf((terms.docFreq()))); } } } Index: lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java =================================================================== --- lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (revision 966583) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (working copy) @@ -852,7 +852,7 @@ BytesRef[] terms = vector.getTerms(); int freqs[]=vector.getTermFrequencies(); for (int j = 0; j < terms.length; j++) { - String term = terms[j].utf8ToString(); + String term = terms[j].bocu1ToString(); if(isNoiseWord(term)){ continue; Index: lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java =================================================================== --- 
lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java (revision 966583) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java (working copy) @@ -55,7 +55,7 @@ if (term.startsWith(prefixRef)) { // TODO: set BoostAttr based on distance of // searchTerm.text() and term().text() - String text = term.utf8ToString(); + String text = term.bocu1ToString(); return regexImpl.match(text) ? AcceptStatus.YES : AcceptStatus.NO; } else { return AcceptStatus.NO;