Index: solr/src/java/org/apache/solr/search/QueryParsing.java =================================================================== --- solr/src/java/org/apache/solr/search/QueryParsing.java (revision 1075413) +++ solr/src/java/org/apache/solr/search/QueryParsing.java (working copy) @@ -398,8 +398,8 @@ String fname = q.getField(); FieldType ft = writeFieldName(fname, schema, out, flags); out.append(q.includesLower() ? '[' : '{'); - String lt = q.getLowerTerm().utf8ToString(); - String ut = q.getUpperTerm().utf8ToString(); + String lt = q.getLowerTerm().toString(); + String ut = q.getUpperTerm().toString(); if (lt == null) { out.append('*'); } else { Index: solr/src/webapp/web/admin/analysis.jsp =================================================================== --- solr/src/webapp/web/admin/analysis.jsp (revision 1075413) +++ solr/src/webapp/web/admin/analysis.jsp (working copy) @@ -433,7 +433,7 @@ printRow(out, "raw bytes", TermToBytesRefAttribute.class.getName(), arr, new TokToStr() { public String toStr(Tok t) { - return t.bytes.toString(); + return t.bytes.bytesToString(); } },true,verbose,match); Index: lucene/src/java/org/apache/lucene/search/TermRangeQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TermRangeQuery.java (revision 1075413) +++ lucene/src/java/org/apache/lucene/search/TermRangeQuery.java (working copy) @@ -122,9 +122,9 @@ } buffer.append(includeLower ? '[' : '{'); // TODO: all these toStrings for queries should just output the bytes, it might not be UTF-8! - buffer.append(lowerTerm != null ? ("*".equals(lowerTerm.utf8ToString()) ? "\\*" : lowerTerm.utf8ToString()) : "*"); + buffer.append(lowerTerm != null ? ("*".equals(lowerTerm.toString()) ? "\\*" : lowerTerm.toString()) : "*"); buffer.append(" TO "); - buffer.append(upperTerm != null ? ("*".equals(upperTerm.utf8ToString()) ? "\\*" : upperTerm.utf8ToString()) : "*"); + buffer.append(upperTerm != null ? ("*".equals(upperTerm.toString()) ? "\\*" : upperTerm.toString()) : "*"); buffer.append(includeUpper ? ']' : '}'); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); Index: lucene/src/java/org/apache/lucene/search/QueryTermVector.java =================================================================== --- lucene/src/java/org/apache/lucene/search/QueryTermVector.java (revision 1075413) +++ lucene/src/java/org/apache/lucene/search/QueryTermVector.java (working copy) @@ -116,7 +116,7 @@ sb.append('{'); for (int i=0; i0) sb.append(", "); - sb.append(terms[i].utf8ToString()).append('/').append(termFreqs[i]); + sb.append(terms[i].toString()).append('/').append(termFreqs[i]); } sb.append('}'); return sb.toString(); Index: lucene/src/java/org/apache/lucene/index/Term.java =================================================================== --- lucene/src/java/org/apache/lucene/index/Term.java (revision 1075413) +++ lucene/src/java/org/apache/lucene/index/Term.java (working copy) @@ -198,5 +198,5 @@ } @Override - public final String toString() { return field + ":" + bytes.utf8ToString(); } + public final String toString() { return field + ":" + bytes; } } Index: lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java (revision 1075413) +++ lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java (working copy) @@ -294,7 +294,7 @@ } else if (scratch.startsWith(PAYLOAD)) { // skip } else { - assert scratch.startsWith(TERM) || scratch.startsWith(FIELD) || scratch.startsWith(END): "scratch=" + scratch.utf8ToString(); + assert scratch.startsWith(TERM) || scratch.startsWith(FIELD) || scratch.startsWith(END): "scratch=" + scratch; if (!first && (skipDocs == null || !skipDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { @@ -398,7 +398,7 @@ @Override public int nextPosition() throws IOException { readLine(in, scratch); - assert scratch.startsWith(POS): "got line=" + scratch.utf8ToString(); + assert scratch.startsWith(POS): "got line=" + scratch; UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2); final int pos = ArrayUtil.parseInt(scratchUTF16_2.result, 0, scratchUTF16_2.length); final long fp = in.getFilePointer(); Index: lucene/src/java/org/apache/lucene/util/automaton/fst/ByteSequenceOutputs.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/fst/ByteSequenceOutputs.java (revision 1075413) +++ lucene/src/java/org/apache/lucene/util/automaton/fst/ByteSequenceOutputs.java (working copy) @@ -132,6 +132,6 @@ @Override public String outputToString(BytesRef output) { - return output.utf8ToString(); + return output.toString(); } } Index: lucene/src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 1075413) +++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -22,6 +22,7 @@ import java.io.ObjectInput; import java.io.ObjectOutput; import java.io.IOException; +import java.nio.charset.CharacterCodingException; /** Represents byte[], as a slice (offset + length) into an * existing byte[]. @@ -208,8 +209,7 @@ } /** Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] */ - @Override - public String toString() { + public String bytesToString() { StringBuilder sb = new StringBuilder(); sb.append('['); final int end = offset + length; @@ -222,6 +222,17 @@ sb.append(']'); return sb.toString(); } + + @Override + public String toString() { + try { + char utf16[] = new char[length]; + int len = UnicodeUtil.UTF8toUTF16withCheck(bytes, offset, length, utf16); + return new String(utf16, 0, len); + } catch (CharacterCodingException ex) { + return bytesToString(); + } + } public void copy(BytesRef other) { if (bytes.length < other.length) { Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 1075413) +++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -1,5 +1,7 @@ package org.apache.lucene.util; +import java.nio.charset.CharacterCodingException; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -85,6 +87,32 @@ * copyright holder. */ +/* + * Additional code from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + */ + +/* + * Copyright (c) 2008-2009 Bjoern Hoehrmann + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + /** * Class to encode java's UTF16 char[] into UTF8 byte[] * without always allocating a new byte[] as @@ -705,4 +733,85 @@ } return sb.toString(); } + + /* + * DFA decoder from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * + * Probably the only UTF-8 decoder you can trust to reliably reject + * invalid UTF-8 + */ + private static final int UTF8_ACCEPT = 0; + + private static final byte utf8d[] = { + // The first part of the table maps bytes to character classes that + // to reduce the size of the transition table and create bitmasks. + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, + }; + + /** + * Decodes from UTF-8 encoding. + * If the bytes are invalid UTF-8, a CharacterCodingException is always thrown. + * + * @param bytes byte array + * @param offset offset into byte array + * @param length valid length of byte array + * @param s character array: should be preallocated to length: worst case 1:1 + * @return valid length of character array + * @throws CharacterCodingException on malformed input (illegal byte sequences) + */ + public static int UTF8toUTF16withCheck(byte bytes[], int offset, int length, char s[]) + throws CharacterCodingException { + int state = UTF8_ACCEPT; + int end = offset + length; + int out = 0; + int cp = 0; + + while (offset < end) { + final int b = bytes[offset++] & 0xff; + + if (state != UTF8_ACCEPT) { + // non-starter, or following illegal byte sequence + final int type = utf8d[b]; + cp = (cp << 6) | (b & 63); + state = utf8d[256 + state + type]; + if (state == UTF8_ACCEPT) // final byte + if (cp <= 0xffff) { // bmp + s[out++] = (char) cp; + } else { // break into surrogates + s[out++] = (char) (0xd7c0 + (cp >> 10)); + s[out++] = (char) (0xdc00 + (cp & 0x3ff)); + } + } else if (b > 0x7f) { + // starter + final int type = utf8d[b]; + cp = b & (255 >> type); + state = utf8d[256 + state + type]; + } else { + // ASCII fast-path + s[out++] = (char) b; + } + } + + if (state != UTF8_ACCEPT) { + // we ended in a non-accept state: illegal + throw new CharacterCodingException(); + } + + return out; + } } Index: lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java =================================================================== --- lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (revision 1075413) +++ lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (working copy) @@ -392,7 +392,7 @@ while((aprioriText = aprioriTermEnum.next()) != null) { assertEquals(aprioriText, testTermEnum.next()); if (VERBOSE) { - System.out.println("TEST: verify term=" + aprioriText.utf8ToString()); + System.out.println("TEST: verify term=" + aprioriText.toString()); } assertTrue(aprioriTermEnum.docFreq() == testTermEnum.docFreq()); Index: lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java =================================================================== --- lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java (revision 1075413) +++ lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java (working copy) @@ -53,7 +53,7 @@ Terms terms =MultiFields.getTerms(reader, field); long totalTF = HighFreqTerms.getTotalTermFreq(reader, field, termtext); System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n", - field, termtext.utf8ToString(), totalTF, terms.docFreq(termtext)); + field, termtext.toString(), totalTF, terms.docFreq(termtext)); } private static void usage() { Index: lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java =================================================================== --- lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java (revision 1075413) +++ lucene/contrib/misc/src/java/org/apache/lucene/misc/TermStats.java (working copy) @@ -39,11 +39,11 @@ } String getTermText() { - return termtext.utf8ToString(); + return termtext.toString(); } @Override public String toString() { - return("TermStats: term=" + termtext.utf8ToString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq); + return("TermStats: term=" + termtext.toString() + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq); } } \ No newline at end of file Index: lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java =================================================================== --- lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java (revision 1075413) +++ lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java (working copy) @@ -80,14 +80,14 @@ //default HighFreqTerms behavior for (int i = 0; i < terms.length; i++) { System.out.printf("%s:%s %,d \n", - terms[i].field, terms[i].termtext.utf8ToString(), terms[i].docFreq); + terms[i].field, terms[i].termtext.toString(), terms[i].docFreq); } } else{ TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms); for (int i = 0; i < termsWithTF.length; i++) { System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n", - termsWithTF[i].field, termsWithTF[i].termtext.utf8ToString(), + termsWithTF[i].field, termsWithTF[i].termtext.toString(), termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq); } } Index: lucene/contrib/lucli/src/java/lucli/LuceneMethods.java =================================================================== --- lucene/contrib/lucli/src/java/lucli/LuceneMethods.java (revision 1075413) +++ lucene/contrib/lucli/src/java/lucli/LuceneMethods.java (working copy) @@ -356,7 +356,7 @@ //message(term.field() + ":" + term.text() + " freq:" + terms.docFreq()); //if we're either not looking by field or we're matching the specific field if ((field == null) || field.equals(curField)) { - termMap.put(curField + ":" + text.utf8ToString(), Integer.valueOf((terms.docFreq()))); + termMap.put(curField + ":" + text.toString(), Integer.valueOf((terms.docFreq()))); } } }