diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 3262887..a24b793 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -213,6 +213,7 @@ import org.apache.hadoop.hive.shims.HadoopShims; import org.apache.hadoop.hive.shims.Utils; import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.OutputFormat; import org.apache.hadoop.security.UserGroupInformation; @@ -730,6 +731,15 @@ private String processTable(QB qb, ASTNode tabref) throws SemanticException { } /** + * Convert a string to Text format and write its bytes in the same way TextOutputFormat would do. + * This is needed to properly encode non-ascii characters. + */ + private static void writeAsText(String text, FSDataOutputStream out) throws IOException { + Text to = new Text(text); + out.write(to.getBytes(), 0, to.getLength()); + } + + /** * Generate a temp table out of a value clause * See also {@link #preProcessForInsert(ASTNode, QB)} */ @@ -807,10 +817,10 @@ private ASTNode genValuesTempTable(ASTNode originalFrom, QB qb) throws SemanticE fields.add(new FieldSchema("tmp_values_col" + nextColNum++, "string", "")); } if (isFirst) isFirst = false; - else out.writeBytes("\u0001"); - out.writeBytes(unparseExprForValuesClause(value)); + else writeAsText("\u0001", out); + writeAsText(unparseExprForValuesClause(value), out); } - out.writeBytes("\n"); + writeAsText("\n", out); firstRow = false; } out.close(); diff --git ql/src/test/queries/clientpositive/insert_values_nonascii.q ql/src/test/queries/clientpositive/insert_values_nonascii.q new file mode 100644 index 0000000..2e4ef41 --- /dev/null +++ ql/src/test/queries/clientpositive/insert_values_nonascii.q @@ -0,0 +1,9 @@ +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.enforce.bucketing=true; + +create table insert_values_nonascii(t1 char(32), t2 string); + +insert into insert_values_nonascii values("Абвгде Garçu 谢谢", "Kôkaku ありがとう"), ("ございます", "kidôtai한국어"); + +select * from insert_values_nonascii; diff --git ql/src/test/results/clientpositive/insert_values_nonascii.q.out ql/src/test/results/clientpositive/insert_values_nonascii.q.out new file mode 100644 index 0000000..ca07bef --- /dev/null +++ ql/src/test/results/clientpositive/insert_values_nonascii.q.out @@ -0,0 +1,28 @@ +PREHOOK: query: create table insert_values_nonascii(t1 char(32), t2 string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@insert_values_nonascii +POSTHOOK: query: create table insert_values_nonascii(t1 char(32), t2 string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@insert_values_nonascii +PREHOOK: query: insert into insert_values_nonascii values("Абвгде Garçu 谢谢", "Kôkaku ありがとう"), ("ございます", "kidôtai한국어") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@insert_values_nonascii +POSTHOOK: query: insert into insert_values_nonascii values("Абвгде Garçu 谢谢", "Kôkaku ありがとう"), ("ございます", "kidôtai한국어") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@insert_values_nonascii +POSTHOOK: Lineage: insert_values_nonascii.t1 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: insert_values_nonascii.t2 SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: select * from insert_values_nonascii +PREHOOK: type: QUERY +PREHOOK: Input: default@insert_values_nonascii +#### A masked pattern was here #### +POSTHOOK: query: select * from insert_values_nonascii +POSTHOOK: type: QUERY +POSTHOOK: Input: default@insert_values_nonascii +#### A masked pattern was here #### +Абвгде Garçu 谢谢 Kôkaku ありがとう +ございます kidôtai한국어