Details
-
Improvement
-
Status: Open
-
Minor
-
Resolution: Unresolved
-
Impala 2.2
-
None
Description
When writing to a text file,
HdfsTextTableWriter::AppendRowBatch
consumes 72.9% of CPU.
HdfsTextTableWriter::AppendRowBatch
would benefit a lot from codegen.
The following loop can be unrolled:
for (int j = 0; j < num_non_partition_cols; ++j) { void* value = output_expr_ctxs_[j]->GetValue(current_row); if (value != NULL) {
Codegen
ExprContext::GetValue
to avoid the expensive case statements
switch (e->type_.type) { case TYPE_BOOLEAN: { impala_udf::BooleanVal v = e->GetBooleanVal(this, row); if (v.is_null) return NULL; result_.bool_val = v.val; return &result_.bool_val; } case TYPE_TINYINT: { impala_udf::TinyIntVal v = e->GetTinyIntVal(this, row); if (v.is_null) return NULL; result_.tinyint_val = v.val; return &result_.tinyint_val; } case TYPE_SMALLINT: { impala_udf::SmallIntVal v = e->GetSmallIntVal(this, row); if (v.is_null) return NULL; result_.smallint_val = v.val; return &result_.smallint_val; }
Ditto for
RawValue::PrintValue
as it goes through
const StringValue* string_val = NULL; switch (type.type) { case TYPE_BOOLEAN: { bool val = *reinterpret_cast<const bool*>(value); *stream << (val ? "true" : "false"); return; } case TYPE_TINYINT: // Extra casting for chars since they should not be interpreted as ASCII. *stream << static_cast<int>(*reinterpret_cast<const int8_t*>(value)); break; case TYPE_SMALLINT: *stream << *reinterpret_cast<const int16_t*>(value); break; case TYPE_INT: *stream << *reinterpret_cast<const int32_t*>(value); break; case TYPE_BIGINT:
Also
HdfsTextTableWriter::PrintEscaped
consumes half of the CPU consumed by AppendRowBatch; we should experiment with unrolling the loop below:
inline void HdfsTextTableWriter::PrintEscaped(const StringValue* str_val) { for (int i = 0; i < str_val->len; ++i) { if (UNLIKELY(str_val->ptr[i] == field_delim_ || str_val->ptr[i] == escape_char_)) { rowbatch_stringstream_ << escape_char_; } rowbatch_stringstream_ << str_val->ptr[i]; } }
Destination table used
+--------------------------------------------------------------------------------+ | result | +--------------------------------------------------------------------------------+ | CREATE TABLE tpch_parquet.lineitem_text ( | | l_orderkey BIGINT, | | l_partkey BIGINT, | | l_suppkey BIGINT, | | l_linenumber INT, | | l_quantity DECIMAL(12,2), | | l_extendedprice DECIMAL(12,2), | | l_discount DECIMAL(12,2), | | l_tax DECIMAL(12,2), | | l_returnflag STRING, | | l_linestatus STRING, | | l_shipdate STRING, | | l_commitdate STRING, | | l_receiptdate STRING, | | l_shipinstruct STRING, | | l_shipmode STRING, | | l_comment STRING | | ) | | STORED AS TEXTFILE | | LOCATION 'hdfs://localhost:20500/test-warehouse/tpch_parquet.db/lineitem_text' | | TBLPROPERTIES ('transient_lastDdlTime'='1450464194') | +--------------------------------------------------------------------------------+
Query used
insert into lineitem_text select * from tpch_parquet.lineitem;