Uploaded image for project: 'IMPALA'
  1. IMPALA
  2. IMPALA-2785

Hdfs Text Table Writer : codegen AppendRowBatch

    XML | Word | Printable | JSON

    Details

    • Type: Improvement
    • Status: Open
    • Priority: Minor
    • Resolution: Unresolved
    • Affects Version/s: Impala 2.2
    • Fix Version/s: None
    • Component/s: Backend

      Description

      When writing to a text file,

      HdfsTextTableWriter::AppendRowBatch

      consumes 72.9% of CPU.

      HdfsTextTableWriter::AppendRowBatch

      would benefit a lot from codegen.

      The following loop can be unrolled:

           for (int j = 0; j < num_non_partition_cols; ++j) {
               void* value = output_expr_ctxs_[j]->GetValue(current_row);
               if (value != NULL) {
      

      Codegen

      ExprContext::GetValue

      to avoid the expensive case statements

      switch (e->type_.type) {
          case TYPE_BOOLEAN: {
            impala_udf::BooleanVal v = e->GetBooleanVal(this, row);
            if (v.is_null) return NULL;
            result_.bool_val = v.val;
            return &result_.bool_val;
          }
          case TYPE_TINYINT: {
            impala_udf::TinyIntVal v = e->GetTinyIntVal(this, row);
            if (v.is_null) return NULL;
            result_.tinyint_val = v.val;
            return &result_.tinyint_val;
          }
          case TYPE_SMALLINT: {
            impala_udf::SmallIntVal v = e->GetSmallIntVal(this, row);
            if (v.is_null) return NULL;
            result_.smallint_val = v.val;
            return &result_.smallint_val;
          }
      

      Ditto for

      RawValue::PrintValue

      as it goes through

       const StringValue* string_val = NULL;
        switch (type.type) {
          case TYPE_BOOLEAN: {
            bool val = *reinterpret_cast<const bool*>(value);
            *stream << (val ? "true" : "false");
            return;
          }
          case TYPE_TINYINT:
            // Extra casting for chars since they should not be interpreted as ASCII.
            *stream << static_cast<int>(*reinterpret_cast<const int8_t*>(value));
            break;
          case TYPE_SMALLINT:
            *stream << *reinterpret_cast<const int16_t*>(value);
            break;
          case TYPE_INT:
            *stream << *reinterpret_cast<const int32_t*>(value);
            break;
          case TYPE_BIGINT:
      

      Also

      HdfsTextTableWriter::PrintEscaped

      consumes half of the CPU consumed by AppendRowBatch; we should experiment with unrolling the loop below:

      inline void HdfsTextTableWriter::PrintEscaped(const StringValue* str_val) {
        for (int i = 0; i < str_val->len; ++i) {
          if (UNLIKELY(str_val->ptr[i] == field_delim_ || str_val->ptr[i] == escape_char_)) {
            rowbatch_stringstream_ << escape_char_;
          }
          rowbatch_stringstream_ << str_val->ptr[i];
        }
      }
      

      Destination table used

      +--------------------------------------------------------------------------------+
      | result                                                                         |
      +--------------------------------------------------------------------------------+
      | CREATE TABLE tpch_parquet.lineitem_text (                                      |
      |   l_orderkey BIGINT,                                                           |
      |   l_partkey BIGINT,                                                            |
      |   l_suppkey BIGINT,                                                            |
      |   l_linenumber INT,                                                            |
      |   l_quantity DECIMAL(12,2),                                                    |
      |   l_extendedprice DECIMAL(12,2),                                               |
      |   l_discount DECIMAL(12,2),                                                    |
      |   l_tax DECIMAL(12,2),                                                         |
      |   l_returnflag STRING,                                                         |
      |   l_linestatus STRING,                                                         |
      |   l_shipdate STRING,                                                           |
      |   l_commitdate STRING,                                                         |
      |   l_receiptdate STRING,                                                        |
      |   l_shipinstruct STRING,                                                       |
      |   l_shipmode STRING,                                                           |
      |   l_comment STRING                                                             |
      | )                                                                              |
      | STORED AS TEXTFILE                                                             |
      | LOCATION 'hdfs://localhost:20500/test-warehouse/tpch_parquet.db/lineitem_text' |
      | TBLPROPERTIES ('transient_lastDdlTime'='1450464194')                           |
      +--------------------------------------------------------------------------------+
      

      Query used

       insert into lineitem_text select * from tpch_parquet.lineitem;
      

        Attachments

          Activity

            People

            • Assignee:
              Unassigned
              Reporter:
              mmokhtar Mostafa Mokhtar
            • Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

              • Created:
                Updated: