Details
-
Bug
-
Status: Resolved
-
Major
-
Resolution: Fixed
-
Impala 2.10.0
-
None
-
ghx-label-9
Description
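It appears that this code is over-counting (double counting) into rows_read_counter(): row_group_rows_read_ is already a running total of the rows read so far, yet that whole total, rather than just num_to_commit, is added to the counter on every call: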
HdfsParquetScanner::GetNextInternal()
} else if (scan_node_->IsZeroSlotTableScan()) {
  // There are no materialized slots and we are not optimizing count(*), e.g.
  // "select 1 from alltypes". We can serve this query from just the file metadata.
  // We don't need to read the column data.
  if (row_group_rows_read_ == file_metadata_.num_rows) {
    eos_ = true;
    return Status::OK();
  }
  assemble_rows_timer_.Start();
  DCHECK_LE(row_group_rows_read_, file_metadata_.num_rows);
  int64_t rows_remaining = file_metadata_.num_rows - row_group_rows_read_;
  int max_tuples = min<int64_t>(row_batch->capacity(), rows_remaining);
  TupleRow* current_row = row_batch->GetRow(row_batch->AddRow());
  int num_to_commit = WriteTemplateTuples(current_row, max_tuples);
  Status status = CommitRows(row_batch, num_to_commit);
  assemble_rows_timer_.Stop();
  RETURN_IF_ERROR(status);
  row_group_rows_read_ += num_to_commit;
  COUNTER_ADD(scan_node_->rows_read_counter(), row_group_rows_read_); <======
  return Status::OK();
}
Repro in impala-shell:
set batch_size=16;
set num_nodes=1;
select count(*) from functional.alltypesmixedformat;
profile
....
- RowsRead: 3.94K (3936)
- RowsReturned: 1.20K (1200)