Details
-
Bug
-
Status: Resolved
-
Blocker
-
Resolution: Fixed
-
None
-
ghx-label-2
Description
idx isn't advanced when we skip a duplicate or stale update for a fragment instance. As a result, idx no longer lines up with the instance being processed, and we may end up passing the wrong profile to instance_stats->Update() for the instances that follow in the same report. This may lead to random crashes in Coordinator::BackendState::InstanceStats::Update.
int idx = 0;
const bool has_profile = thrift_profiles.profile_trees.size() > 0;
TRuntimeProfileTree empty_profile;
for (const FragmentInstanceExecStatusPB& instance_exec_status :
     backend_exec_status.instance_exec_status()) {
  int64_t report_seq_no = instance_exec_status.report_seq_no();
  int instance_idx = GetInstanceIdx(instance_exec_status.fragment_instance_id());
  DCHECK_EQ(instance_stats_map_.count(instance_idx), 1);
  InstanceStats* instance_stats = instance_stats_map_[instance_idx];
  int64_t last_report_seq_no = instance_stats->last_report_seq_no_;
  DCHECK(instance_stats->exec_params_.instance_id ==
      ProtoToQueryId(instance_exec_status.fragment_instance_id()));
  // Ignore duplicate or out-of-order messages.
  if (report_seq_no <= last_report_seq_no) {
    VLOG_QUERY << Substitute("Ignoring stale update for query instance $0 with "
        "seq no $1", PrintId(instance_stats->exec_params_.instance_id), report_seq_no);
    continue; <<--- // XXX bad
  }
  DCHECK(!instance_stats->done_);
  DCHECK(!has_profile || idx < thrift_profiles.profile_trees.size());
  const TRuntimeProfileTree& profile =
      has_profile ? thrift_profiles.profile_trees[idx++] : empty_profile;
  instance_stats->Update(instance_exec_status, profile, exec_summary,
      scan_range_progress);