Details
-
Bug
-
Status: Resolved
-
Critical
-
Resolution: Fixed
-
None
Description
There is a bug in the double column statistics computation when writing an array that contains only NaNs and nulls to Parquet: the writer loops endlessly if the last cell of a write group is a null. The line in error is https://github.com/apache/arrow/blob/master/cpp/src/parquet/statistics.cc#L633, which checks for NaN but not for null. The code then falls through and loops endlessly, causing the program to appear frozen.
This code snippet reproduces the issue:
// Reproducer for the hang described above: builds a one-column float64 table
// whose only values are a NaN followed by a null, then writes it to a Parquet
// file without disabling column statistics.
// NOTE(review): the results of the Arrow/Parquet calls below are not checked;
// presumably they return a status object -- confirm and assert on them if so.
TEST(parquet, nans) {
  /* Create a small parquet structure */
  std::vector<std::shared_ptr<::arrow::Field>> fields;
  fields.push_back(::arrow::field("doubles", ::arrow::float64()));
  std::shared_ptr<::arrow::Schema> schema = ::arrow::schema(std::move(fields));

  std::unique_ptr<::arrow::RecordBatchBuilder> builder;
  ::arrow::RecordBatchBuilder::Make(schema, ::arrow::default_memory_pool(), &builder);

  // Column content is exactly [NaN, null]; the null is the last cell written.
  builder->GetFieldAs<::arrow::DoubleBuilder>(0)->Append(
      std::numeric_limits<double>::quiet_NaN());
  builder->GetFieldAs<::arrow::DoubleBuilder>(0)->AppendNull();

  std::shared_ptr<::arrow::RecordBatch> batch;
  builder->Flush(&batch);
  arrow::PrettyPrint(*batch, 0, &std::cout);

  std::shared_ptr<arrow::Table> table;
  arrow::Table::FromRecordBatches({batch}, &table);

  /* Attempt to write */
  std::shared_ptr<::arrow::io::FileOutputStream> os;
  arrow::io::FileOutputStream::Open("/tmp/test.parquet", &os);

  parquet::WriterProperties::Builder writer_props_bld;
  // Deliberately left commented out, as in the original report; presumably
  // enabling it avoids the statistics code path -- TODO confirm.
  // writer_props_bld.disable_statistics("doubles");
  std::shared_ptr<parquet::WriterProperties> writer_props = writer_props_bld.build();
  std::shared_ptr<parquet::ArrowWriterProperties> arrow_props =
      parquet::ArrowWriterProperties::Builder().store_schema()->build();

  std::unique_ptr<parquet::arrow::FileWriter> writer;
  parquet::arrow::FileWriter::Open(*table->schema(), arrow::default_memory_pool(), os,
                                   writer_props, arrow_props, &writer);
  writer->WriteTable(*table, 1024);
}
Attachments
Issue Links
- links to