Uploaded image for project: 'Apache Arrow'
  1. Apache Arrow
  2. ARROW-6174

[C++] Validate chunks in ChunkedArray::Validate

    XML | Word | Printable | JSON

Details

    Description

      If I patch Table::Validate() to also validate each column, and ChunkedArray::Validate() to also validate each underlying chunk:

      diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
      index 446010f93..e617470b5 100644
      --- a/cpp/src/arrow/table.cc
      +++ b/cpp/src/arrow/table.cc
      @@ -21,6 +21,7 @@
       #include <cstdlib>
       #include <limits>
       #include <memory>
      +#include <sstream>
       #include <utility>
       
       #include "arrow/array.h"
      @@ -184,10 +185,18 @@ Status ChunkedArray::Validate() const {
         }
       
         const auto& type = *chunks_[0]->type();
      +  // Make sure chunks all have the same type, and validate them
         for (size_t i = 1; i < chunks_.size(); ++i) {
      -    if (!chunks_[i]->type()->Equals(type)) {
      +    const Array& chunk = *chunks_[i];
      +    if (!chunk.type()->Equals(type)) {
             return Status::Invalid("In chunk ", i, " expected type ", type.ToString(),
      -                             " but saw ", chunks_[i]->type()->ToString());
      +                             " but saw ", chunk.type()->ToString());
      +    }
      +    Status st = ValidateArray(chunk);
      +    if (!st.ok()) {
      +      std::stringstream ss;
      +      ss << "Chunk " << i << ": " << st.message();
      +      return st.WithMessage(ss.str());
           }
         }
         return Status::OK();
      @@ -343,7 +352,7 @@ class SimpleTable : public Table {
             }
           }
       
      -    // Make sure columns are all the same length
      +    // Make sure columns are all the same length, and validate them
           for (int i = 0; i < num_columns(); ++i) {
             const ChunkedArray* col = columns_[i].get();
             if (col->length() != num_rows_) {
      @@ -351,6 +360,12 @@ class SimpleTable : public Table {
                                      " expected length ", num_rows_, " but got length ",
                                      col->length());
             }
      +      Status st = col->Validate();
      +      if (!st.ok()) {
      +        std::stringstream ss;
      +        ss << "Column " << i << ": " << st.message();
      +        return st.WithMessage(ss.str());
      +      }
           }
           return Status::OK();
         }
      

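      ... then parquet-arrow-test first fails with an "Invalid" validation error (the list offsets do not match the values length) and then crashes with a segmentation fault: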

      [...]
      [ RUN      ] TestArrowReadWrite.TableWithChunkedColumns
      ../src/parquet/arrow/arrow-reader-writer-test.cc:347: Failure
      Failed
      'WriteTable(*table, ::arrow::default_memory_pool(), sink, row_group_size, default_writer_properties(), arrow_properties)' failed with Invalid: Column 0: Chunk 1: Final offset invariant not equal to values length: 210!=733
      In ../src/arrow/array.cc, line 1229, code: ValidateListArray(array)
      In ../src/parquet/arrow/writer.cc, line 1210, code: table.Validate()
      In ../src/parquet/arrow/writer.cc, line 1252, code: writer->WriteTable(table, chunk_size)
      ../src/parquet/arrow/arrow-reader-writer-test.cc:419: Failure
      Expected: WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer) doesn't generate new fatal failures in the current thread.
        Actual: it does.
      /home/antoine/arrow/dev/cpp/build-support/run-test.sh : ligne 97 : 28927 Erreur de segmentation  $TEST_EXECUTABLE "$@" 2>&1
           28930 Fini                    | $ROOT/build-support/asan_symbolize.py
           28933 Fini                    | ${CXXFILT:-c++filt}
           28936 Fini                    | $ROOT/build-support/stacktrace_addr2line.pl $TEST_EXECUTABLE
           28939 Fini                    | $pipe_cmd 2>&1
           28941 Fini                    | tee $LOGFILE
      ~/arrow/dev/cpp/build-test/src/parquet
      
      

      Attachments

        Issue Links

          Activity

            People

              wesm Wes McKinney
              apitrou Antoine Pitrou
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Time Tracking

                  Estimated:
                  Original Estimate - Not Specified
                  Not Specified
                  Remaining:
                  Remaining Estimate - 0h
                  0h
                  Logged:
                  Time Spent - 0.5h
                  0.5h