Uploaded image for project: 'Apache Arrow'
  1. Apache Arrow
  2. ARROW-15971

[C++/Java] Error when reading inner lists within a struct in empty outer lists from C++/Python in Java

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Open
    • Major
    • Resolution: Unresolved
    • 7.0.0
    • None
    • C++, Java, Python

    Description

      When using C++ (or Python) to construct a null or empty outer array of type array_1: list<item: struct<array_sub_col: list<item: string>>>, either:

      -  array_1: null
      -  array_1: []
      

      an out-of-bounds exception (see stack trace below) is thrown when later retrieving the field reader for the inner list (array_sub_col) in Java, while it tries to read the subsequent offset from the offset buffer: https://github.com/apache/arrow/blob/master/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java#L64

      Reproduction

      Java: 7.0.0
      C++: 7.0.0
      Python: 7.0.0

      Creating a stream in C++ with a column of type array_1: list<item: struct<array_sub_col: list<item: string>>> that holds an empty (or null) outer list:

      arrow::MemoryPool* pool = arrow::default_memory_pool();
      arrow::Result<std::shared_ptr<arrow::io::BufferOutputStream>> stream_buffer =
          arrow::io::BufferOutputStream::Create(1, pool);
      
      std::vector<std::shared_ptr<arrow::Field>> inner_list_field{std::make_shared<arrow::Field>("array_sub_col", arrow::list(arrow::utf8()))};
      
      // Datatype for the builder: list<struct<list<string>>>
      std::shared_ptr<DataType> data_type = list(struct_(inner_list_field));
      
      std::unique_ptr<arrow::ArrayBuilder> builder;
      arrow::MakeBuilder(pool, data_type, &builder);
      auto* list_builder = dynamic_cast<arrow::ListBuilder*>(builder.get());
      
      // Append a null or an empty list to the outer list
      list_builder->AppendNull(); // or list_builder->AppendEmptyValue()
      
      std::vector<std::shared_ptr<arrow::Array>> value_batch;
      value_batch.resize(1);
      list_builder->Finish(&value_batch[0]);
      
      std::vector<std::shared_ptr<arrow::Field>> outer_list_field{std::make_shared<arrow::Field>("array_1",data_type)};
      auto schema = std::make_shared<arrow::Schema>(outer_list_field);
      
      // Build a single row record batch
      std::shared_ptr<arrow::RecordBatch> batch = RecordBatch::Make(schema, 1, value_batch);
      ASSERT_OK(batch->Validate());
      
      // Stream the batch to a file to later read on the Java side
      arrow::Result<std::shared_ptr<ipc::RecordBatchWriter>> stream_writer = 
          arrow::ipc::MakeStreamWriter(stream_buffer.ValueOrDie().get(), schema, arrow::ipc::IpcWriteOptions::Defaults());
      stream_writer.ValueOrDie()->WriteRecordBatch(*batch);
      
      arrow::Result<std::shared_ptr<arrow::Buffer>> buffer_result = stream_buffer.ValueOrDie()->Finish();
      std::shared_ptr<arrow::Buffer> buffer = buffer_result.ValueOrDie();
      auto file_output = arrow::io::FileOutputStream::Open("/tmp/batch_stream.out").ValueOrDie();
      file_output->Write(buffer->data(), buffer->size());
      file_output->Close();
      

      As expected, Python holds the same memory layout for the field vectors as the C++ code above:

      # Build a one-element array whose only value is a null outer list of type
      # list<struct<array_sub_col: list<string>>>.
      array = pa.array([None], type=pa.list_(pa.struct([pa.field("array_sub_col", pa.list_(pa.utf8()))])))
      # Wrap the array built above as the single column "array_1".
      batch = pa.record_batch([array], names=["array_1"])
      
      # Serialize the batch as an IPC stream into an in-memory buffer.
      sink = pa.BufferOutputStream()
      with pa.ipc.new_stream(sink, batch.schema) as writer:
          writer.write_batch(batch)
      buf = sink.getvalue()
      
      # Persist the stream so the Java reader can load it.
      with open('/tmp/batch_stream.out', 'wb') as f:
          f.write(buf)
      

      Java then fails when trying to access the inner list's field reader:

      // Read the stream written by the C++/Python side and request the field reader of
      // the inner list. `allocator` is assumed to be a BufferAllocator created by the caller.
      File file = new File("/tmp/batch_stream.out");
      byte[] bytes = FileUtils.readFileToByteArray(file);
      try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(bytes), allocator)) {
          // The reader's root holds the vectors that loadNextBatch() fills in.
          VectorSchemaRoot readBatch = reader.getVectorSchemaRoot();
          Schema schema = readBatch.getSchema();
          reader.loadNextBatch();
          readBatch.getVector("array_1").getReader().reader().reader("array_sub_col");                     // <- fails: reader("array_sub_col") fails with OOB

          // Concrete readers:
          // FieldVector array_1 = readBatch.getVector("array_1");
          // UnionListReader array_1_reader = (UnionListReader) array_1.getReader();
          // NullableStructReaderImpl struct_reader = (NullableStructReaderImpl) array_1_reader.reader();
          // FieldReader union_list_reader = struct_reader.reader("array_sub_col");                        // <- fails: OOB
      }
      

      Stack trace:

      java.lang.reflect.InvocationTargetException
          at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
          at jdk.internal.reflect.NativeMethodAccessorImpl.invoke (NativeMethodAccessorImpl.java:62)
          at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke (DelegatingMethodAccessorImpl.java:43)
          at java.lang.reflect.Method.invoke (Method.java:566)
          at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
          at java.lang.Thread.run (Thread.java:829)
      Caused by: java.lang.IndexOutOfBoundsException: index: 4, length: 4 (expected: range(0, 4))
          at org.apache.arrow.memory.ArrowBuf.checkIndexD (ArrowBuf.java:318)
          at org.apache.arrow.memory.ArrowBuf.chk (ArrowBuf.java:305)
          at org.apache.arrow.memory.ArrowBuf.getInt (ArrowBuf.java:424)
          at com.test.arrow.ValidateArrow.testArrow (ValidateArrow.java:433)
          at com.test.arrow.ValidateArrow.main (ValidateArrow.java:440)
          at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
          at jdk.internal.reflect.NativeMethodAccessorImpl.invoke (NativeMethodAccessorImpl.java:62)
          at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke (DelegatingMethodAccessorImpl.java:43)
          at java.lang.reflect.Method.invoke (Method.java:566)
          at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
          at java.lang.Thread.run (Thread.java:829)
      

      Attachments

        Activity

          People

            Unassigned Unassigned
            ArrowUser Arrow User
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated: