Details
Description
When using C++ (or Python) to construct a null or empty outer array of type array_1: list<item: struct<array_sub_col: list<item: string>>>, either:
- array_1: null
- array_1: []
an out-of-bounds exception (see stack trace below) is thrown in Java when later retrieving the field reader for the inner list (array_sub_col), while trying to access the subsequent offset buffer: https://github.com/apache/arrow/blob/master/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java#L64
Reproduction
Java: 7.0.0
C++: 7.0.0
Python: 7.0.0
Creating a stream in C++ of type array_1: list<item: struct<array_sub_col: list<item: string>>> with an empty (or null) outer list:
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::Result<std::shared_ptr<arrow::io::BufferOutputStream>> stream_buffer = arrow::io::BufferOutputStream::Create(1, pool); std::vector<std::shared_ptr<arrow::Field>> inner_list_field{std::make_shared<arrow::Field>("array_sub_col", arrow::list(arrow::utf8()))}; // Datatype for the builder: list<struct<list<string>>> std::shared_ptr<DataType> data_type = list(struct_(inner_list_field)); std::unique_ptr<arrow::ArrayBuilder> builder; arrow::MakeBuilder(pool, data_type, &builder); auto* list_builder = dynamic_cast<arrow::ListBuilder*>(builder.get()); // Append a null or an empty list to the outer list list_builder->AppendNull(); // or list_builder->AppendEmptyValue() std::vector<std::shared_ptr<arrow::Array>> value_batch; value_batch.resize(1); list_builder->Finish(&value_batch[0]); std::vector<std::shared_ptr<arrow::Field>> outer_list_field{std::make_shared<arrow::Field>("array_1",data_type)}; auto schema = std::make_shared<arrow::Schema>(outer_list_field); // Build a single row record batch std::shared_ptr<arrow::RecordBatch> batch = RecordBatch::Make(schema, 1, value_batch); ASSERT_OK(batch->Validate()); // Stream the batch to a file to later read on the Java side arrow::Result<std::shared_ptr<ipc::RecordBatchWriter>> stream_writer = arrow::ipc::MakeStreamWriter(stream_buffer.ValueOrDie().get(), schema, arrow::ipc::IpcWriteOptions::Defaults()); stream_writer.ValueOrDie()->WriteRecordBatch(*batch); arrow::Result<std::shared_ptr<arrow::Buffer>> buffer_result = stream_buffer.ValueOrDie()->Finish(); std::shared_ptr<arrow::Buffer> buffer = buffer_result.ValueOrDie(); auto file_output = arrow::io::FileOutputStream::Open("/tmp/batch_stream.out").ValueOrDie(); file_output->Write(buffer->data(), buffer->size()); file_output->Close();
As expected, Python holds the same memory layout for the field vectors as the C++ code above:
array = pa.array([None], type=pa.list_(pa.struct([pa.field("array_sub_col", pa.list_(pa.utf8()))]))) batch = pa.record_batch([struct_array], names=["array_1"]) sink = pa.BufferOutputStream() with pa.ipc.new_stream(sink, batch.schema) as writer: writer.write_batch(batch) buf = sink.getvalue() with open('/tmp/batch_stream.out', 'wb') as f: f.write(buf)
Java then fails when trying to access the inner list's field reader:
File file = new File("/tmp/batch_stream.out"); byte[] bytes = FileUtils.readFileToByteArray(file); try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(bytes), allocator)) { Schema schema = reader.getVectorSchemaRoot().getSchema(); reader.loadNextBatch(); readBatch.getVector("array_1").getReader().reader().reader("array_sub_col"); // <- fails: reader("array_sub_col") fails with OOB // Concrete readers: // FieldVector array_1 = readBatch.getVector("array_1"); // UnionListReader array_1_reader = (UnionListReader) array_1.getReader(); // NullableStructReaderImpl struct_reader = (NullableStructReaderImpl) array_1_reader.reader(); // FieldReader union_list_reader = struct_reader.reader("array_sub_col"); // <- fails: OOB
Stack trace:
java.lang.reflect.InvocationTargetException at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method) at jdk.internal.reflect.NativeMethodAccessorImpl.invoke (NativeMethodAccessorImpl.java:62) at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke (DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke (Method.java:566) at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297) at java.lang.Thread.run (Thread.java:829) Caused by: java.lang.IndexOutOfBoundsException: index: 4, length: 4 (expected: range(0, 4)) at org.apache.arrow.memory.ArrowBuf.checkIndexD (ArrowBuf.java:318) at org.apache.arrow.memory.ArrowBuf.chk (ArrowBuf.java:305) at org.apache.arrow.memory.ArrowBuf.getInt (ArrowBuf.java:424) at com.test.arrow.ValidateArrow.testArrow (ValidateArrow.java:433) at com.test.arrow.ValidateArrow.main (ValidateArrow.java:440) at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method) at jdk.internal.reflect.NativeMethodAccessorImpl.invoke (NativeMethodAccessorImpl.java:62) at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke (DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke (Method.java:566) at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297) at java.lang.Thread.run (Thread.java:829)