Details
-
Bug
-
Status: Resolved
-
Critical
-
Resolution: Fixed
-
0.8.0, 0.9.0
Description
When writing multiple batches to a Stream/File Writer, the first validity bit can get garbled between writing and reading. I couldn't pinpoint the exact issue, but I was able to re-create it with a fairly simple unit test.
in TestArrowStream.java:
/**
 * Reproduces the garbled first validity bit when more than one record batch is written
 * through a single {@code ArrowStreamWriter} and read back with an {@code ArrowStreamReader}.
 *
 * <p>Batch 1 holds {@code [null, 1, 2, null, 1]} and batch 2 holds {@code [null, 1, 2]}.
 * The same {@code IntVector} is reused for both batches without being cleared in between,
 * and slot 0 is null in both, so the read side must report {@code null} at index 0 each time.
 *
 * @throws IOException if writing to or reading from the in-memory stream fails
 */
@Test
public void testReadWriteMultipleBatches() throws IOException {
  ByteArrayOutputStream os = new ByteArrayOutputStream();

  try (IntVector vector = new IntVector("foo", allocator)) {
    Schema schema = new Schema(Collections.singletonList(vector.getField()), null);
    try (VectorSchemaRoot root =
            new VectorSchemaRoot(
                schema,
                Collections.singletonList((FieldVector) vector),
                vector.getValueCount());
        ArrowStreamWriter writer =
            new ArrowStreamWriter(root, new MapDictionaryProvider(), Channels.newChannel(os))) {
      writer.start();

      // First batch: five slots, nulls at index 0 and 3.
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setNull(3);
      vector.setSafe(4, 1);
      vector.setValueCount(5);
      root.setRowCount(5);
      writer.writeBatch();

      // Second batch: the vector is intentionally reused as-is; only slots 0..2 are rewritten.
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setValueCount(3);
      root.setRowCount(3);
      writer.writeBatch();
    }
  }

  ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray());

  try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator)) {
    IntVector read = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure
    // messages report the garbled value as the "actual" one.
    reader.loadNextBatch();
    assertEquals(5, read.getValueCount());
    assertNull(read.getObject(0));
    assertEquals(Integer.valueOf(1), read.getObject(1));
    assertEquals(Integer.valueOf(2), read.getObject(2));
    assertNull(read.getObject(3));
    assertEquals(Integer.valueOf(1), read.getObject(4));

    reader.loadNextBatch();
    assertEquals(3, read.getValueCount());
    assertNull(read.getObject(0));
    assertEquals(Integer.valueOf(1), read.getObject(1));
    assertEquals(Integer.valueOf(2), read.getObject(2));
  }
}
in TestArrowFile.java:
/**
 * Reproduces the garbled first validity bit when more than one record batch is written
 * through a single {@code ArrowFileWriter} and read back with an {@code ArrowFileReader}.
 *
 * <p>Batch 1 holds {@code [null, 1, 2, null, 1]} and batch 2 holds {@code [null, 1, 2]}.
 * The same {@code IntVector} is reused for both batches without being cleared in between,
 * and slot 0 is null in both, so the read side must report {@code null} at index 0 each time.
 * The batches are round-tripped through {@code target/mytest_nulls_multibatch.arrow}.
 *
 * @throws IOException if the file cannot be written or read
 */
@Test
public void testReadWriteMultipleBatches() throws IOException {
  File file = new File("target/mytest_nulls_multibatch.arrow");

  try (IntVector vector = new IntVector("foo", allocator)) {
    Schema schema = new Schema(Collections.singletonList(vector.getField()), null);
    try (FileOutputStream fileOutputStream = new FileOutputStream(file);
        VectorSchemaRoot root =
            new VectorSchemaRoot(
                schema,
                Collections.singletonList((FieldVector) vector),
                vector.getValueCount());
        ArrowFileWriter writer =
            new ArrowFileWriter(
                root, new MapDictionaryProvider(), fileOutputStream.getChannel())) {
      writer.start();

      // First batch: five slots, nulls at index 0 and 3.
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setNull(3);
      vector.setSafe(4, 1);
      vector.setValueCount(5);
      root.setRowCount(5);
      writer.writeBatch();

      // Second batch: the vector is intentionally reused as-is; only slots 0..2 are rewritten.
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setValueCount(3);
      root.setRowCount(3);
      writer.writeBatch();
    }
  }

  try (FileInputStream fileInputStream = new FileInputStream(file);
      ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), allocator)) {
    IntVector read = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure
    // messages report the garbled value as the "actual" one.
    reader.loadNextBatch();
    assertEquals(5, read.getValueCount());
    assertNull(read.getObject(0));
    assertEquals(Integer.valueOf(1), read.getObject(1));
    assertEquals(Integer.valueOf(2), read.getObject(2));
    assertNull(read.getObject(3));
    assertEquals(Integer.valueOf(1), read.getObject(4));

    reader.loadNextBatch();
    assertEquals(3, read.getValueCount());
    assertNull(read.getObject(0));
    assertEquals(Integer.valueOf(1), read.getObject(1));
    assertEquals(Integer.valueOf(2), read.getObject(2));
  }
}
Attachments
Issue Links
- links to