Details
Type: Bug
Status: Resolved
Priority: Major
Resolution: Fixed
Affects Version/s: None
Fix Version/s: None
Description
See the new failure:
=================================== FAILURES ===================================
___________________ test_timeseries_nulls_in_schema[pyarrow] ___________________

tmpdir = local('/tmp/pytest-of-root/pytest-0/test_timeseries_nulls_in_schem0')
engine = 'pyarrow'

    def test_timeseries_nulls_in_schema(tmpdir, engine):
        tmp_path = str(tmpdir)
        ddf2 = (
            dask.datasets.timeseries(start="2000-01-01", end="2000-01-03", freq="1h")
            .reset_index()
            .map_partitions(lambda x: x.loc[:5])
        )
        ddf2 = ddf2.set_index("x").reset_index().persist()
        ddf2.name = ddf2.name.where(ddf2.timestamp == "2000-01-01", None)

        ddf2.to_parquet(tmp_path, engine=engine)
        ddf_read = dd.read_parquet(tmp_path, engine=engine)
        assert_eq(ddf_read, ddf2, check_divisions=False, check_index=False)

        # Can force schema validation on each partition in pyarrow
        if engine == "pyarrow":
            # The schema mismatch should raise an error
            with pytest.raises(ValueError):
                ddf_read = dd.read_parquet(
                    tmp_path, dataset={"validate_schema": True}, engine=engine
                )
            # There should be no error if you specify a schema on write
            schema = pa.schema(
                [
                    ("x", pa.float64()),
                    ("timestamp", pa.timestamp("ns")),
                    ("id", pa.int64()),
                    ("name", pa.string()),
                    ("y", pa.float64()),
                ]
            )
            ddf2.to_parquet(tmp_path, schema=schema, engine=engine)
            assert_eq(
>               dd.read_parquet(tmp_path, dataset={"validate_schema": True}, engine=engine),
                ddf2,
                check_divisions=False,
                check_index=False,
            )

opt/conda/lib/python3.6/site-packages/dask/dataframe/io/tests/test_parquet.py:1964:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
opt/conda/lib/python3.6/site-packages/dask/dataframe/io/parquet/core.py:190: in read_parquet
    out = sorted_columns(statistics)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

statistics = ({'columns': [{'max': -0.25838390663957256, 'min': -0.979681447427093, 'name': 'x', 'null_count': 0}, {'max': Timestam...ull_count': 0}, {'max': 0.8978352477516438, 'min': -0.7218571212693894, 'name': 'y', 'null_count': 0}], 'num-rows': 7})

    def sorted_columns(statistics):
        """ Find sorted columns given row-group statistics

        This finds all columns that are sorted, along with appropriate divisions
        values for those columns

        Returns
        -------
        out: List of {'name': str, 'divisions': List[str]} dictionaries
        """
        if not statistics:
            return []

        out = []
        for i, c in enumerate(statistics[0]["columns"]):
            if not all(
                "min" in s["columns"][i] and "max" in s["columns"][i] for s in statistics
            ):
                continue
            divisions = [c["min"]]
            max = c["max"]
            success = True
            for stats in statistics[1:]:
                c = stats["columns"][i]
>               if c["min"] >= max:
E               TypeError: '>=' not supported between instances of 'numpy.ndarray' and 'str'

opt/conda/lib/python3.6/site-packages/dask/dataframe/io/parquet/core.py:570: TypeError
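The crash happens because one row group reports a min/max statistic as a numpy.ndarray while another reports a plain str, and Python cannot order the two, so sorted_columns raises TypeError and the whole read_parquet call fails. Below is a minimal defensive sketch of sorted_columns that catches the TypeError and treats the column as unsorted instead. This is only an illustration of one possible guard, not the patch actually shipped in dask; the divisions/success logic is reconstructed from the snippet visible in the traceback above.

import numpy as np  # only needed for the usage example below

def sorted_columns(statistics):
    """Find sorted columns given row-group statistics.

    Columns whose min/max statistics are not mutually orderable
    (e.g. numpy.ndarray vs. str, as in this failure) are skipped
    instead of raising TypeError.
    """
    if not statistics:
        return []

    out = []
    for i, c in enumerate(statistics[0]["columns"]):
        # Every row group must carry both min and max for this column
        if not all(
            "min" in s["columns"][i] and "max" in s["columns"][i] for s in statistics
        ):
            continue
        divisions = [c["min"]]
        maximum = c["max"]
        success = True
        for stats in statistics[1:]:
            c = stats["columns"][i]
            try:
                sorted_partition = c["min"] >= maximum
            except TypeError:
                # Mixed statistic types cannot be ordered; treat the
                # column as unsorted rather than crashing read_parquet.
                success = False
                break
            if sorted_partition:
                divisions.append(c["min"])
                maximum = c["max"]
            else:
                success = False
                break
        if success:
            divisions.append(maximum)
            out.append(
                {"name": statistics[0]["columns"][i]["name"], "divisions": divisions}
            )
    return out

# Usage example with made-up statistics: the 'name' column mixes an
# ndarray with a str (mimicking the failure) and is silently skipped,
# while the orderable 'x' column still yields divisions.
stats = [
    {"columns": [{"name": "x", "min": 0, "max": 1, "null_count": 0},
                 {"name": "name", "min": np.array(["a"]), "max": np.array(["b"]), "null_count": 0}]},
    {"columns": [{"name": "x", "min": 1, "max": 2, "null_count": 0},
                 {"name": "name", "min": "c", "max": "d", "null_count": 0}]},
]
print(sorted_columns(stats))  # [{'name': 'x', 'divisions': [0, 1, 2]}]

With a guard like this, the 'name' column (whose statistics come back inconsistently typed after the schema round-trip in the test) simply drops out of sorted-column detection, and the test's read_parquet call with validate_schema=True can proceed.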