Uploaded image for project: 'Apache Arrow'
  1. Apache Arrow
  2. ARROW-6623

[CI][Python] Dask docker integration test broken perhaps by statistics-related change

    XML | Word | Printable | JSON

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: None
    • Fix Version/s: 0.15.0
    • Component/s: Python
    • Labels:
      None

      Description

      See the new failure in the following CI build:

      https://circleci.com/gh/ursa-labs/crossbow/3027?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link

      =================================== FAILURES ===================================
      ___________________ test_timeseries_nulls_in_schema[pyarrow] ___________________
      
      tmpdir = local('/tmp/pytest-of-root/pytest-0/test_timeseries_nulls_in_schem0')
      engine = 'pyarrow'
      
          def test_timeseries_nulls_in_schema(tmpdir, engine):
              tmp_path = str(tmpdir)
              ddf2 = (
                  dask.datasets.timeseries(start="2000-01-01", end="2000-01-03", freq="1h")
                  .reset_index()
                  .map_partitions(lambda x: x.loc[:5])
              )
              ddf2 = ddf2.set_index("x").reset_index().persist()
              ddf2.name = ddf2.name.where(ddf2.timestamp == "2000-01-01", None)
          
              ddf2.to_parquet(tmp_path, engine=engine)
              ddf_read = dd.read_parquet(tmp_path, engine=engine)
          
              assert_eq(ddf_read, ddf2, check_divisions=False, check_index=False)
          
              # Can force schema validation on each partition in pyarrow
              if engine == "pyarrow":
                  # The schema mismatch should raise an error
                  with pytest.raises(ValueError):
                      ddf_read = dd.read_parquet(
                          tmp_path, dataset={"validate_schema": True}, engine=engine
                      )
                  # There should be no error if you specify a schema on write
                  schema = pa.schema(
                      [
                          ("x", pa.float64()),
                          ("timestamp", pa.timestamp("ns")),
                          ("id", pa.int64()),
                          ("name", pa.string()),
                          ("y", pa.float64()),
                      ]
                  )
                  ddf2.to_parquet(tmp_path, schema=schema, engine=engine)
                  assert_eq(
      >               dd.read_parquet(tmp_path, dataset={"validate_schema": True}, engine=engine),
                      ddf2,
                      check_divisions=False,
                      check_index=False,
                  )
      
      opt/conda/lib/python3.6/site-packages/dask/dataframe/io/tests/test_parquet.py:1964: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      opt/conda/lib/python3.6/site-packages/dask/dataframe/io/parquet/core.py:190: in read_parquet
          out = sorted_columns(statistics)
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      
      statistics = ({'columns': [{'max': -0.25838390663957256, 'min': -0.979681447427093, 'name': 'x', 'null_count': 0}, {'max': Timestam...ull_count': 0}, {'max': 0.8978352477516438, 'min': -0.7218571212693894, 'name': 'y', 'null_count': 0}], 'num-rows': 7})
      
          def sorted_columns(statistics):
              """ Find sorted columns given row-group statistics
          
              This finds all columns that are sorted, along with appropriate divisions
              values for those columns
          
              Returns
              -------
              out: List of {'name': str, 'divisions': List[str]} dictionaries
              """
              if not statistics:
                  return []
          
              out = []
              for i, c in enumerate(statistics[0]["columns"]):
                  if not all(
                      "min" in s["columns"][i] and "max" in s["columns"][i] for s in statistics
                  ):
                      continue
                  divisions = [c["min"]]
                  max = c["max"]
                  success = True
                  for stats in statistics[1:]:
                      c = stats["columns"][i]
      >               if c["min"] >= max:
      E               TypeError: '>=' not supported between instances of 'numpy.ndarray' and 'str'
      
      opt/conda/lib/python3.6/site-packages/dask/dataframe/io/parquet/core.py:570: TypeError
      

        Attachments

          Activity

            People

            • Assignee:
              jorisvandenbossche Joris Van den Bossche
              Reporter:
              wesm Wes McKinney
            • Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved: