Apache Arrow
ARROW-14629

[Release][Python] Parquet test fails on AlmaLinux8



    Description

      When running the release verification tests on AlmaLinux 8, the Parquet tests fail.

      Main steps to reproduce:

      # Install build dependencies on AlmaLinux 8
      dnf -y update
      dnf clean all
      dnf -y install \
        dnf-plugins-core \
        yum-utils
      dnf config-manager --set-enabled powertools
      dnf -y update
      dnf -y module disable ruby
      dnf -y module enable ruby:2.7
      dnf -y groupinstall "Development Tools"
      dnf -y install \
        epel-release \
        ninja-build \
        libcurl-devel \
        python3-pip \
        python3-devel \
        cmake \
        git \
        ncurses-devel \
        gobject-introspection-devel \
        libffi-devel \
        openssl-devel \
        maven \
        java-1.8.0-openjdk-devel \
        wget \
        readline-devel \
        gdbm-devel \
        ruby-devel \
        llvm-toolset \
        llvm-devel
      dnf -y update
      alias pip=pip3
      alternatives --set python /usr/bin/python3
      ln -s /usr/bin/pip3 /usr/bin/pip
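
      # Fetch the Arrow sources and install the Python build and test requirements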
      git clone https://github.com/apache/arrow/
      pip install -r arrow/python/requirements-build.txt \
           -r arrow/python/requirements-test.txt
      cd arrow
      mkdir dist
      export ARROW_HOME=$(pwd)/dist
      export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
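
      # Configure and build Arrow C++ with Parquet enabled, then install it into $ARROW_HOME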
      cd cpp
      mkdir build
      cd build
      cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
            -DCMAKE_INSTALL_LIBDIR=lib \
            -DARROW_WITH_BZ2=ON \
            -DARROW_WITH_ZLIB=ON \
            -DARROW_WITH_ZSTD=ON \
            -DARROW_WITH_LZ4=ON \
            -DARROW_WITH_SNAPPY=ON \
            -DARROW_WITH_BROTLI=ON \
            -DARROW_PARQUET=ON \
            -DARROW_PYTHON=ON \
            -DARROW_BUILD_TESTS=ON \
            ..
      make -j4
      make install
      cd ..
      cd ..
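
      # Build the pyarrow extension modules in place with Parquet enabled and run the test suite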
      cd python
      export PYARROW_WITH_PARQUET=1
      python setup.py build_ext --inplace
      export PYARROW_TEST_PARQUET=ON
      python -m pytest -r s --pyargs pyarrow
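
      As a quick sanity check (a sketch, not part of the reported steps), the snippet
      below lists which optional pyarrow extension modules the build above actually
      produced; the failure that follows traces back to one of these being missing.

      # Sketch only (not part of the reported reproduction steps): list which
      # optional pyarrow extension modules are importable from the in-place build.
      import importlib

      for mod in ("pyarrow.parquet", "pyarrow.dataset", "pyarrow.fs"):
          try:
              importlib.import_module(mod)
              print(f"{mod}: OK")
          except ImportError as exc:
              print(f"{mod}: MISSING ({exc})")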
      

      Resulting error:

      ============================================ FAILURES =============================================
      ________________________________ test_permutation_of_column_order _________________________________
      
      source = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
      columns = None, use_threads = True, metadata = None, use_pandas_metadata = False
      memory_map = False, read_dictionary = None
      filesystem = <pyarrow._fs.LocalFileSystem object at 0x7f70875b7e30>, filters = None
      buffer_size = 0, partitioning = 'hive', use_legacy_dataset = False, ignore_prefixes = None
      pre_buffer = True, coerce_int96_timestamp_unit = None
      
          def read_table(source, columns=None, use_threads=True, metadata=None,
                         use_pandas_metadata=False, memory_map=False,
                         read_dictionary=None, filesystem=None, filters=None,
                         buffer_size=0, partitioning="hive", use_legacy_dataset=False,
                         ignore_prefixes=None, pre_buffer=True,
                         coerce_int96_timestamp_unit=None):
              if not use_legacy_dataset:
                  if metadata is not None:
                      raise ValueError(
                          "The 'metadata' keyword is no longer supported with the new "
                          "datasets-based implementation. Specify "
                          "'use_legacy_dataset=True' to temporarily recover the old "
                          "behaviour."
                      )
                  try:
                      dataset = _ParquetDatasetV2(
                          source,
                          filesystem=filesystem,
                          partitioning=partitioning,
                          memory_map=memory_map,
                          read_dictionary=read_dictionary,
                          buffer_size=buffer_size,
                          filters=filters,
                          ignore_prefixes=ignore_prefixes,
                          pre_buffer=pre_buffer,
      >                   coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
                      )
      
      pyarrow/parquet.py:1960: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
      
      self = <pyarrow.parquet._ParquetDatasetV2 object at 0x7f7087556da0>
      path_or_paths = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
      filesystem = None, filters = None, partitioning = 'hive', read_dictionary = None, buffer_size = 0
      memory_map = False, ignore_prefixes = None, pre_buffer = True, coerce_int96_timestamp_unit = None
      kwargs = {}
      
          def __init__(self, path_or_paths, filesystem=None, filters=None,
                       partitioning="hive", read_dictionary=None, buffer_size=None,
                       memory_map=False, ignore_prefixes=None, pre_buffer=True,
                       coerce_int96_timestamp_unit=None, **kwargs):
      >       import pyarrow.dataset as ds
      
      pyarrow/parquet.py:1680: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
      
          """Dataset is currently unstable. APIs subject to change without notice."""
          
          import pyarrow as pa
          from pyarrow.util import _is_iterable, _stringify_path, _is_path_like
          
      >   from pyarrow._dataset import (  # noqa
              CsvFileFormat,
              CsvFragmentScanOptions,
              Expression,
              Dataset,
              DatasetFactory,
              DirectoryPartitioning,
              FileFormat,
              FileFragment,
              FileSystemDataset,
              FileSystemDatasetFactory,
              FileSystemFactoryOptions,
              FileWriteOptions,
              Fragment,
              HivePartitioning,
              IpcFileFormat,
              IpcFileWriteOptions,
              InMemoryDataset,
              ParquetDatasetFactory,
              ParquetFactoryOptions,
              ParquetFileFormat,
              ParquetFileFragment,
              ParquetFileWriteOptions,
              ParquetFragmentScanOptions,
              ParquetReadOptions,
              Partitioning,
              PartitioningFactory,
              RowGroupInfo,
              Scanner,
              TaggedRecordBatch,
              UnionDataset,
              UnionDatasetFactory,
              _get_partition_keys,
              _filesystemdataset_write,
          )
      E   ModuleNotFoundError: No module named 'pyarrow._dataset'
      
      pyarrow/dataset.py:23: ModuleNotFoundError
      
      During handling of the above exception, another exception occurred:
      
      tempdir = PosixPath('/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0')
      
          def test_permutation_of_column_order(tempdir):
              # ARROW-2366
              case = tempdir / "dataset_column_order_permutation"
              case.mkdir(exist_ok=True)
          
              data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
              pq.write_table(data1, case / "data1.parquet")
          
              data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
              pq.write_table(data2, case / "data2.parquet")
          
      >       table = pq.read_table(str(case))
      
      pyarrow/tests/parquet/test_basic.py:645: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
      pyarrow/parquet.py:1977: in read_table
          source = filesystem.open_input_file(path)
      pyarrow/_fs.pyx:588: in pyarrow._fs.FileSystem.open_input_file
          in_handle = GetResultValue(self.fs.OpenInputFile(pathstr))
      pyarrow/error.pxi:143: in pyarrow.lib.pyarrow_internal_check_status
          return check_status(status)
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
      
      >   raise IOError(message)
      E   OSError: Cannot open for reading: path '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' is a directory
      
      pyarrow/error.pxi:114: OSError
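
      Two errors are chained in this traceback: the import of pyarrow._dataset fails,
      which suggests that the dataset Cython extension was not built by the steps
      above (for example because PYARROW_WITH_DATASET was not set; this is an
      assumption, not something stated in the report), and read_table() then falls
      back to opening the source as a single file, which the local filesystem rejects
      because the path is a directory. A minimal sketch of the same failure mode
      outside pytest, using a hypothetical path under /tmp:

      # Sketch (hypothetical path, same pyarrow build as in the steps above):
      # reading a directory of Parquet files goes through pyarrow.dataset; when that
      # module is missing, read_table() hands the directory to the filesystem as if
      # it were a single file, producing the OSError shown above.
      import pathlib

      import pyarrow as pa
      import pyarrow.parquet as pq

      case = pathlib.Path("/tmp/dataset_column_order_permutation")
      case.mkdir(exist_ok=True)
      pq.write_table(pa.table([[1, 2, 3], [.1, .2, .3]], names=["a", "b"]),
                     case / "data1.parquet")
      pq.write_table(pa.table([[.4, .5, .6], [4, 5, 6]], names=["b", "a"]),
                     case / "data2.parquet")

      pq.read_table(str(case))  # raises OSError when pyarrow.dataset is unavailable

      With a build that includes the dataset extension, the same call is expected to
      read both files and reconcile the differing column order, which is what
      test_permutation_of_column_order checks.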
      
      


            People

              jorisvandenbossche (Joris Van den Bossche)
              baksmj (Benson Muite)
              Votes: 0
              Watchers: 4


            Time Tracking

              Original Estimate: Not Specified
              Remaining Estimate: 0h
              Time Spent: 1h 20m