Apache Arrow
ARROW-14629

[Release][Python] Parquet test fails on AlmaLinux8



    Description

      When running the release verification tests on AlmaLinux 8, the Parquet tests fail.

      Main steps to reproduce:

      # Install build dependencies on AlmaLinux 8
      dnf -y update
      dnf clean all
      dnf -y install \
        dnf-plugins-core \
        yum-utils
      dnf config-manager --set-enabled powertools
      dnf -y update
      dnf -y module disable ruby
      dnf -y module enable ruby:2.7
      dnf -y groupinstall "Development Tools"
      dnf -y install \
        epel-release \
        ninja-build \
        libcurl-devel \
        python3-pip \
        python3-devel \
        cmake \
        git \
        ncurses-devel \
        gobject-introspection-devel \
        libffi-devel \
        openssl-devel \
        maven \
        java-1.8.0-openjdk-devel \
        wget \
        readline-devel \
        gdbm-devel \
        ruby-devel \
        llvm-toolset \
        llvm-devel
      dnf -y update
      alias pip=pip3
      alternatives --set python /usr/bin/python3
      ln -s /usr/bin/pip3 /usr/bin/pip
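
      # Fetch the Arrow sources and install the Python build and test requirements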
      git clone https://github.com/apache/arrow/
      pip install -r arrow/python/requirements-build.txt \
           -r arrow/python/requirements-test.txt
      cd arrow
      mkdir dist
      export ARROW_HOME=$(pwd)/dist
      export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
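
      # Configure and build Arrow C++ with Parquet enabled, then install it into $ARROW_HOME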
      cd cpp
      mkdir build
      cd build
      cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
            -DCMAKE_INSTALL_LIBDIR=lib \
            -DARROW_WITH_BZ2=ON \
            -DARROW_WITH_ZLIB=ON \
            -DARROW_WITH_ZSTD=ON \
            -DARROW_WITH_LZ4=ON \
            -DARROW_WITH_SNAPPY=ON \
            -DARROW_WITH_BROTLI=ON \
            -DARROW_PARQUET=ON \
            -DARROW_PYTHON=ON \
            -DARROW_BUILD_TESTS=ON \
            ..
      make -j4
      make install
      cd ..
      cd ..
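
      # Build the pyarrow extension modules in place with Parquet enabled and run the test suite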
      cd python
      export PYARROW_WITH_PARQUET=1
      python setup.py build_ext --inplace
      export PYARROW_TEST_PARQUET=ON
      python -m pytest -r s --pyargs pyarrow
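
      As a quick sanity check (a sketch, not part of the reported steps), the snippet
      below lists which optional pyarrow extension modules the build above actually
      produced; the failure that follows traces back to one of these being missing.

      # Sketch only (not part of the reported reproduction steps): list which
      # optional pyarrow extension modules are importable from the in-place build.
      import importlib

      for mod in ("pyarrow.parquet", "pyarrow.dataset", "pyarrow.fs"):
          try:
              importlib.import_module(mod)
              print(f"{mod}: OK")
          except ImportError as exc:
              print(f"{mod}: MISSING ({exc})")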
      

      Resulting error:

      ============================================ FAILURES =============================================
      ________________________________ test_permutation_of_column_order _________________________________
      
      source = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
      columns = None, use_threads = True, metadata = None, use_pandas_metadata = False
      memory_map = False, read_dictionary = None
      filesystem = <pyarrow._fs.LocalFileSystem object at 0x7f70875b7e30>, filters = None
      buffer_size = 0, partitioning = 'hive', use_legacy_dataset = False, ignore_prefixes = None
      pre_buffer = True, coerce_int96_timestamp_unit = None
      
          def read_table(source, columns=None, use_threads=True, metadata=None,
                         use_pandas_metadata=False, memory_map=False,
                         read_dictionary=None, filesystem=None, filters=None,
                         buffer_size=0, partitioning="hive", use_legacy_dataset=False,
                         ignore_prefixes=None, pre_buffer=True,
                         coerce_int96_timestamp_unit=None):
              if not use_legacy_dataset:
                  if metadata is not None:
                      raise ValueError(
                          "The 'metadata' keyword is no longer supported with the new "
                          "datasets-based implementation. Specify "
                          "'use_legacy_dataset=True' to temporarily recover the old "
                          "behaviour."
                      )
                  try:
                      dataset = _ParquetDatasetV2(
                          source,
                          filesystem=filesystem,
                          partitioning=partitioning,
                          memory_map=memory_map,
                          read_dictionary=read_dictionary,
                          buffer_size=buffer_size,
                          filters=filters,
                          ignore_prefixes=ignore_prefixes,
                          pre_buffer=pre_buffer,
      >                   coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
                      )
      
      pyarrow/parquet.py:1960: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
      
      self = <pyarrow.parquet._ParquetDatasetV2 object at 0x7f7087556da0>
      path_or_paths = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
      filesystem = None, filters = None, partitioning = 'hive', read_dictionary = None, buffer_size = 0
      memory_map = False, ignore_prefixes = None, pre_buffer = True, coerce_int96_timestamp_unit = None
      kwargs = {}
      
          def __init__(self, path_or_paths, filesystem=None, filters=None,
                       partitioning="hive", read_dictionary=None, buffer_size=None,
                       memory_map=False, ignore_prefixes=None, pre_buffer=True,
                       coerce_int96_timestamp_unit=None, **kwargs):
      >       import pyarrow.dataset as ds
      
      pyarrow/parquet.py:1680: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
      
          """Dataset is currently unstable. APIs subject to change without notice."""
          
          import pyarrow as pa
          from pyarrow.util import _is_iterable, _stringify_path, _is_path_like
          
      >   from pyarrow._dataset import (  # noqa
              CsvFileFormat,
              CsvFragmentScanOptions,
              Expression,
              Dataset,
              DatasetFactory,
              DirectoryPartitioning,
              FileFormat,
              FileFragment,
              FileSystemDataset,
              FileSystemDatasetFactory,
              FileSystemFactoryOptions,
              FileWriteOptions,
              Fragment,
              HivePartitioning,
              IpcFileFormat,
              IpcFileWriteOptions,
              InMemoryDataset,
              ParquetDatasetFactory,
              ParquetFactoryOptions,
              ParquetFileFormat,
              ParquetFileFragment,
              ParquetFileWriteOptions,
              ParquetFragmentScanOptions,
              ParquetReadOptions,
              Partitioning,
              PartitioningFactory,
              RowGroupInfo,
              Scanner,
              TaggedRecordBatch,
              UnionDataset,
              UnionDatasetFactory,
              _get_partition_keys,
              _filesystemdataset_write,
          )
      E   ModuleNotFoundError: No module named 'pyarrow._dataset'
      
      pyarrow/dataset.py:23: ModuleNotFoundError
      
      During handling of the above exception, another exception occurred:
      
      tempdir = PosixPath('/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0')
      
          def test_permutation_of_column_order(tempdir):
              # ARROW-2366
              case = tempdir / "dataset_column_order_permutation"
              case.mkdir(exist_ok=True)
          
              data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
              pq.write_table(data1, case / "data1.parquet")
          
              data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
              pq.write_table(data2, case / "data2.parquet")
          
      >       table = pq.read_table(str(case))
      
      pyarrow/tests/parquet/test_basic.py:645: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
      pyarrow/parquet.py:1977: in read_table
          source = filesystem.open_input_file(path)
      pyarrow/_fs.pyx:588: in pyarrow._fs.FileSystem.open_input_file
          in_handle = GetResultValue(self.fs.OpenInputFile(pathstr))
      pyarrow/error.pxi:143: in pyarrow.lib.pyarrow_internal_check_status
          return check_status(status)
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
      
      >   raise IOError(message)
      E   OSError: Cannot open for reading: path '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' is a directory
      
      pyarrow/error.pxi:114: OSError
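
      Two errors are chained in this traceback: the import of pyarrow._dataset fails,
      which suggests that the dataset Cython extension was not built by the steps
      above (for example because PYARROW_WITH_DATASET was not set; this is an
      assumption, not something stated in the report), and read_table() then falls
      back to opening the source as a single file, which the local filesystem rejects
      because the path is a directory. A minimal sketch of the same failure mode
      outside pytest, using a hypothetical path under /tmp:

      # Sketch (hypothetical path, same pyarrow build as in the steps above):
      # reading a directory of Parquet files goes through pyarrow.dataset; when that
      # module is missing, read_table() hands the directory to the filesystem as if
      # it were a single file, producing the OSError shown above.
      import pathlib

      import pyarrow as pa
      import pyarrow.parquet as pq

      case = pathlib.Path("/tmp/dataset_column_order_permutation")
      case.mkdir(exist_ok=True)
      pq.write_table(pa.table([[1, 2, 3], [.1, .2, .3]], names=["a", "b"]),
                     case / "data1.parquet")
      pq.write_table(pa.table([[.4, .5, .6], [4, 5, 6]], names=["b", "a"]),
                     case / "data2.parquet")

      pq.read_table(str(case))  # raises OSError when pyarrow.dataset is unavailable

      With a build that includes the dataset extension, the same call is expected to
      read both files and reconcile the differing column order, which is what
      test_permutation_of_column_order checks.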
      
      


            People

              jorisvandenbossche (Joris Van den Bossche)
              baksmj (Benson Muite)
              Votes: 0
              Watchers: 4


            Time Tracking

              Original Estimate: Not Specified
              Remaining Estimate: 0h
              Time Spent: 1h 20m