Details
Issue Type: Bug
Status: Resolved
Priority: Minor
Resolution: Fixed
Fix Version: None
Description
When running the verification tests on AlmaLinux 8, the Parquet test fails.
Main steps to reproduce:
dnf -y update
dnf clean all
dnf -y install \
dnf-plugins-core \
yum-utils
dnf config-manager --set-enabled powertools
dnf -y update
dnf -y module disable ruby
dnf -y module enable ruby:2.7
dnf -y groupinstall "Development Tools"
dnf -y install \
epel-release \
ninja-build \
libcurl-devel \
python3-pip \
python3-devel \
cmake \
git \
ncurses-devel \
gobject-introspection-devel \
libffi-devel \
openssl-devel \
maven \
java-1.8.0-openjdk-devel \
wget \
readline-devel \
gdbm-devel \
ruby-devel \
llvm-toolset \
llvm-devel
dnf -y update
alias pip=pip3
alternatives --set python /usr/bin/python3
ln -s /usr/bin/pip3 /usr/bin/pip
git clone https://github.com/apache/arrow/
pip install -r arrow/python/requirements-build.txt \
-r arrow/python/requirements-test.txt
cd arrow
mkdir dist
export ARROW_HOME=$(pwd)/dist
export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
cd cpp
mkdir build
cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
-DCMAKE_INSTALL_LIBDIR=lib \
-DARROW_WITH_BZ2=ON \
-DARROW_WITH_ZLIB=ON \
-DARROW_WITH_ZSTD=ON \
-DARROW_WITH_LZ4=ON \
-DARROW_WITH_SNAPPY=ON \
-DARROW_WITH_BROTLI=ON \
-DARROW_PARQUET=ON \
-DARROW_PYTHON=ON \
-DARROW_BUILD_TESTS=ON \
..
make -j4
make install
cd ..
cd ..
cd python
export PYARROW_WITH_PARQUET=1
python setup.py build_ext --inplace
export PYARROW_TEST_PARQUET=ON
python -m pytest -r s --pyargs pyarrow
Resulting error:
============================================ FAILURES ============================================= ________________________________ test_permutation_of_column_order _________________________________ source = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' columns = None, use_threads = True, metadata = None, use_pandas_metadata = False memory_map = False, read_dictionary = None filesystem = <pyarrow._fs.LocalFileSystem object at 0x7f70875b7e30>, filters = None buffer_size = 0, partitioning = 'hive', use_legacy_dataset = False, ignore_prefixes = None pre_buffer = True, coerce_int96_timestamp_unit = None def read_table(source, columns=None, use_threads=True, metadata=None, use_pandas_metadata=False, memory_map=False, read_dictionary=None, filesystem=None, filters=None, buffer_size=0, partitioning="hive", use_legacy_dataset=False, ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None): if not use_legacy_dataset: if metadata is not None: raise ValueError( "The 'metadata' keyword is no longer supported with the new " "datasets-based implementation. Specify " "'use_legacy_dataset=True' to temporarily recover the old " "behaviour." 
) try: dataset = _ParquetDatasetV2( source, filesystem=filesystem, partitioning=partitioning, memory_map=memory_map, read_dictionary=read_dictionary, buffer_size=buffer_size, filters=filters, ignore_prefixes=ignore_prefixes, pre_buffer=pre_buffer, > coerce_int96_timestamp_unit=coerce_int96_timestamp_unit ) pyarrow/parquet.py:1960: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = <pyarrow.parquet._ParquetDatasetV2 object at 0x7f7087556da0> path_or_paths = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' filesystem = None, filters = None, partitioning = 'hive', read_dictionary = None, buffer_size = 0 memory_map = False, ignore_prefixes = None, pre_buffer = True, coerce_int96_timestamp_unit = None kwargs = {} def __init__(self, path_or_paths, filesystem=None, filters=None, partitioning="hive", read_dictionary=None, buffer_size=None, memory_map=False, ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, **kwargs): > import pyarrow.dataset as ds pyarrow/parquet.py:1680: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ """Dataset is currently unstable. 
APIs subject to change without notice.""" import pyarrow as pa from pyarrow.util import _is_iterable, _stringify_path, _is_path_like > from pyarrow._dataset import ( # noqa CsvFileFormat, CsvFragmentScanOptions, Expression, Dataset, DatasetFactory, DirectoryPartitioning, FileFormat, FileFragment, FileSystemDataset, FileSystemDatasetFactory, FileSystemFactoryOptions, FileWriteOptions, Fragment, HivePartitioning, IpcFileFormat, IpcFileWriteOptions, InMemoryDataset, ParquetDatasetFactory, ParquetFactoryOptions, ParquetFileFormat, ParquetFileFragment, ParquetFileWriteOptions, ParquetFragmentScanOptions, ParquetReadOptions, Partitioning, PartitioningFactory, RowGroupInfo, Scanner, TaggedRecordBatch, UnionDataset, UnionDatasetFactory, _get_partition_keys, _filesystemdataset_write, ) E ModuleNotFoundError: No module named 'pyarrow._dataset' pyarrow/dataset.py:23: ModuleNotFoundError During handling of the above exception, another exception occurred: tempdir = PosixPath('/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0') def test_permutation_of_column_order(tempdir): # ARROW-2366 case = tempdir / "dataset_column_order_permutation" case.mkdir(exist_ok=True) data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b']) pq.write_table(data1, case / "data1.parquet") data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a']) pq.write_table(data2, case / "data2.parquet") > table = pq.read_table(str(case)) pyarrow/tests/parquet/test_basic.py:645: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ pyarrow/parquet.py:1977: in read_table source = filesystem.open_input_file(path) pyarrow/_fs.pyx:588: in pyarrow._fs.FileSystem.open_input_file in_handle = GetResultValue(self.fs.OpenInputFile(pathstr)) pyarrow/error.pxi:143: in pyarrow.lib.pyarrow_internal_check_status return check_status(status) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ > raise IOError(message) E 
OSError: Cannot open for reading: path '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' is a directory pyarrow/error.pxi:114: OSError
Attachments
Issue Links
- links to