Details
Description
I'm unable to install the pyspark Python package on Amazon Linux 2, whether in a Docker image or an EMR cluster. Amazon Linux 2 currently ships with Python 3.7 and pip 9.0.3, but upgrading pip yields the same result.
Installing the package fails with the error "ValueError: bad marshal data (unknown type code)". The full stack trace is below.
This bug prevents the use of pyspark in simple testing environments, and the use of tools that depend on the pyspark package, such as https://github.com/awslabs/python-deequ.
Stack Trace:
Step 3/3 : RUN pip3 install pyspark==2.4.7
{{ ---> Running in 2c6e1c1de62f}}
WARNING: Running pip install with root privileges is generally not a good idea. Try `pip3 install --user` instead.
Collecting pyspark==2.4.7
{{ Downloading https://files.pythonhosted.org/packages/e2/06/29f80e5a464033432eedf89924e7aa6ebbc47ce4dcd956853a73627f2c07/pyspark-2.4.7.tar.gz (217.9MB)}}
{{ Complete output from command python setup.py egg_info:}}
{{ Could not import pypandoc - required to package PySpark}}
{{ /usr/lib64/python3.7/distutils/dist.py:274: UserWarning: Unknown distribution option: 'long_description_content_type'}}
{{ warnings.warn(msg)}}
{{ zip_safe flag not set; analyzing archive contents...}}
{{ Traceback (most recent call last):}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 154, in save_modules}}
{{ yield saved}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 195, in setup_context}}
{{ yield}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 250, in run_setup}}
{{ _execfile(setup_script, ns)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 45, in _execfile}}
{{ exec(code, globals, locals)}}
{{ File "/tmp/easy_install-l742j64w/pypandoc-1.5/setup.py", line 111, in <module>}}
{{ # using Python imports instead which will be resolved correctly.}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/__init__.py", line 129, in setup}}
{{ return distutils.core.setup(**attrs)}}
{{ File "/usr/lib64/python3.7/distutils/core.py", line 148, in setup}}
{{ dist.run_commands()}}
{{ File "/usr/lib64/python3.7/distutils/dist.py", line 966, in run_commands}}
{{ self.run_command(cmd)}}
{{ File "/usr/lib64/python3.7/distutils/dist.py", line 985, in run_command}}
{{ cmd_obj.run()}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/bdist_egg.py", line 218, in run}}
{{ os.path.join(archive_root, 'EGG-INFO'), self.zip_safe()}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/bdist_egg.py", line 269, in zip_safe}}
{{ return analyze_egg(self.bdist_dir, self.stubs)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/bdist_egg.py", line 379, in analyze_egg}}
{{ safe = scan_module(egg_dir, base, name, stubs) and safe}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/bdist_egg.py", line 416, in scan_module}}
{{ code = marshal.load(f)}}
{{ ValueError: bad marshal data (unknown type code)}}

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
{{ File "<string>", line 1, in <module>}}
{{ File "/tmp/pip-build-j3d56a0n/pyspark/setup.py", line 224, in <module>}}
{{ 'Programming Language :: Python :: Implementation :: PyPy']}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/__init__.py", line 128, in setup}}
{{ _install_setup_requires(attrs)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/__init__.py", line 123, in _install_setup_requires}}
{{ dist.fetch_build_eggs(dist.setup_requires)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/dist.py", line 461, in fetch_build_eggs}}
{{ replace_conflicting=True,}}
{{ File "/usr/lib/python3.7/site-packages/pkg_resources/__init__.py", line 866, in resolve}}
{{ replace_conflicting=replace_conflicting}}
{{ File "/usr/lib/python3.7/site-packages/pkg_resources/__init__.py", line 1146, in best_match}}
{{ return self.obtain(req, installer)}}
{{ File "/usr/lib/python3.7/site-packages/pkg_resources/__init__.py", line 1158, in obtain}}
{{ return installer(requirement)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/dist.py", line 528, in fetch_build_egg}}
{{ return cmd.easy_install(req)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/easy_install.py", line 672, in easy_install}}
{{ return self.install_item(spec, dist.location, tmpdir, deps)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/easy_install.py", line 698, in install_item}}
{{ dists = self.install_eggs(spec, download, tmpdir)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/easy_install.py", line 881, in install_eggs}}
{{ return self.build_and_install(setup_script, setup_base)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/easy_install.py", line 1149, in build_and_install}}
{{ self.run_setup(setup_script, setup_base, args)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/easy_install.py", line 1135, in run_setup}}
{{ run_setup(setup_script, args)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 253, in run_setup}}
{{ raise}}
{{ File "/usr/lib64/python3.7/contextlib.py", line 130, in __exit__}}
{{ self.gen.throw(type, value, traceback)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 195, in setup_context}}
{{ yield}}
{{ File "/usr/lib64/python3.7/contextlib.py", line 130, in __exit__}}
{{ self.gen.throw(type, value, traceback)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 166, in save_modules}}
{{ saved_exc.resume()}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 141, in resume}}
{{ six.reraise(type, exc, self._tb)}}
{{ File "/usr/lib/python3.7/site-packages/pkg_resources/_vendor/six.py", line 685, in reraise}}
{{ raise value.with_traceback(tb)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 154, in save_modules}}
{{ yield saved}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 195, in setup_context}}
{{ yield}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 250, in run_setup}}
{{ _execfile(setup_script, ns)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/sandbox.py", line 45, in _execfile}}
{{ exec(code, globals, locals)}}
{{ File "/tmp/easy_install-l742j64w/pypandoc-1.5/setup.py", line 111, in <module>}}
{{ # using Python imports instead which will be resolved correctly.}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/__init__.py", line 129, in setup}}
{{ return distutils.core.setup(**attrs)}}
{{ File "/usr/lib64/python3.7/distutils/core.py", line 148, in setup}}
{{ dist.run_commands()}}
{{ File "/usr/lib64/python3.7/distutils/dist.py", line 966, in run_commands}}
{{ self.run_command(cmd)}}
{{ File "/usr/lib64/python3.7/distutils/dist.py", line 985, in run_command}}
{{ cmd_obj.run()}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/bdist_egg.py", line 218, in run}}
{{ os.path.join(archive_root, 'EGG-INFO'), self.zip_safe()}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/bdist_egg.py", line 269, in zip_safe}}
{{ return analyze_egg(self.bdist_dir, self.stubs)}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/bdist_egg.py", line 379, in analyze_egg}}
{{ safe = scan_module(egg_dir, base, name, stubs) and safe}}
{{ File "/usr/lib/python3.7/site-packages/setuptools/command/bdist_egg.py", line 416, in scan_module}}
{{ code = marshal.load(f)}}
{{ ValueError: bad marshal data (unknown type code)}}