From 828e6e92db2fa12d9c83ccc51f3ae139b9f33052 Mon Sep 17 00:00:00 2001 From: Sunil G Date: Fri, 1 Jun 2018 13:24:01 +0530 Subject: [PATCH] YARN-8220 --- hadoop-project/src/site/site.xml | 5 + .../hadoop-yarn-deep-learning-frameworks/pom.xml | 27 +++ .../base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 | 64 +++++ .../base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 | 69 ++++++ .../ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 | 62 +++++ .../ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 | 67 ++++++ .../ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 | 72 ++++++ .../ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 | 79 ++++++ .../ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 | 69 ++++++ .../ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 | 77 ++++++ .../scripts/example_distributed_tf_job_spec.json | 56 +++++ .../scripts/example_standalone_tf_job_spec.json | 31 +++ .../tensorflow/scripts/presetup-tf.sh_template | 34 +++ .../src/main/tensorflow/scripts/submit_tf_job.py | 264 +++++++++++++++++++++ .../src/site/markdown/Dockerfile.md | 105 ++++++++ .../src/site/markdown/Overview.md | 25 ++ .../markdown/RunTensorflowJobUsingHelperScript.md | 239 +++++++++++++++++++ .../RunTensorflowJobUsingNativeServiceSpec.md | 247 +++++++++++++++++++ .../src/site/markdown/TensorflowOnYarnTutorial.md | 137 +++++++++++ .../src/site/site.xml | 28 +++ .../hadoop-yarn/hadoop-yarn-applications/pom.xml | 1 + 21 files changed, 1758 insertions(+) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/pom.xml create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_distributed_tf_job_spec.json create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_standalone_tf_job_spec.json create mode 100644 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/presetup-tf.sh_template create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/submit_tf_job.py create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Dockerfile.md create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Overview.md create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingHelperScript.md create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingNativeServiceSpec.md create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/TensorflowOnYarnTutorial.md create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/site.xml diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml index 40df7c5e854..859a192bb72 100644 --- a/hadoop-project/src/site/site.xml +++ b/hadoop-project/src/site/site.xml @@ -179,6 +179,11 @@ + + + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/pom.xml new file mode 100644 index 00000000000..52466938274 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/pom.xml @@ -0,0 +1,27 @@ + + + + + hadoop-yarn-applications + org.apache.hadoop + 3.2.0-SNAPSHOT + + 4.0.0 + + hadoop-yarn-deep-learning-frameworks + + \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 new file mode 100644 index 00000000000..3d19bf92509 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM ubuntu:16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- # +# These lines will be edited automatically by parameterized_docker_build.sh. # +# COPY _PIP_FILE_ / +# RUN pip --no-cache-dir install /_PIP_FILE_ +# RUN rm -f /_PIP_FILE_ + +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp27-none-linux_x86_64.whl + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 new file mode 100644 index 00000000000..db144da7fc7 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM ubuntu:16.04 + +LABEL maintainer="Craig Citro " + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- # +# These lines will be edited automatically by parameterized_docker_build.sh. 
# +# COPY _PIP_FILE_ / +# RUN pip --no-cache-dir install /_PIP_FILE_ +# RUN rm -f /_PIP_FILE_ + +# Install TensorFlow CPU version from central repo +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl + +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 new file mode 100644 index 00000000000..c1d0c0ca7ab --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# Install TensorFlow GPU version. 
+RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp27-none-linux_x86_64.whl + + +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 new file mode 100644 index 00000000000..dee6e195717 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \ + build-essential \ + cuda-command-line-tools-9-0 \ + cuda-cublas-9-0 \ + cuda-cufft-9-0 \ + cuda-curand-9-0 \ + cuda-cusolver-9-0 \ + cuda-cusparse-9-0 \ + curl \ + libcudnn7=7.0.5.15-1+cuda9.0 \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# Install TensorFlow GPU version. 
+RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 new file mode 100644 index 00000000000..02a21191013 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM ubuntu:16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- # +# These lines will be edited automatically by parameterized_docker_build.sh. 
# +# COPY _PIP_FILE_ / +# RUN pip --no-cache-dir install /_PIP_FILE_ +# RUN rm -f /_PIP_FILE_ + +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp27-none-linux_x86_64.whl + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz + +# Include models +RUN apt-get update && apt-get install git -y + +RUN mkdir /test +RUN cd /test && git clone https://github.com/dsindex/tensorflow +RUN cd /test && git clone https://github.com/tensorflow/models/ +RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 new file mode 100644 index 00000000000..1a412745278 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM ubuntu:16.04 + +LABEL maintainer="Craig Citro " + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- # +# These lines will be edited automatically by parameterized_docker_build.sh. 
# +# COPY _PIP_FILE_ / +# RUN pip --no-cache-dir install /_PIP_FILE_ +# RUN rm -f /_PIP_FILE_ + +# Install TensorFlow CPU version from central repo +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl + +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz + +# Include models +RUN apt-get update && apt-get install git -y + +RUN mkdir /test +RUN cd /test && git clone https://github.com/dsindex/tensorflow + +# Clone from fixed repo since the existing cifar10 example doesn't work with 1.8.0 +RUN cd /test && git clone -b master-1.7.0 https://github.com/leftnoteasy/models/ +RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 new file mode 100644 index 00000000000..3373098b38e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# Install TensorFlow GPU version. 
+RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp27-none-linux_x86_64.whl + +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz + +# Include models +RUN apt-get update && apt-get install git -y + +RUN mkdir /test +RUN cd /test && git clone https://github.com/dsindex/tensorflow +RUN cd /test && git clone https://github.com/tensorflow/models/ +RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 new file mode 100644 index 00000000000..f4b8eb12afc --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \ + build-essential \ + cuda-command-line-tools-9-0 \ + cuda-cublas-9-0 \ + cuda-cufft-9-0 \ + cuda-curand-9-0 \ + cuda-cusolver-9-0 \ + cuda-cusparse-9-0 \ + curl \ + libcudnn7=7.0.5.15-1+cuda9.0 \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# Install TensorFlow GPU version. 
+RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz + +# Include models +RUN apt-get update && apt-get install git -y + +RUN mkdir /test +RUN cd /test && git clone https://github.com/dsindex/tensorflow + +# Clone from fixed repo since the existing cifar10 example doesn't work with 1.8.0 +RUN cd /test && git clone -b master-1.7.0 https://github.com/leftnoteasy/models/ +RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_distributed_tf_job_spec.json b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_distributed_tf_job_spec.json new file mode 100644 index 00000000000..04e61112cf2 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_distributed_tf_job_spec.json @@ -0,0 +1,56 @@ + +{ + "name": "distributed-tf", + "version": "1.0.0", + "components": [ + { + "name": "master", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 2 + } + } + }, + "launch_command": "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=2", + "number_of_containers": 1 + }, + { + "name": "worker", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 2 + } + } + }, + "launch_command": "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=2", + "number_of_containers": 1 + }, + { + "name": "ps", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "2048" + }, + "launch_command": "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --num-gpus=0", + "number_of_containers": 1 + } + ], + "configuration": { + "properties": {}, + "env": { + } + }, + "kerberos_principal" : { + "principal_name" : "test-user@EXAMPLE.COM", + "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab" + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_standalone_tf_job_spec.json b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_standalone_tf_job_spec.json new file mode 100644 index 00000000000..f928ba8d95f --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_standalone_tf_job_spec.json @@ -0,0 +1,31 @@ + +{ + "name": "distributed-tf", + "version": "1.0.0", + "components": [ + { + "name": "worker", + "dependencies": 
[], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 1 + } + } + }, + "launch_command": "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=1", + "number_of_containers": 1 + } + ], + "configuration": { + "properties": {}, + "env": { + } + }, + "kerberos_principal" : { + "principal_name" : "test-user@EXAMPLE.COM", + "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab" + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/presetup-tf.sh_template b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/presetup-tf.sh_template new file mode 100644 index 00000000000..da4b84dacce --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/presetup-tf.sh_template @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assumptions: +# 1. java is assumed to be installed under /usr/lib/jvm/java-8-openjdk-amd64 +# 2. Using bash shell + +export HADOOP_HDFS_HOME= + +# Intentionally leave HADOOP_HOME and HADOOP_YARN_HOME empty +export HADOOP_HOME= +export HADOOP_YARN_HOME= + +# Setup $HADOOP_CONF_DIR, by default submit_tf_job.py mounts cluster +# Hadoop configs to /etc/hadoop/conf +export HADOOP_CONF_DIR=/etc/hadoop/conf + +export JAVA_HOME= +export CLASSPATH=`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`:$HADOOP_CONF_DIR +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${JAVA_HOME}/jre/lib/amd64/server \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/submit_tf_job.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/submit_tf_job.py new file mode 100644 index 00000000000..9d038e51c55 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/submit_tf_job.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import json +import os + + +def get_component_array(name, count, hostname_suffix): + component = '\\"' + name + '\\":' + component_names = '[' + for i in xrange(0, count): + component_names = component_names + '\\' + '"' + name + "-" + str( + i) + hostname_suffix + '\\"' + if i != count - 1: + component_names = component_names + ',' + component_names = component_names + ']' + return component + component_names + + +def get_key_value_pair(name, keys, values, count): + block_name = '\\"' + name + '\\":' + block_values = '' + if count == 1: + block_values = block_values + '\\' + '"' + values[0] + '\\"' + return block_name + block_values + block_values = '{' + for i in xrange(0, count): + block_values = block_values + '\\' + '"' + keys[i] + '\\"' + ':' + \ + values[i] + if i != count - 1: + block_values = block_values + ',' + block_values = block_values + '}' + return block_name + block_values + + +def handle_distributed_tf_config_env(tf_json, username, domain): + if username is None or username == '': + raise Exception("Empty username specified, please double check") + if domain is None or domain == '': + raise Exception("Empty domain name specified, please double check") + + num_worker = -1 + num_ps = -1 + num_master = -1 + + # Do we need to generate tf_config? First get unique component names + for c in tf_json['components']: + name = c['name'] + if name == 'worker': + num_worker = int(c['number_of_containers']) + elif name == 'ps': + num_ps = int(c['number_of_containers']) + elif name == 'master': + num_master = int(c['number_of_containers']) + + if num_worker < 0 or num_ps < 0 or num_master != 1: + raise Exception( + "Should include worker/ps/master, all with >0 number_of_containers. " + "Master's number_of_containers must be equal to 1") + + hostname_suffix = "." + tf_json[ + 'name'] + "." + username + "." 
+ domain + ":8000" + cluster = '{' + '\\"cluster' + '\\":{' + master = get_component_array("master", 1, hostname_suffix) + "," + ps = get_component_array("ps", num_ps, hostname_suffix) + "," + worker = get_component_array("worker", num_worker, hostname_suffix) + "}," + component_name = '\\"' + "${COMPONENT_NAME}" + '\\"' + component_id = "${COMPONENT_ID}" + task = get_key_value_pair("task", ["type", "index"], + [component_name, component_id], 2) + "," + environment = get_key_value_pair("environment", "", ["cloud"], 1) + "}" + tf_config_op = cluster + master + ps + worker + task + environment + tf_json['configuration']['env']['TF_CONFIG'] = tf_config_op + + +if __name__ == "__main__": + # Instantiate the parser + parser = argparse.ArgumentParser( + description='Submit Tensorflow job to YARN.') + + # Required positional argument + parser.add_argument('--remote_conf_path', type=str, + help='Remote Configuration path to run TF job' + ' should include core-site.xml/hdfs-site.xml' + '/presetup-tf.sh, etc.', + required=True) + parser.add_argument('--input_spec', type=str, + help='Yarnfile specification for TF job.', + required=True) + parser.add_argument('--docker_image', type=str, + help='Docker image name for TF job.', required=False) + parser.add_argument('--env', type=str, + help='Environment variables needed for TF job in' + ' key=value format.', + required=False) + parser.add_argument('--dry_run', action='store_true', + help='When this is not specified (default behavior), ' + 'YARN service will be automatically submitted. ' + 'When this is specified, generated YARN service' + ' spec will be printed to stdout') + parser.add_argument('--job_name', type=str, + help='Specify job name of the Tensorflow job, which ' + 'will overwrite the one specified in input spec ' + 'file', + required=False) + parser.add_argument('--user', type=str, + help='Specify user name if it is different from $USER ' + '(e.g. 
kinit user)',
+                        required=False)
+    parser.add_argument('--domain', type=str,
+                        help='Cluster domain name, which should be same as '
+                             'hadoop.registry.dns.domain-name in yarn-site.xml'
+                             ', required for distributed Tensorflow',
+                        required=False)
+    parser.add_argument('--distributed', action='store_true',
+                        help='Running distributed Tensorflow, if this is '
+                             'specified, worker/ps/master must be included '
+                             'inside input spec')
+    parser.add_argument('--kerberos', action='store_true',
+                        help='Is this a kerberos-enabled cluster or not')
+    parser.add_argument('--verbose', action='store_true',
+                        help='Print debug information')
+    args = parser.parse_args()
+
+    remote_path = args.remote_conf_path
+    input_json_spec = args.input_spec
+    do_dry_run = args.dry_run
+    envs_array = []
+    if hasattr(args, 'env'):
+        envs = args.env
+        if envs is not None:
+            envs_array = envs.split(',')
+    verbose = args.verbose
+    if hasattr(args, 'user'):
+        user = args.user
+        if user is None:
+            user = os.environ['USER']
+    if hasattr(args, 'domain'):
+        domain = args.domain
+    if hasattr(args, 'job_name'):
+        job_name = args.job_name
+    distributed = args.distributed
+    docker_image = None
+    if hasattr(args, 'docker_image'):
+        docker_image = args.docker_image
+    kerberos = args.kerberos
+
+    # Only print when verbose
+    if verbose:
+        print "remote_path=", remote_path
+        print "input_spec_file=", input_json_spec
+        print "do_dry_run=", do_dry_run
+        print "user=", user
+
+    with open(input_json_spec) as json_file:
+        data = json_file.read()
+        tf_json = json.loads(data)
+
+    if job_name is not None:
+        tf_json['name'] = job_name
+
+    # Updating per-component commands with presetup-tf.sh
+    for component in tf_json['components']:
+        # Append presetup-tf.sh to launch command
+        launch_cmd = '. resources/presetup-tf.sh && ' + component[
+            'launch_command']
+        component['launch_command'] = launch_cmd
+
+        if verbose:
+            print "New launch command = ", launch_cmd
+
+        if docker_image is not None and len(docker_image) > 0:
+            component['artifact'] = {}
+            component['artifact']['id'] = docker_image
+            component['artifact']['type'] = "DOCKER"
+
+            if verbose:
+                print "Using docker image=", docker_image
+
+        artifact = component.get('artifact')
+        if artifact is None or artifact.get('id') is None:
+            raise Exception("Docker image for component is not set; please "
+                            "either set it in the input spec or pass "
+                            "--docker_image on the command line")
+
+    if distributed:
+        handle_distributed_tf_config_env(tf_json, user, domain)
+
+    # Update conf files to mount in files section.
+    spec_envs = tf_json['configuration']['env']
+    docker_mounts = ''
+
+    if spec_envs is not None and \
+            spec_envs.get('YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS') is not None:
+        docker_mounts = spec_envs['YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS']
+
+    srcfiles, destfiles = [], []
+    srcfiles.append(remote_path + '/core-site.xml')
+    srcfiles.append(remote_path + '/hdfs-site.xml')
+    srcfiles.append(remote_path + '/presetup-tf.sh')
+    destfiles.append("core-site.xml")
+    destfiles.append("hdfs-site.xml")
+    destfiles.append("presetup-tf.sh")
+
+    if len(docker_mounts) > 0:
+        docker_mounts = docker_mounts + ","
+    docker_mounts = docker_mounts + \
+        "resources/core-site.xml:/etc/hadoop/conf/core-site.xml:ro," \
+        "resources/hdfs-site.xml:/etc/hadoop/conf/hdfs-site.xml:ro"
+
+    if kerberos:
+        srcfiles.append(remote_path + '/krb5.conf')
+        destfiles.append('krb5.conf')
+        docker_mounts = docker_mounts + ",resources/krb5.conf:/etc/krb5.conf:ro"
+
+    docker_mounts = docker_mounts + ",/etc/passwd:/etc/passwd:ro" + \
+        ",/etc/group:/etc/group:ro"
+    file_envs = [{"type": "STATIC", "dest_file": d, "src_file": s} for d, s in
+                 zip(destfiles, srcfiles)]
+    tf_json['configuration']['files'] = file_envs
+
+    envs_array.append('YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS=' + docker_mounts)
+
+    # Fetch all envs passed and update in common configuration section
+    for env in envs_array:
+        if verbose:
+            print "Setting env=", env
+        key_value = env.split('=')
+        tf_json['configuration']['env'][key_value[0]] = key_value[1]
+
+    jstr = json.dumps(tf_json, sort_keys=False, indent=2)
+
+    print ("============= Begin of generated YARN file ==============")
+    print(jstr)
+    print ("============= End of generated YARN file ==============")
+
+    # submit to YARN
+    if do_dry_run:
+        print("Skip submit job to YARN.")
+    else:
+        print("Submitting job to YARN.")
+        filename = "/tmp/tensor-flow-yarn-spec-" + user + "-" + str(
+            time.time()) + ".json"
+        f = open(filename, "w")
+        f.write(jstr)
+        f.close()
+        # Use the (possibly overwritten) service name from the spec, so the
+        # launch also works when --job_name is not passed on the command line
+        cmd = "yarn app -launch " + tf_json['name'] + " " + filename
+        print("Executing '" + cmd + "'")
+        os.system(cmd)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Dockerfile.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Dockerfile.md
new file mode 100644
index 00000000000..768dbd64e39
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Dockerfile.md
@@ -0,0 +1,105 @@
+
+
+# Creating Docker Images for Running Tensorflow on YARN
+
+## How to create docker images to run Tensorflow on YARN
+
+A Dockerfile to run Tensorflow on YARN needs two parts:
+
+**Base libraries which Tensorflow depends on**
+
+1) OS base image, for example ```ubuntu:16.04```
+
+2) Libraries and packages which Tensorflow depends on, for example ```python``` and ```scipy```. For GPU support, ```cuda```, ```cudnn```, etc. are needed as well.
+
+3) Tensorflow package.
+
+**Libraries to access HDFS**
+
+1) JDK
+
+2) Hadoop
+
+Here's an example of a base image (w/o GPU support) to install Tensorflow:
+```
+FROM ubuntu:16.04
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        libfreetype6-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
+
+RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
+        ipykernel \
+        jupyter \
+        matplotlib \
+        numpy \
+        pandas \
+        scipy \
+        sklearn \
+        && \
+    python -m ipykernel.kernelspec
+
+RUN pip --no-cache-dir install \
+    http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
+```
+
+On top of the above image, add files and install packages to access HDFS:
+```
+RUN apt-get update && apt-get install -y openjdk-8-jdk wget
+RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
+RUN tar zxf hadoop-3.1.0.tar.gz
+```
+
+Build and push to your own docker registry: use ```docker build ...``` and ```docker push ...``` to finish this step, as sketched below.
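+
+A minimal sketch of the build-and-push step, assuming an image name ```tf-1.8.0-cpu``` and a private registry at ```<registry-host>:5000``` (both are placeholders for your environment):
+
+```
+# Build the image from one of the provided Dockerfiles
+docker build -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu .
+
+# Tag the image with your registry address, then push it
+docker tag tf-1.8.0-cpu <registry-host>:5000/tf-1.8.0-cpu
+docker push <registry-host>:5000/tf-1.8.0-cpu
+```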
+
+## Use examples to build your own Tensorflow docker images
+
+We provide the following examples for building your own Tensorflow docker images.
+
+For Tensorflow 1.3.0 (precompiled against CUDA 8.x)
+
+- *base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0*: Tensorflow 1.3.0 with CPU support only.
+- *with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0*: Tensorflow 1.3.0 with CPU support only, and models included.
+- *base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0*: Tensorflow 1.3.0 with GPU support, prebuilt against CUDA 8.
+- *with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0*: Tensorflow 1.3.0 with GPU support, prebuilt against CUDA 8, and models included.
+
+For Tensorflow 1.8.0 (precompiled against CUDA 9.x)
+
+- *base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 with CPU support only.
+- *with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 with CPU support only, and models included.
+- *base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0*: Tensorflow 1.8.0 with GPU support, prebuilt against CUDA 9.
+- *with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0*: Tensorflow 1.8.0 with GPU support, prebuilt against CUDA 9, and models included.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Overview.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Overview.md
new file mode 100644
index 00000000000..0decbb0a989
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Overview.md
@@ -0,0 +1,25 @@
+
+
+# Deep Learning Frameworks Running on YARN
+
+## Contents
+The Yarn Service framework provides first-class support for hosting long-running services natively in YARN. This provides an easier way for users to run Deep Learning workloads on top of YARN. This document summarizes the steps and utilities available in YARN to ease the complexities of running Machine Learning workloads. A few such supported Deep Learning frameworks are as follows:
+
+### Running Tensorflow on YARN
+ * [Tutorials: Running Tensorflow on YARN](TensorflowOnYarnTutorial.html)
+ * [How to write Dockerfile to run Tensorflow on YARN](Dockerfile.html)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingHelperScript.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingHelperScript.md
new file mode 100644
index 00000000000..0e50df94f45
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingHelperScript.md
@@ -0,0 +1,239 @@
+
+
+# Run Tensorflow Jobs Using Helper Script
+
+## Prerequisites
+1) Sufficient permissions (authorization and authentication, like `kinit`) are needed for the user (in the case of a secure cluster) to run the `submit_tf_job.py` script, as it also supports submitting the Tensorflow service directly to YARN.
+
+2) Users can use the `--input_spec` argument to specify a sample spec file as a template. This package ships sample spec files (`example_standalone_tf_job_spec.json` and `example_distributed_tf_job_spec.json`) which users can edit to specify the resources per component (such as memory, cpu, gpu, etc.) and the kerberos specification if needed.
+
+## Setup presetup-tf.sh template
+1) Rename `presetup-tf.sh_template` to `presetup-tf.sh`
+
+2) In `presetup-tf.sh`
+
+   - Update to a valid `HADOOP_HDFS_HOME`. This should point to `HADOOP_HDFS_HOME` **inside the docker image**.
+
+   - Update `JAVA_HOME` as per the environment setup. This should point to `JAVA_HOME` **inside the docker image**.
+
+3) Place `presetup-tf.sh` in HDFS under `hdfs://host:port/<remote_conf_path>/`.
+
+4) Ensure that `<remote_conf_path>` is accessible with correct permissions for the user.
+
+5) Upload core-site.xml, hdfs-site.xml to `<remote_conf_path>`.
+
+6) (when security is enabled) Upload krb5.conf to `<remote_conf_path>`.
+
+## Run `submit_tf_job.py` to submit Tensorflow job to YARN
+
+Users can run the command below to submit a Tensorflow job to YARN or to generate a valid Yarnfile for the job.
+
+`python submit_tf_job.py --remote_conf_path <conf path> --input_spec <path to spec> --docker_image <docker image name> --env <key=value> --job_name <job name> --user <user name> --domain <domain name> --distributed --kerberos`
+
+Detailed argument summary for the `submit_tf_job.py` command:
+
+```
+optional arguments:
+  -h, --help            show this help message and exit
+  --remote_conf_path REMOTE_CONF_PATH
+                        Remote Configuration path to run TF job should include
+                        core-site.xml/hdfs-site.xml/presetup-tf.sh, etc.
+  --input_spec INPUT_SPEC
+                        Yarnfile specification for TF job.
+  --docker_image DOCKER_IMAGE
+                        Docker image name for TF job.
+  --env ENV             Environment variables needed for TF job in key=value
+                        format.
+  --dry_run             When this is not specified (default behavior), YARN
+                        service will be automatically submitted. When this is
+                        specified, generated YARN service spec will be
+                        printed to stdout
+  --job_name JOB_NAME   Specify job name of the Tensorflow job, which will
+                        overwrite the one specified in input spec file
+  --user USER           Specify user name if it is different from $USER (e.g.
+                        kinit user)
+  --domain DOMAIN       Cluster domain name, which should be same as
+                        hadoop.registry.dns.domain-name in yarn-site.xml,
+                        required for distributed Tensorflow
+  --distributed         Running distributed Tensorflow, if this is specified,
+                        worker/ps/master must be included inside input spec
+  --kerberos            Is this a kerberos-enabled cluster or not
+  --verbose             Print debug information
+```
+
+Example:
+`python submit_tf_job.py --input_spec example_distributed_tf_job_spec.json --docker_image tf-gpu:ubuntu-xyz --job_name distributed-tf --user ambari-qa --domain <domain name> --remote_conf_path hdfs:///tf-job-conf/configs --distributed`
+
+## Provide `input-spec` file to run Tensorflow jobs
+
+### Run standalone TF job.
+
+```
+{
+  "name": "standalone-tf",
+  "version": "1.0.0",
+  "components": [
+    {
+      "name": "worker",
+      "dependencies": [],
+      "resource": {
+        "cpus": 1,
+        "memory": "4096",
+        "additional" : {
+          "yarn.io/gpu" : {
+            "value" : 1
+          }
+        }
+      },
+      "launch_command": "python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=1 --eval-batch-size=16 --train-batch-size=16 --sync",
+      "number_of_containers": 1
+    }
+  ],
+  "configuration": {
+    "properties": {},
+    "env": {
+    }
+  },
+  "kerberos_principal" : {
+    "principal_name" : "test-user@EXAMPLE.COM",
+    "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab"
+  }
+}
+```
+
+Notes:
+
+- `hdfs://host:port/<remote_conf_path>/presetup-tf.sh` will be automatically downloaded and mounted into the docker container; it will be executed before invoking the `launch_command` of the components specified in the spec.
+- Component name can be customized (in the above example it uses `worker`).
+- In the `resource` section, you can specify cpu/memory/gpu as needed.
+- Additional environment variables can be specified under `env`. These will be passed to the launched docker container process. See the sketch below.
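+
+For instance, a dry run that only prints the generated Yarnfile without submitting it (a sketch; the spec file name, image name and env value below are illustrative):
+
+```
+python submit_tf_job.py \
+  --remote_conf_path hdfs:///tf-job-conf/configs \
+  --input_spec example_standalone_tf_job_spec.json \
+  --docker_image tf-1.8.0-cpu \
+  --env TF_CPP_MIN_LOG_LEVEL=1 \
+  --dry_run
+```
+
+Dropping `--dry_run` writes the generated spec to a temporary file and submits it via `yarn app -launch`.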
+
+### Run distributed TF job.
+
+```
+{
+  "name": "distributed-tf",
+  "version": "1.0.0",
+  "components": [
+    {
+      "name": "master",
+      "dependencies": [],
+      "resource": {
+        "cpus": 1,
+        "memory": "4096",
+        "additional" : {
+          "yarn.io/gpu" : {
+            "value" : 1
+          }
+        }
+      },
+      "launch_command": "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=1 --eval-batch-size=16 --train-batch-size=16 --sync",
+      "number_of_containers": 1
+    },
+    {
+      "name": "worker",
+      "dependencies": [],
+      "resource": {
+        "cpus": 1,
+        "memory": "4096",
+        "additional" : {
+          "yarn.io/gpu" : {
+            "value" : 1
+          }
+        }
+      },
+      "launch_command": "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=1 --eval-batch-size=16 --train-batch-size=16 --sync",
+      "number_of_containers": 1
+    },
+    {
+      "name": "ps",
+      "dependencies": [],
+      "resource": {
+        "cpus": 1,
+        "memory": "2048",
+        "additional" : {
+          "yarn.io/gpu" : {
+            "value" : 1
+          }
+        }
+      },
+      "launch_command": "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --num-gpus=1",
+      "number_of_containers": 1
+    }
+  ],
+  "configuration": {
+    "properties": {},
+    "env": {
+    }
+  },
+  "kerberos_principal" : {
+    "principal_name" : "test-user@EXAMPLE.COM",
+    "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab"
+  }
+}
+```
+
+Notes (in addition to the standalone TF spec):
+
+- For a distributed Tensorflow launch spec, the `master`, `worker` and `ps` components are mandatory.
+- Different values of `number_of_containers` can be specified for `worker` and `ps`.
+- `TF_CONFIG` will be automatically generated and inserted into the spec to launch the job for distributed training; you don't need to worry about this.
+- (Very important) `--distributed` must be specified when running distributed Tensorflow training.
+
+### Additional information about writing a Yarnfile
+
+#### Security
+
+YARN Native Services provide an easy way to run services in a secure cluster by providing keytabs and the kerberos principal as simple json input parameters.
+```
+"kerberos_principal" : {
+  "principal_name" : <principal name>,
+  "keytab" : <keytab location>
+}
+```
+Users can define the kerberos principal name (e.g. test-user@EXAMPLE.COM) and a keytab file path from HDFS or the local file system. Given this information, users can run the service from a secure shell.
+
+#### Choosing correct Docker images for the application
+```
+"artifact" : {
+  "id" : <docker image name>,
+  "type" : "DOCKER"
+}
+```
+Under each service component, users need to provide a docker image name so that the native service will use this image to launch containers.
+Users can use `--docker_image` to override whatever is defined in the input job spec.
+
+## General Guidelines
+
+1) For many env configurations, extra escape characters are used so that the native service can export the env values correctly. An improvement to this is ongoing in YARN-8257.
+
+2) In secure clusters, users need to ensure that the app is launched from a secure shell (e.g. with a proper Kerberos token), as sketched below.
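+
+A minimal sketch of launching from a secure shell (the keytab path and principal below are illustrative placeholders):
+
+```
+# Obtain a Kerberos ticket for the submitting user
+kinit -kt /etc/security/keytabs/tf-user.headless.keytab tf-user@EXAMPLE.COM
+
+# Then submit the job as usual
+python submit_tf_job.py --remote_conf_path hdfs:///tf-job-conf \
+  --input_spec example_distributed_tf_job_spec.json \
+  --docker_image gpu.cuda_9.0.tf_1.8.0 --job_name distributed-tf \
+  --user tf-user --domain tensorflow.site --distributed --kerberos
+```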
+
+## End-to-end example:
+
+### Run Cifar10 distributed Tensorflow training on GPU/security-enabled cluster
+
+#### Launch Command
+```
+python submit_tf_job.py --remote_conf_path hdfs:///tf-job-conf --input_spec example_distributed_tf_job_spec.json --docker_image gpu.cuda_9.0.tf_1.8.0 --job_name distributed-tf-gpu --user tf-user --domain tensorflow.site --distributed --kerberos
+```
+
+- The Dockerfile for `docker_image` can be found under `tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0` in the Hadoop codebase, and we assume a docker image named `gpu.cuda_9.0.tf_1.8.0` has been built from this file.
+- The `input_spec` file can be found under `tensorflow/scripts/example_distributed_tf_job_spec.json` in the Hadoop codebase; make the necessary edits as needed.
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingNativeServiceSpec.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingNativeServiceSpec.md
new file mode 100644
index 00000000000..b95676a0e73
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingNativeServiceSpec.md
@@ -0,0 +1,247 @@
+
+
+## Using raw YARN native service spec to run job
+
+A Yarnfile is a normal JSON file. Typically, you should save the Yarnfile to a local file and use the following command to run it:
+
+```
+yarn app -launch distributed-tf <path to saved Yarnfile>
+```
+
+Or you can use curl to post the Yarnfile:
+
+```
+hadoop fs -rmr /tmp/cifar-10-jobdir;
+yarn application -destroy distributed-tf;
+curl --negotiate -u: -H "Content-Type: application/json" \
+  -X POST http://<RM host>:8088/app/v1/services -d '... content of Yarnfile...'
+```
+
+## Example Yarnfiles
+
+### Generate Dockerfiles
+
+Please refer to [Dockerfile for running Tensorflow on YARN](Dockerfile.html) for more details.
+
+### Single node Tensorflow (with GPU, accessing Kerberized HDFS)
+
+```
+{
+  "name": "single-node-tensorflow",
+  "version": "1.0.0",
+  "components": [
+    {
+      "artifact" : {
+        "id" : <docker image name>,
+        "type" : "DOCKER"
+      },
+      "name": "worker",
+      "dependencies": [],
+      "resource": {
+        "cpus": 1,
+        "memory": "4096",
+        "additional" : {
+          "yarn.io/gpu" : {
+            "value" : 2
+          }
+        }
+      },
+      "launch_command": "export HADOOP_HDFS_HOME=/hadoop-3.1.0; export HADOOP_HOME=; export HADOOP_YARN_HOME=; export HADOOP_CONF_DIR=/etc/hadoop/conf; export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/; export CLASSPATH=\\`\\$HADOOP_HDFS_HOME/bin/hadoop classpath --glob\\`; export LD_LIBRARY_PATH=\\$LD_LIBRARY_PATH:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/; cd /test/models/tutorials/image/cifar10_estimator && ls -l && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --num-gpus=1 --train-batch-size=16 --train-steps=40000",
+      "number_of_containers": 1,
+      "run_privileged_container": false
+    }
+  ],
+  "kerberos_principal" : {
+    "principal_name" : "test-user@EXAMPLE.COM",
+    "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab"
+  }
+}
+```
+
+### Distributed Tensorflow (with CPU, accessing non-Kerberized HDFS)
+
+#### Generate TF_CONFIG
+
+```TF_CONFIG``` is an environment variable which passes training parameters to Tensorflow. It is widely used to run distributed Tensorflow training.
Here's an example of ```TF_CONFIG```:

```
{
  "cluster":{
    "master":[
      "<master-0-host>:<port>"
    ],
    "ps":[
      "<ps-0-host>:<port>",
      "<ps-1-host>:<port>",
      "<ps-2-host>:<port>"
      ...
    ],
    "worker":[
      "<worker-0-host>:<port>",
      "<worker-1-host>:<port>",
      "<worker-2-host>:<port>"
      ...
    ]
  },
  "task":{
    "type": "worker",
    "index": 0
  },
  "environment":"cloud"
}
```

It includes two parts. The first is ```cluster```: a collection of endpoints for all roles of a Tensorflow job. The roles are:

- ```ps```: saves the parameters among all workers. All workers can read/write/update the parameters of the model via the ps. As some models are extremely large, the parameters are shared among the ps (each ps stores a subset).
- ```worker```: does the training.
- ```master```: basically a special worker; it does training, but also restores and saves checkpoints and does evaluation.

The ```cluster``` part is identical for all roles of a given Tensorflow job.

(Description of these roles copied from https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator)

The second part is ```task```, which describes the role of the launched process and therefore differs from process to process. For example, if ```task``` is specified as:
```
  "task":{
    "type": "worker",
    "index": 0
  }
```
the launched instance will use ```<worker-0-host>:<port>``` (entry 0 of the ```worker``` array) as its endpoint.

The following script can be used to generate ```TF_CONFIG```:

```
import sys

# Build the escaped '"<name>":[...]' fragment for one component array.
def get_component_array(name, count, hostname_suffix):
    component = "\\\\" + '\\"' + name + "\\\\" + '\\":'
    component_names = '['
    for i in xrange(0, count):
        component_names = component_names + "\\\\" + '\\' + '"' + name + "-" + str(i) + hostname_suffix + "\\\\" + '\\"'
        if i != count - 1:
            component_names = component_names + ','
    component_names = component_names + ']'
    return component + component_names

# Build the escaped '"<name>":{...}' (or scalar) fragment for a block.
def get_key_value_pair(name, keys, values, count):
    block_name = "\\\\" + '\\"' + name + "\\\\" + '\\":'
    block_values = ''
    if count == 1:
        block_values = block_values + '\\' + "\\\\" + '"' + values[0] + "\\\\" + '\\"'
        return block_name + block_values
    block_values = '{'
    for i in xrange(0, count):
        block_values = block_values + "\\\\" + '\\' + '"' + keys[i] + "\\\\" + '\\"' + ':' + values[i]
        if i != count - 1:
            block_values = block_values + ','
    block_values = block_values + '}'
    return block_name + block_values

# Generate TF_CONFIG from username and domain name. Use this to create an ENV
# variable which could be used as env in a native service spec.
if len(sys.argv) != 6:
    print "Usage: python generate_tf_config.py <username> <domain> <service-name> <num-workers> <num-ps>"
    sys.exit(1)
username = sys.argv[1]
domain = sys.argv[2]
servicename = sys.argv[3]
num_worker = int(sys.argv[4])
num_ps = int(sys.argv[5])
hostname_suffix = "." + servicename + "." + username + "." + domain + ":8000"
cluster = '"{' + "\\\\" + '\\"cluster' + "\\\\" + '\\":{'
master = get_component_array("master", 1, hostname_suffix) + ","
ps = get_component_array("ps", num_ps, hostname_suffix) + ","
worker = get_component_array("worker", num_worker, hostname_suffix) + "},"
component_name = "\\\\" + '\\"' + "${COMPONENT_NAME}" + "\\\\" + '\\"'
component_id = "${COMPONENT_ID}"
task = get_key_value_pair("task", ["type", "index"], [component_name, component_id], 2) + ","
env = get_key_value_pair("environment", "", ["cloud"], 1) + '}"'
print '"{}"'.format("TF_CONFIG"), ":", cluster, master, ps, worker, task, env
```

Running

```
python path/to/saved/python-file <username> example.com distributed-tf 10 3
```

generates ```TF_CONFIG``` for the given user name; the domain name example.com (which must be the same as ```hadoop.registry.dns.domain-name``` in ```yarn-site.xml```); the service name ```distributed-tf```; 10 workers (excluding the master); and 3 parameter servers. The Python script can be tailored to your cluster environment.

#### Yarnfile
```
{
  "name": "distributed-tf",
  "version": "1.0.0",
  "components": [
    {
      "name": "master",
      "dependencies": [],
      "resource": {
        "cpus": 1,
        "memory": "4096"
      },
      "artifact" : {
        "id" : "<docker-image-name>",
        "type" : "DOCKER"
      },
      "launch_command": "export HADOOP_HDFS_HOME=/hadoop-3.1.0; export HADOOP_HOME=; export HADOOP_YARN_HOME=; export CLASSPATH=\\`\\$HADOOP_HDFS_HOME/bin/hadoop classpath --glob\\`; export LD_LIBRARY_PATH=\\$LD_LIBRARY_PATH:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/; cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=0 --eval-batch-size=16 --train-batch-size=16 --sync",
      "number_of_containers": 1,
      "run_privileged_container": false
    },
    {
      "name": "worker",
      "dependencies": [],
      "resource": {
        "cpus": 1,
        "memory": "4096"
      },
      "artifact" : {
        "id" : "<docker-image-name>",
        "type" : "DOCKER"
      },
      "launch_command": "export HADOOP_HDFS_HOME=/hadoop-3.1.0; export HADOOP_HOME=; export HADOOP_YARN_HOME=; export CLASSPATH=\\`\\$HADOOP_HDFS_HOME/bin/hadoop classpath --glob\\`; export LD_LIBRARY_PATH=\\$LD_LIBRARY_PATH:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/; cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=0 --eval-batch-size=16 --train-batch-size=16 --sync",
      "number_of_containers": 1,
      "run_privileged_container": false
    },
    {
      "name": "ps",
      "dependencies": [],
      "resource": {
        "cpus": 1,
        "memory": "2048"
      },
      "artifact" : {
        "id" : "<docker-image-name>",
        "type" : "DOCKER"
      },
      "launch_command": "export HADOOP_HDFS_HOME=/hadoop-3.1.0; export HADOOP_HOME=; export HADOOP_YARN_HOME=; export CLASSPATH=\\`\\$HADOOP_HDFS_HOME/bin/hadoop classpath --glob\\`; export LD_LIBRARY_PATH=\\$LD_LIBRARY_PATH:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/; cd /test/models/tutorials/image/cifar10_estimator && ls -l && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --num-gpus=0",
      "number_of_containers": 1,
      "run_privileged_container": false
    }
  ],
  "configuration": {
    "properties": {},
    "env": {
      "TF_CONFIG" : "<generated-tf-config>",
      "HADOOP_CONF_DIR" : "/etc/hadoop/conf",
      "JAVA_HOME" : "/usr/lib/jvm/java-8-openjdk-amd64/jre/",
      "YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK": "bridge"
    }
  }
}
```
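Once ```TF_CONFIG``` has been generated, it has to be placed into the spec's ```env``` section. The following is a small helper sketch (hypothetical file names, not part of this patch) that splices a generated value into a Yarnfile before submission:

```
import json
import sys

# Hypothetical helper: read a Yarnfile and a file containing the generated
# TF_CONFIG value, and write out a submit-ready spec.
# Usage: python fill_tf_config.py yarnfile.json tf_config.txt out.json
with open(sys.argv[1]) as f:
    spec = json.load(f)
with open(sys.argv[2]) as f:
    tf_config = f.read().strip()
spec.setdefault("configuration", {}).setdefault("env", {})["TF_CONFIG"] = tf_config
with open(sys.argv[3], "w") as f:
    json.dump(spec, f, indent=2)
```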
"bridge" + } + } +} +``` diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/TensorflowOnYarnTutorial.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/TensorflowOnYarnTutorial.md new file mode 100644 index 00000000000..cb07bbda1e1 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/TensorflowOnYarnTutorial.md @@ -0,0 +1,137 @@ + +# Tutorial: Running Tensorflow on YARN + +## Prepare data for training + +CIFAR-10 is a common benchmark in machine learning for image recognition. Below example is based on CIFAR-10 dataset. + +1) Checkout https://github.com/tensorflow/models/: +``` +git clone https://github.com/tensorflow/models/ +``` + +2) Go to `models/tutorials/image/cifar10_estimator` + +3) Generate data by using following command: (required Tensorflow installed) + +``` +python generate_cifar10_tfrecords.py --data-dir=/tmp/cifar-10-data +``` + +4) Upload data to HDFS + +``` +hadoop fs -put /tmp/cifar-10-data/ /tmp/ +``` + +**Please note that:** + +a. All following examples are using ```/tmp/cifar-10-jobdir``` as snapshot directory for training, so suggest to run: +``` +hadoop fs -rmr /tmp/cifar-10-jobdir +``` +to cleanup snapshot between runs. + +b. YARN service doesn't allow multiple services with the same name, so please run following command +``` +yarn application -destroy +``` +to delete services if you want to reuse the same service name. + +## Run Tensorflow jobs + +### Use auto generated Yarnfile and run job + +With the help of custom python scripts, user can run Tensorflow jobs easily by auto creating Yarnfile. + +Please refer to [Run Tensorflow Job Using Helper Script](RunTensorflowJobUsingHelperScript.html) for more details. This is also recommended approach. + +### Use raw YARN service spec to run job + +This is useful if you want to understand things happening behind-the-scene. But this is not recommended for end-user to use. + +Please refer to [Run Tensorflow Job Using Raw Native Service Spec](RunTensorflowJobUsingNativeServiceSpec.html) for more details. + +## Outputs + +Sample output of master: +``` +... +allow_soft_placement: true +, '_tf_random_seed': None, '_task_type': u'master', '_environment': u'cloud', '_is_chief': True, '_cluster_spec': , '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +... +2018-05-06 22:29:14.656022: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> localhost:8000} +2018-05-06 22:29:14.656097: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:29:14.656112: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:29:14.659359: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +... +INFO:tensorflow:Restoring parameters from hdfs://default/tmp/cifar-10-jobdir/model.ckpt-0 +INFO:tensorflow:Evaluation [1/625] +INFO:tensorflow:Evaluation [2/625] +INFO:tensorflow:Evaluation [3/625] +INFO:tensorflow:Evaluation [4/625] +INFO:tensorflow:Evaluation [5/625] +INFO:tensorflow:Evaluation [6/625] +... 
INFO:tensorflow:Validation (step 1): loss = 1220.6445, global_step = 1, accuracy = 0.1
INFO:tensorflow:loss = 6.3980675, step = 0
INFO:tensorflow:loss = 6.3980675, learning_rate = 0.1
INFO:tensorflow:global_step/sec: 2.34092
INFO:tensorflow:Average examples/sec: 1931.22 (1931.22), step = 100
INFO:tensorflow:Average examples/sec: 354.236 (38.6479), step = 110
INFO:tensorflow:Average examples/sec: 211.096 (38.7693), step = 120
INFO:tensorflow:Average examples/sec: 156.533 (38.1633), step = 130
INFO:tensorflow:Average examples/sec: 128.6 (38.7372), step = 140
INFO:tensorflow:Average examples/sec: 111.533 (39.0239), step = 150
```

Sample output of worker:
```
, '_tf_random_seed': None, '_task_type': u'worker', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x...>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
...
2018-05-06 22:28:45.807936: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:28:45.808040: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:28:45.808064: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> localhost:8000}
2018-05-06 22:28:45.809919: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
...
INFO:tensorflow:loss = 5.319096, step = 0
INFO:tensorflow:loss = 5.319096, learning_rate = 0.1
INFO:tensorflow:Average examples/sec: 49.2338 (49.2338), step = 10
INFO:tensorflow:Average examples/sec: 52.117 (55.3589), step = 20
INFO:tensorflow:Average examples/sec: 53.2754 (55.7541), step = 30
INFO:tensorflow:Average examples/sec: 53.8388 (55.6028), step = 40
INFO:tensorflow:Average examples/sec: 54.1082 (55.2134), step = 50
INFO:tensorflow:Average examples/sec: 54.3141 (55.3676), step = 60
```

Sample output of ps:
```
...
, '_tf_random_seed': None, '_task_type': u'ps', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x...>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
...
2018-05-06 22:28:42.562316: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:28:42.562408: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000}
2018-05-06 22:28:42.562433: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:28:42.564242: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
```
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/site.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/site.xml
new file mode 100644
index 00000000000..a52d1638b55
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/site.xml
@@ -0,0 +1,28 @@
+  <skin>
+    <groupId>org.apache.maven.skins</groupId>
+    <artifactId>maven-stylus-skin</artifactId>
+    <version>${maven-stylus-skin.version}</version>
+  </skin>
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml
index 490e9ad5b93..71b2bc3ae7d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml
@@ -37,6 +37,7 @@
     <module>hadoop-yarn-applications-distributedshell</module>
     <module>hadoop-yarn-applications-unmanaged-am-launcher</module>
     <module>hadoop-yarn-services</module>
+    <module>hadoop-yarn-deep-learning-frameworks</module>
-- 
2.14.3 (Apple Git-98)