diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml index 40df7c5e854..859a192bb72 100644 --- a/hadoop-project/src/site/site.xml +++ b/hadoop-project/src/site/site.xml @@ -179,6 +179,11 @@ + + + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/pom.xml new file mode 100644 index 00000000000..3a0580199fc --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/pom.xml @@ -0,0 +1,43 @@ + + + + + hadoop-yarn-applications + org.apache.hadoop + 3.2.0-SNAPSHOT + + 4.0.0 + + hadoop-yarn-deep-learning-frameworks + + + + + org.apache.rat + apache-rat-plugin + + + **/*.json + **/*.yarnfile + **/*.yaml + + + + + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 new file mode 100644 index 00000000000..3d19bf92509 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM ubuntu:16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- # +# These lines will be edited automatically by parameterized_docker_build.sh. 
# +# COPY _PIP_FILE_ / +# RUN pip --no-cache-dir install /_PIP_FILE_ +# RUN rm -f /_PIP_FILE_ + +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp27-none-linux_x86_64.whl + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 new file mode 100644 index 00000000000..db144da7fc7 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM ubuntu:16.04 + +LABEL maintainer="Craig Citro " + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- # +# These lines will be edited automatically by parameterized_docker_build.sh. 
# +# COPY _PIP_FILE_ / +# RUN pip --no-cache-dir install /_PIP_FILE_ +# RUN rm -f /_PIP_FILE_ + +# Install TensorFlow CPU version from central repo +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl + +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 new file mode 100644 index 00000000000..c1d0c0ca7ab --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# Install TensorFlow GPU version. 
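+# (Note: this TF 1.3.0 GPU wheel is built against CUDA 8.0 / cuDNN 6, which the
+# nvidia/cuda:8.0-cudnn6 base image above provides.)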
+RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp27-none-linux_x86_64.whl + + +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 new file mode 100644 index 00000000000..dee6e195717 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \ + build-essential \ + cuda-command-line-tools-9-0 \ + cuda-cublas-9-0 \ + cuda-cufft-9-0 \ + cuda-curand-9-0 \ + cuda-cusolver-9-0 \ + cuda-cusparse-9-0 \ + curl \ + libcudnn7=7.0.5.15-1+cuda9.0 \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +# Install TensorFlow GPU version. 
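+# (Note: this TF 1.8.0 GPU wheel is built against CUDA 9.0 / cuDNN 7, matching
+# the nvidia/cuda:9.0-cudnn7 base image and the libcudnn7 package pinned above.)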
+RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl +RUN apt-get update && apt-get install git -y + +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 new file mode 100644 index 00000000000..293f81b2250 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0 @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM + +# Include models +RUN mkdir /test +ADD cifar10_estimator_tf_1.6.0 /test/cifar10_estimator +RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 new file mode 100644 index 00000000000..977fcf336ab --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM + +# Include models +RUN mkdir /test +ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator +RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 new file mode 100644 index 00000000000..3ee856efb38 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0 @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM + +# Include models +RUN mkdir /test +ADD cifar10_estimator_tf_1.6.0 /test/cifar10_estimator +RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 new file mode 100644 index 00000000000..506faba8bfa --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0 @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM + +# Include models +RUN mkdir /test +ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator +RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/README.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/README.md new file mode 100644 index 00000000000..66b4bc2d39e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/README.md @@ -0,0 +1,542 @@ + + +(Copied from https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator) + +CIFAR-10 is a common benchmark in machine learning for image recognition. + +http://www.cs.toronto.edu/~kriz/cifar.html + +Code in this directory focuses on how to use TensorFlow Estimators to train and +evaluate a CIFAR-10 ResNet model on: + +* A single host with one CPU; +* A single host with multiple GPUs; +* Multiple hosts with CPU or multiple GPUs; + +Before trying to run the model we highly encourage you to read all the README. + +## Prerequisite + +1. [Install](https://www.tensorflow.org/install/) TensorFlow version 1.2.1 or +later. + +2. Download the CIFAR-10 dataset and generate TFRecord files using the provided +script. The script and associated command below will download the CIFAR-10 +dataset and then generate a TFRecord for the training, validation, and +evaluation datasets. + +```shell +python generate_cifar10_tfrecords.py --data-dir=${PWD}/cifar-10-data +``` + +After running the command above, you should see the following files in the +--data-dir (```ls -R cifar-10-data```): + +* train.tfrecords +* validation.tfrecords +* eval.tfrecords + + +## Training on a single machine with GPUs or CPU + +Run the training on CPU only. After training, it runs the evaluation. + +``` +python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ + --job-dir=/tmp/cifar10 \ + --num-gpus=0 \ + --train-steps=1000 +``` + +Run the model on 2 GPUs using CPU as parameter server. After training, it runs +the evaluation. +``` +python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ + --job-dir=/tmp/cifar10 \ + --num-gpus=2 \ + --train-steps=1000 +``` + +Run the model on 2 GPUs using GPU as parameter server. +It will run an experiment, which for local setting basically means it will run +stop training +a couple of times to perform evaluation. + +``` +python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ + --job-dir=/tmp/cifar10 \ + --variable-strategy GPU \ + --num-gpus=2 \ +``` + +There are more command line flags to play with; run +`python cifar10_main.py --help` for details. + +## Run distributed training + +### (Optional) Running on Google Cloud Machine Learning Engine + +This example can be run on Google Cloud Machine Learning Engine (ML Engine), +which will configure the environment and take care of running workers, +parameters servers, and masters in a fault tolerant way. + +To install the command line tool, and set up a project and billing, see the +quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line). + +You'll also need a Google Cloud Storage bucket for the data. 
If you followed the
+instructions above, you can just run:
+
+```
+MY_BUCKET=gs://
+gsutil cp -r ${PWD}/cifar-10-data $MY_BUCKET/
+```
+
+Then run the following command from the `tutorials/image` directory of this
+repository (the parent directory of this README):
+
+```
+gcloud ml-engine jobs submit training cifarmultigpu \
+    --runtime-version 1.2 \
+    --job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \
+    --config cifar10_estimator/cmle_config.yaml \
+    --package-path cifar10_estimator/ \
+    --module-name cifar10_estimator.cifar10_main \
+    -- \
+    --data-dir=$MY_BUCKET/cifar-10-data \
+    --num-gpus=4 \
+    --train-steps=1000
+```
+
+
+### Set TF_CONFIG
+
+Considering that you already have multiple hosts configured, all you need is a
+`TF_CONFIG` environment variable on each host. You can set up the hosts manually
+or check [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) for
+instructions on how to set up a cluster.
+
+The `TF_CONFIG` will be used by the `RunConfig` to know the existing hosts and
+their task: `master`, `ps` or `worker`.
+
+Here's an example of `TF_CONFIG`.
+
+```python
+cluster = {'master': ['master-ip:8000'],
+           'ps': ['ps-ip:8000'],
+           'worker': ['worker-ip:8000']}
+
+TF_CONFIG = json.dumps(
+  {'cluster': cluster,
+   'task': {'type': 'master', 'index': 0},
+   'model_dir': 'gs:///',
+   'environment': 'cloud'
+  })
+```
+
+*Cluster*
+
+A cluster spec, which is basically a dictionary that describes all of the tasks
+in the cluster. More about it [here](https://www.tensorflow.org/deploy/distributed).
+
+In this cluster spec we are defining a cluster with 1 master, 1 ps and 1 worker.
+
+* `ps`: saves the parameters among all workers. All workers can
+  read/write/update the parameters for the model via the ps. As some models are
+  extremely large, the parameters are shared among the ps nodes (each ps stores
+  a subset).
+
+* `worker`: does the training.
+
+* `master`: basically a special worker; it does training, but also restores and
+  saves checkpoints and does evaluation.
+
+*Task*
+
+The task defines the role of the current node. In this example the node is the
+master at index 0 in the cluster spec; the task will be different for each node.
+An example of the `TF_CONFIG` for a worker would be:
+
+```python
+cluster = {'master': ['master-ip:8000'],
+           'ps': ['ps-ip:8000'],
+           'worker': ['worker-ip:8000']}
+
+TF_CONFIG = json.dumps(
+  {'cluster': cluster,
+   'task': {'type': 'worker', 'index': 0},
+   'model_dir': 'gs:///',
+   'environment': 'cloud'
+  })
+```
+
+*Model_dir*
+
+This is the path where the master will save the checkpoints, graph and
+TensorBoard files. For a multi-host environment you may want to use a
+distributed file system; Google Storage and DFS are supported.
+
+*Environment*
+
+By default the environment is *local*; for a distributed setting we need to
+change it to *cloud*.
+
+### Running script
+
+Once you have a `TF_CONFIG` configured properly on each host, you're ready to
+run in a distributed setting.
+
+#### Master
+Run this on the master:
+Runs an Experiment in sync mode on 4 GPUs using CPU as the parameter server for
+40000 steps. It will run evaluation a couple of times during training. The
+num_workers argument is used only to update the learning rate correctly. Make
+sure the model_dir is the same as defined in the TF_CONFIG.
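+As a sanity check before launching, you can export the master's `TF_CONFIG`
+along these lines (a minimal sketch, not part of the original tutorial; the
+host:port pairs and bucket path are the same placeholders used above):
+
+```python
+import json
+import os
+
+# Hypothetical cluster layout; substitute your real host:port pairs.
+cluster = {'master': ['master-ip:8000'],
+           'ps': ['ps-ip:8000'],
+           'worker': ['worker-ip:8000']}
+
+os.environ['TF_CONFIG'] = json.dumps({
+    'cluster': cluster,
+    # This host is the master (index 0 of the 'master' list above).
+    'task': {'type': 'master', 'index': 0},
+    # Must match the --job-dir passed to cifar10_main.py below.
+    'model_dir': 'gs://path/model_dir/',
+    'environment': 'cloud',
+})
+```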
+ +```shell +python cifar10_main.py --data-dir=gs://path/cifar-10-data \ + --job-dir=gs://path/model_dir/ \ + --num-gpus=4 \ + --train-steps=40000 \ + --sync \ + --num-workers=2 +``` + +*Output:* + +```shell +INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ +INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'master', '_is_chief': True, '_cluster_spec': , '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1 +gpu_options { +} +allow_soft_placement: true +, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'} +... +2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: +name: Tesla K80 +major: 3 minor: 7 memoryClockRate (GHz) 0.8235 +pciBusID 0000:00:04.0 +Total memory: 11.17GiB +Free memory: 11.09GiB +2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: +name: Tesla K80 +major: 3 minor: 7 memoryClockRate (GHz) 0.8235 +pciBusID 0000:00:05.0 +Total memory: 11.17GiB +Free memory: 11.10GiB +... +2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) +INFO:tensorflow:image after 
unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) +INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) +INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=1; total_num_replicas=1 +INFO:tensorflow:Create CheckpointSaverHook. +INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-0 +2017-08-01 19:59:37.560775: I tensorflow/core/distributed_runtime/master_session.cc:999] Start master session 156fcb55fe6648d6 with config: +intra_op_parallelism_threads: 1 +gpu_options { + per_process_gpu_memory_fraction: 1 +} +allow_soft_placement: true + +INFO:tensorflow:Saving checkpoints for 1 into gs://path/model_dir/model.ckpt. +INFO:tensorflow:loss = 1.20682, step = 1 +INFO:tensorflow:loss = 1.20682, learning_rate = 0.1 +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) +INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) +INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2 
+INFO:tensorflow:Starting evaluation at 2017-08-01-20:00:14 +2017-08-01 20:00:15.745881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0) +2017-08-01 20:00:15.745949: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:00:05.0) +2017-08-01 20:00:15.745958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K80, pci bus id: 0000:00:06.0) +2017-08-01 20:00:15.745964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K80, pci bus id: 0000:00:07.0) +2017-08-01 20:00:15.745969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:4) -> (device: 4, name: Tesla K80, pci bus id: 0000:00:08.0) +2017-08-01 20:00:15.745975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:5) -> (device: 5, name: Tesla K80, pci bus id: 0000:00:09.0) +2017-08-01 20:00:15.745987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:6) -> (device: 6, name: Tesla K80, pci bus id: 0000:00:0a.0) +2017-08-01 20:00:15.745997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:7) -> (device: 7, name: Tesla K80, pci bus id: 0000:00:0b.0) +INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-10023 +INFO:tensorflow:Evaluation [1/100] +INFO:tensorflow:Evaluation [2/100] +INFO:tensorflow:Evaluation [3/100] +INFO:tensorflow:Evaluation [4/100] +INFO:tensorflow:Evaluation [5/100] +INFO:tensorflow:Evaluation [6/100] +INFO:tensorflow:Evaluation [7/100] +INFO:tensorflow:Evaluation [8/100] +INFO:tensorflow:Evaluation [9/100] +INFO:tensorflow:Evaluation [10/100] +INFO:tensorflow:Evaluation [11/100] +INFO:tensorflow:Evaluation [12/100] +INFO:tensorflow:Evaluation [13/100] +... +INFO:tensorflow:Evaluation [100/100] +INFO:tensorflow:Finished evaluation at 2017-08-01-20:00:31 +INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step = 1, loss = 630.425 +``` + +#### Worker + +Run this on worker: +Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for +40000 steps. It will run evaluation a couple of times during training. Make sure +the model_dir is the same as defined on the TF_CONFIG. + +```shell +python cifar10_main.py --data-dir=gs://path/cifar-10-data \ + --job-dir=gs://path/model_dir/ \ + --num-gpus=4 \ + --train-steps=40000 \ + --sync +``` + +*Output:* + +```shell +INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ +INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, +'_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'worker', +'_is_chief': False, '_cluster_spec': +, +'_model_dir': 'gs:///model_dir/', +'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, +'_session_config': intra_op_parallelism_threads: 1 +gpu_options { +} +allow_soft_placement: true +, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, +'_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 + } +... 
+2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: +name: Tesla K80 +major: 3 minor: 7 memoryClockRate (GHz) 0.8235 +pciBusID 0000:00:04.0 +Total memory: 11.17GiB +Free memory: 11.09GiB +2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: +name: Tesla K80 +major: 3 minor: 7 memoryClockRate (GHz) 0.8235 +pciBusID 0000:00:05.0 +Total memory: 11.17GiB +Free memory: 11.10GiB +... +2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) +INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) +INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2 +INFO:tensorflow:Create CheckpointSaverHook. 
+2017-07-31 22:38:04.629150: I +tensorflow/core/distributed_runtime/master.cc:209] CreateSession still waiting +for response from worker: /job:master/replica:0/task:0 +2017-07-31 22:38:09.263492: I +tensorflow/core/distributed_runtime/master_session.cc:999] Start master +session cc58f93b1e259b0c with config: +intra_op_parallelism_threads: 1 +gpu_options { +per_process_gpu_memory_fraction: 1 +} +allow_soft_placement: true +INFO:tensorflow:loss = 5.82382, step = 0 +INFO:tensorflow:loss = 5.82382, learning_rate = 0.8 +INFO:tensorflow:Average examples/sec: 1116.92 (1116.92), step = 10 +INFO:tensorflow:Average examples/sec: 1233.73 (1377.83), step = 20 +INFO:tensorflow:Average examples/sec: 1485.43 (2509.3), step = 30 +INFO:tensorflow:Average examples/sec: 1680.27 (2770.39), step = 40 +INFO:tensorflow:Average examples/sec: 1825.38 (2788.78), step = 50 +INFO:tensorflow:Average examples/sec: 1929.32 (2697.27), step = 60 +INFO:tensorflow:Average examples/sec: 2015.17 (2749.05), step = 70 +INFO:tensorflow:loss = 37.6272, step = 79 (19.554 sec) +INFO:tensorflow:loss = 37.6272, learning_rate = 0.8 (19.554 sec) +INFO:tensorflow:Average examples/sec: 2074.92 (2618.36), step = 80 +INFO:tensorflow:Average examples/sec: 2132.71 (2744.13), step = 90 +INFO:tensorflow:Average examples/sec: 2183.38 (2777.21), step = 100 +INFO:tensorflow:Average examples/sec: 2224.4 (2739.03), step = 110 +INFO:tensorflow:Average examples/sec: 2240.28 (2431.26), step = 120 +INFO:tensorflow:Average examples/sec: 2272.12 (2739.32), step = 130 +INFO:tensorflow:Average examples/sec: 2300.68 (2750.03), step = 140 +INFO:tensorflow:Average examples/sec: 2325.81 (2745.63), step = 150 +INFO:tensorflow:Average examples/sec: 2347.14 (2721.53), step = 160 +INFO:tensorflow:Average examples/sec: 2367.74 (2754.54), step = 170 +INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec) +... 
+``` + +#### PS + +Run this on ps: +The ps will not do training so most of the arguments won't affect the execution + +```shell +python cifar10_main.py --job-dir=gs://path/model_dir/ +``` + +*Output:* + +```shell +INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ +INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'ps', '_is_chief': False, '_cluster_spec': , '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1 +gpu_options { +} +allow_soft_placement: true +, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'} +2017-07-31 22:54:58.928088: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-ip:8000} +2017-07-31 22:54:58.928153: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000} +2017-07-31 22:54:58.928160: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-ip:8000} +2017-07-31 22:54:58.929873: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +``` + +## Visualizing results with TensorBoard + +When using Estimators you can also visualize your data in TensorBoard, with no +changes in your code. You can use TensorBoard to visualize your TensorFlow +graph, plot quantitative metrics about the execution of your graph, and show +additional data like images that pass through it. + +You'll see something similar to this if you "point" TensorBoard to the +`job dir` parameter you used to train or evaluate your model. + +Check TensorBoard during training or after it. Just point TensorBoard to the +model_dir you chose on the previous step. 
+ +```shell +tensorboard --log-dir="" +``` + +## Warnings + +When runninng `cifar10_main.py` with `--sync` argument you may see an error +similar to: + +```python +File "cifar10_main.py", line 538, in + tf.app.run() +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run + _sys.exit(main(_sys.argv[:1] + flags_passthrough)) +File "cifar10_main.py", line 518, in main + hooks), run_config=config) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 210, in run + return _execute_schedule(experiment, schedule) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 47, in _execute_schedule + return task() +File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 501, in train_and_evaluate + hooks=self._eval_hooks) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 681, in _call_evaluate + hooks=hooks) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 292, in evaluate + name=name) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 638, in _evaluate_model + features, labels, model_fn_lib.ModeKeys.EVAL) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn + features=features, labels=labels, **kwargs) +File "cifar10_main.py", line 331, in _resnet_model_fn + gradvars, global_step=tf.train.get_global_step()) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 252, in apply_gradients + variables.global_variables()) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 170, in wrapped + return _add_should_use_warning(fn(*args, **kwargs)) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 139, in _add_should_use_warning + wrapped = TFShouldUseWarningWrapper(x) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 96, in __init__ + stack = [s.strip() for s in traceback.format_stack()] +``` + +This should not affect your training, and should be fixed on the next releases. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10.py new file mode 100644 index 00000000000..5b9c267b351 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10.py @@ -0,0 +1,113 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""CIFAR-10 data set. + +See http://www.cs.toronto.edu/~kriz/cifar.html. +""" +import os + +import tensorflow as tf + +HEIGHT = 32 +WIDTH = 32 +DEPTH = 3 + + +class Cifar10DataSet(object): + """Cifar10 data set. + + Described by http://www.cs.toronto.edu/~kriz/cifar.html. + """ + + def __init__(self, data_dir, subset='train', use_distortion=True): + self.data_dir = data_dir + self.subset = subset + self.use_distortion = use_distortion + + def get_filenames(self): + if self.subset in ['train', 'validation', 'eval']: + return [os.path.join(self.data_dir, self.subset + '.tfrecords')] + else: + raise ValueError('Invalid data subset "%s"' % self.subset) + + def parser(self, serialized_example): + """Parses a single tf.Example into image and label tensors.""" + # Dimensions of the images in the CIFAR-10 dataset. + # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the + # input format. + features = tf.parse_single_example( + serialized_example, + features={ + 'image': tf.FixedLenFeature([], tf.string), + 'label': tf.FixedLenFeature([], tf.int64), + }) + image = tf.decode_raw(features['image'], tf.uint8) + image.set_shape([DEPTH * HEIGHT * WIDTH]) + + # Reshape from [depth * height * width] to [depth, height, width]. + image = tf.cast( + tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), + tf.float32) + label = tf.cast(features['label'], tf.int32) + + # Custom preprocessing. + image = self.preprocess(image) + + return image, label + + def make_batch(self, batch_size): + """Read the images and labels from 'filenames'.""" + filenames = self.get_filenames() + # Repeat infinitely. + dataset = tf.contrib.data.TFRecordDataset(filenames).repeat() + + # Parse records. + dataset = dataset.map( + self.parser, num_threads=batch_size, output_buffer_size=2 * batch_size) + + # Potentially shuffle records. + if self.subset == 'train': + min_queue_examples = int( + Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4) + # Ensure that the capacity is sufficiently large to provide good random + # shuffling. + dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size) + + # Batch it up. 
+ dataset = dataset.batch(batch_size) + iterator = dataset.make_one_shot_iterator() + image_batch, label_batch = iterator.get_next() + + return image_batch, label_batch + + def preprocess(self, image): + """Preprocess a single image in [height, width, depth] layout.""" + if self.subset == 'train' and self.use_distortion: + # Pad 4 pixels on each dimension of feature map, done in mini-batch + image = tf.image.resize_image_with_crop_or_pad(image, 40, 40) + image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) + image = tf.image.random_flip_left_right(image) + return image + + @staticmethod + def num_examples_per_epoch(subset='train'): + if subset == 'train': + return 45000 + elif subset == 'validation': + return 5000 + elif subset == 'eval': + return 10000 + else: + raise ValueError('Invalid data subset "%s"' % subset) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_main.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_main.py new file mode 100644 index 00000000000..51da6b94fa2 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_main.py @@ -0,0 +1,521 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""ResNet model for classifying images from CIFAR-10 dataset. + +Support single-host training with one or multiple devices. + +ResNet as proposed in: +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun +Deep Residual Learning for Image Recognition. arXiv:1512.03385 + +CIFAR-10 as in: +http://www.cs.toronto.edu/~kriz/cifar.html + + +""" +from __future__ import division +from __future__ import print_function + +import argparse +import functools +import itertools +import os + +import cifar10 +import cifar10_model +import cifar10_utils +import numpy as np +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +tf.logging.set_verbosity(tf.logging.INFO) + + +def get_model_fn(num_gpus, variable_strategy, num_workers): + """Returns a function that will build the resnet model.""" + + def _resnet_model_fn(features, labels, mode, params): + """Resnet model body. + + Support single host, one or more GPU training. Parameter distribution can + be either one of the following scheme. + 1. CPU is the parameter server and manages gradient updates. + 2. Parameters are distributed evenly across all GPUs, and the first GPU + manages gradient updates. 
+ + Args: + features: a list of tensors, one for each tower + labels: a list of tensors, one for each tower + mode: ModeKeys.TRAIN or EVAL + params: Hyperparameters suitable for tuning + Returns: + A EstimatorSpec object. + """ + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + weight_decay = params.weight_decay + momentum = params.momentum + + tower_features = features + tower_labels = labels + tower_losses = [] + tower_gradvars = [] + tower_preds = [] + + # channels first (NCHW) is normally optimal on GPU and channels last (NHWC) + # on CPU. The exception is Intel MKL on CPU which is optimal with + # channels_last. + data_format = params.data_format + if not data_format: + if num_gpus == 0: + data_format = 'channels_last' + else: + data_format = 'channels_first' + + if num_gpus == 0: + num_devices = 1 + device_type = 'cpu' + else: + num_devices = num_gpus + device_type = 'gpu' + + for i in range(num_devices): + worker_device = '/{}:{}'.format(device_type, i) + if variable_strategy == 'CPU': + device_setter = cifar10_utils.local_device_setter( + worker_device=worker_device) + elif variable_strategy == 'GPU': + device_setter = cifar10_utils.local_device_setter( + ps_device_type='gpu', + worker_device=worker_device, + ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy( + num_gpus, tf.contrib.training.byte_size_load_fn)) + with tf.variable_scope('resnet', reuse=bool(i != 0)): + with tf.name_scope('tower_%d' % i) as name_scope: + with tf.device(device_setter): + loss, gradvars, preds = _tower_fn( + is_training, weight_decay, tower_features[i], tower_labels[i], + data_format, params.num_layers, params.batch_norm_decay, + params.batch_norm_epsilon) + tower_losses.append(loss) + tower_gradvars.append(gradvars) + tower_preds.append(preds) + if i == 0: + # Only trigger batch_norm moving mean and variance update from + # the 1st tower. Ideally, we should grab the updates from all + # towers but these stats accumulate extremely fast so we can + # ignore the other stats from the other towers without + # significant detriment. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, + name_scope) + + # Now compute global loss and gradients. + gradvars = [] + with tf.name_scope('gradient_averaging'): + all_grads = {} + for grad, var in itertools.chain(*tower_gradvars): + if grad is not None: + all_grads.setdefault(var, []).append(grad) + for var, grads in six.iteritems(all_grads): + # Average gradients on the same device as the variables + # to which they apply. + with tf.device(var.device): + if len(grads) == 1: + avg_grad = grads[0] + else: + avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads)) + gradvars.append((avg_grad, var)) + + # Device that runs the ops to apply global gradient updates. 
+ consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0' + with tf.device(consolidation_device): + # Suggested learning rate scheduling from + # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155 + num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch( + 'train') // (params.train_batch_size * num_workers) + boundaries = [ + num_batches_per_epoch * x + for x in np.array([82, 123, 300], dtype=np.int64) + ] + staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]] + + learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(), + boundaries, staged_lr) + + loss = tf.reduce_mean(tower_losses, name='loss') + + examples_sec_hook = cifar10_utils.ExamplesPerSecondHook( + params.train_batch_size, every_n_steps=10) + + tensors_to_log = {'learning_rate': learning_rate, 'loss': loss} + + logging_hook = tf.train.LoggingTensorHook( + tensors=tensors_to_log, every_n_iter=100) + + train_hooks = [logging_hook, examples_sec_hook] + + optimizer = tf.train.MomentumOptimizer( + learning_rate=learning_rate, momentum=momentum) + + if params.sync: + optimizer = tf.train.SyncReplicasOptimizer( + optimizer, replicas_to_aggregate=num_workers) + sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief) + train_hooks.append(sync_replicas_hook) + + # Create single grouped train op + train_op = [ + optimizer.apply_gradients( + gradvars, global_step=tf.train.get_global_step()) + ] + train_op.extend(update_ops) + train_op = tf.group(*train_op) + + predictions = { + 'classes': + tf.concat([p['classes'] for p in tower_preds], axis=0), + 'probabilities': + tf.concat([p['probabilities'] for p in tower_preds], axis=0) + } + stacked_labels = tf.concat(labels, axis=0) + metrics = { + 'accuracy': + tf.metrics.accuracy(stacked_labels, predictions['classes']) + } + + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + loss=loss, + train_op=train_op, + training_hooks=train_hooks, + eval_metric_ops=metrics) + + return _resnet_model_fn + + +def _tower_fn(is_training, weight_decay, feature, label, data_format, + num_layers, batch_norm_decay, batch_norm_epsilon): + """Build computation tower (Resnet). + + Args: + is_training: true if is training graph. + weight_decay: weight regularization strength, a float. + feature: a Tensor. + label: a Tensor. + data_format: channels_last (NHWC) or channels_first (NCHW). + num_layers: number of layers, an int. + batch_norm_decay: decay for batch normalization, a float. + batch_norm_epsilon: epsilon for batch normalization, a float. + + Returns: + A tuple with the loss for the tower, the gradients and parameters, and + predictions. 
+ + """ + model = cifar10_model.ResNetCifar10( + num_layers, + batch_norm_decay=batch_norm_decay, + batch_norm_epsilon=batch_norm_epsilon, + is_training=is_training, + data_format=data_format) + logits = model.forward_pass(feature, input_data_format='channels_last') + tower_pred = { + 'classes': tf.argmax(input=logits, axis=1), + 'probabilities': tf.nn.softmax(logits) + } + + tower_loss = tf.losses.sparse_softmax_cross_entropy( + logits=logits, labels=label) + tower_loss = tf.reduce_mean(tower_loss) + + model_params = tf.trainable_variables() + tower_loss += weight_decay * tf.add_n( + [tf.nn.l2_loss(v) for v in model_params]) + + tower_grad = tf.gradients(tower_loss, model_params) + + return tower_loss, zip(tower_grad, model_params), tower_pred + + +def input_fn(data_dir, + subset, + num_shards, + batch_size, + use_distortion_for_training=True): + """Create input graph for model. + + Args: + data_dir: Directory where TFRecords representing the dataset are located. + subset: one of 'train', 'validate' and 'eval'. + num_shards: num of towers participating in data-parallel training. + batch_size: total batch size for training to be divided by the number of + shards. + use_distortion_for_training: True to use distortions. + Returns: + two lists of tensors for features and labels, each of num_shards length. + """ + with tf.device('/cpu:0'): + use_distortion = subset == 'train' and use_distortion_for_training + dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion) + image_batch, label_batch = dataset.make_batch(batch_size) + if num_shards <= 1: + # No GPU available or only 1 GPU. + return [image_batch], [label_batch] + + # Note that passing num=batch_size is safe here, even though + # dataset.batch(batch_size) can, in some cases, return fewer than batch_size + # examples. This is because it does so only when repeating for a limited + # number of epochs, but our dataset repeats forever. + image_batch = tf.unstack(image_batch, num=batch_size, axis=0) + label_batch = tf.unstack(label_batch, num=batch_size, axis=0) + feature_shards = [[] for i in range(num_shards)] + label_shards = [[] for i in range(num_shards)] + for i in xrange(batch_size): + idx = i % num_shards + feature_shards[idx].append(image_batch[i]) + label_shards[idx].append(label_batch[i]) + feature_shards = [tf.parallel_stack(x) for x in feature_shards] + label_shards = [tf.parallel_stack(x) for x in label_shards] + return feature_shards, label_shards + + +def get_experiment_fn(data_dir, + num_gpus, + variable_strategy, + use_distortion_for_training=True): + """Returns an Experiment function. + + Experiments perform training on several workers in parallel, + in other words experiments know how to invoke train and eval in a sensible + fashion for distributed training. Arguments passed directly to this + function are not tunable, all other arguments should be passed within + tf.HParams, passed to the enclosed function. + + Args: + data_dir: str. Location of the data for input_fns. + num_gpus: int. Number of GPUs on each worker. + variable_strategy: String. CPU to use CPU as the parameter server + and GPU to use the GPUs as the parameter server. + use_distortion_for_training: bool. See cifar10.Cifar10DataSet. + Returns: + A function (tf.estimator.RunConfig, tf.contrib.training.HParams) -> + tf.contrib.learn.Experiment. + + Suitable for use by tf.contrib.learn.learn_runner, which will run various + methods on Experiment (train, evaluate) based on information + about the current runner in `run_config`. 
+ """ + + def _experiment_fn(run_config, hparams): + """Returns an Experiment.""" + # Create estimator. + train_input_fn = functools.partial( + input_fn, + data_dir, + subset='train', + num_shards=num_gpus, + batch_size=hparams.train_batch_size, + use_distortion_for_training=use_distortion_for_training) + + eval_input_fn = functools.partial( + input_fn, + data_dir, + subset='eval', + batch_size=hparams.eval_batch_size, + num_shards=num_gpus) + + num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval') + if num_eval_examples % hparams.eval_batch_size != 0: + raise ValueError( + 'validation set size must be multiple of eval_batch_size') + + train_steps = hparams.train_steps + eval_steps = num_eval_examples // hparams.eval_batch_size + + classifier = tf.estimator.Estimator( + model_fn=get_model_fn(num_gpus, variable_strategy, + run_config.num_worker_replicas or 1), + config=run_config, + params=hparams) + + # Create experiment. + return tf.contrib.learn.Experiment( + classifier, + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + train_steps=train_steps, + eval_steps=eval_steps) + + return _experiment_fn + + +def main(job_dir, data_dir, num_gpus, variable_strategy, + use_distortion_for_training, log_device_placement, num_intra_threads, + **hparams): + # The env variable is on deprecation path, default is set to off. + os.environ['TF_SYNC_ON_FINISH'] = '0' + os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' + + # Session configuration. + sess_config = tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=log_device_placement, + intra_op_parallelism_threads=num_intra_threads, + gpu_options=tf.GPUOptions(force_gpu_compatible=True)) + + config = cifar10_utils.RunConfig( + session_config=sess_config, model_dir=job_dir) + tf.contrib.learn.learn_runner.run( + get_experiment_fn(data_dir, num_gpus, variable_strategy, + use_distortion_for_training), + run_config=config, + hparams=tf.contrib.training.HParams( + is_chief=config.is_chief, + **hparams)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data-dir', + type=str, + required=True, + help='The directory where the CIFAR-10 input data is stored.') + parser.add_argument( + '--job-dir', + type=str, + required=True, + help='The directory where the model will be stored.') + parser.add_argument( + '--variable-strategy', + choices=['CPU', 'GPU'], + type=str, + default='CPU', + help='Where to locate variable operations') + parser.add_argument( + '--num-gpus', + type=int, + default=1, + help='The number of gpus used. Uses only CPU if set to 0.') + parser.add_argument( + '--num-layers', + type=int, + default=44, + help='The number of layers of the model.') + parser.add_argument( + '--train-steps', + type=int, + default=80000, + help='The number of steps to use for training.') + parser.add_argument( + '--train-batch-size', + type=int, + default=128, + help='Batch size for training.') + parser.add_argument( + '--eval-batch-size', + type=int, + default=100, + help='Batch size for validation.') + parser.add_argument( + '--momentum', + type=float, + default=0.9, + help='Momentum for MomentumOptimizer.') + parser.add_argument( + '--weight-decay', + type=float, + default=2e-4, + help='Weight decay for convolutions.') + parser.add_argument( + '--learning-rate', + type=float, + default=0.1, + help="""\ + This is the inital learning rate value. The learning rate will decrease + during training. 
For more details check the model_fn implementation in + this file.\ + """) + parser.add_argument( + '--use-distortion-for-training', + type=bool, + default=True, + help='If doing image distortion for training.') + parser.add_argument( + '--sync', + action='store_true', + default=False, + help="""\ + If present when running in a distributed environment will run on sync mode.\ + """) + parser.add_argument( + '--num-intra-threads', + type=int, + default=0, + help="""\ + Number of threads to use for intra-op parallelism. When training on CPU + set to 0 to have the system pick the appropriate number or alternatively + set it to the number of physical CPU cores.\ + """) + parser.add_argument( + '--num-inter-threads', + type=int, + default=0, + help="""\ + Number of threads to use for inter-op parallelism. If set to 0, the + system will pick an appropriate number.\ + """) + parser.add_argument( + '--data-format', + type=str, + default=None, + help="""\ + If not set, the data format best for the training device is used. + Allowed values: channels_first (NCHW) channels_last (NHWC).\ + """) + parser.add_argument( + '--log-device-placement', + action='store_true', + default=False, + help='Whether to log device placement.') + parser.add_argument( + '--batch-norm-decay', + type=float, + default=0.997, + help='Decay for batch norm.') + parser.add_argument( + '--batch-norm-epsilon', + type=float, + default=1e-5, + help='Epsilon for batch norm.') + args = parser.parse_args() + + if args.num_gpus > 0: + assert tf.test.is_gpu_available(), "Requested GPUs but none found." + if args.num_gpus < 0: + raise ValueError( + 'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.') + if args.num_gpus == 0 and args.variable_strategy == 'GPU': + raise ValueError('num-gpus=0, CPU must be used as parameter server. Set' + '--variable-strategy=CPU.') + if (args.num_layers - 2) % 6 != 0: + raise ValueError('Invalid --num-layers parameter.') + if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0: + raise ValueError('--train-batch-size must be multiple of --num-gpus.') + if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0: + raise ValueError('--eval-batch-size must be multiple of --num-gpus.') + + main(**vars(args)) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_model.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_model.py new file mode 100644 index 00000000000..d67c233dbba --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_model.py @@ -0,0 +1,80 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Model class for Cifar10 Dataset.""" +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +import model_base + + +class ResNetCifar10(model_base.ResNet): + """Cifar10 model with ResNetV1 and basic residual block.""" + + def __init__(self, + num_layers, + is_training, + batch_norm_decay, + batch_norm_epsilon, + data_format='channels_first'): + super(ResNetCifar10, self).__init__( + is_training, + data_format, + batch_norm_decay, + batch_norm_epsilon + ) + self.n = (num_layers - 2) // 6 + # Add one in case label starts with 1. No impact if label starts with 0. + self.num_classes = 10 + 1 + self.filters = [16, 16, 32, 64] + self.strides = [1, 2, 2] + + def forward_pass(self, x, input_data_format='channels_last'): + """Build the core model within the graph.""" + if self._data_format != input_data_format: + if input_data_format == 'channels_last': + # Computation requires channels_first. + x = tf.transpose(x, [0, 3, 1, 2]) + else: + # Computation requires channels_last. + x = tf.transpose(x, [0, 2, 3, 1]) + + # Image standardization. + x = x / 128 - 1 + + x = self._conv(x, 3, 16, 1) + x = self._batch_norm(x) + x = self._relu(x) + + # Use basic (non-bottleneck) block and ResNet V1 (post-activation). + res_func = self._residual_v1 + + # 3 stages of block stacking. + for i in range(3): + with tf.name_scope('stage'): + for j in range(self.n): + if j == 0: + # First block in a stage, filters and strides may change. + x = res_func(x, 3, self.filters[i], self.filters[i + 1], + self.strides[i]) + else: + # Following blocks in a stage, constant filters and unit stride. + x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1) + + x = self._global_avg_pool(x) + x = self._fully_connected(x, self.num_classes) + + return x diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_utils.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_utils.py new file mode 100644 index 00000000000..fca661e9a25 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cifar10_utils.py @@ -0,0 +1,154 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import collections +import six + +import tensorflow as tf + +from tensorflow.python.platform import tf_logging as logging +from tensorflow.core.framework import node_def_pb2 +from tensorflow.python.framework import device as pydev +from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import session_run_hook +from tensorflow.python.training import training_util +from tensorflow.python.training import device_setter +from tensorflow.contrib.learn.python.learn import run_config + + +# TODO(b/64848083) Remove once uid bug is fixed +class RunConfig(tf.contrib.learn.RunConfig): + def uid(self, whitelist=None): + """Generates a 'Unique Identifier' based on all internal fields. + Caller should use the uid string to check `RunConfig` instance integrity + in one session use, but should not rely on the implementation details, which + is subject to change. + Args: + whitelist: A list of the string names of the properties uid should not + include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which + includes most properties user allowes to change. + Returns: + A uid string. + """ + if whitelist is None: + whitelist = run_config._DEFAULT_UID_WHITE_LIST + + state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')} + # Pop out the keys in whitelist. + for k in whitelist: + state.pop('_' + k, None) + + ordered_state = collections.OrderedDict( + sorted(state.items(), key=lambda t: t[0])) + # For class instance without __repr__, some special cares are required. + # Otherwise, the object address will be used. + if '_cluster_spec' in ordered_state: + ordered_state['_cluster_spec'] = collections.OrderedDict( + sorted(ordered_state['_cluster_spec'].as_dict().items(), + key=lambda t: t[0]) + ) + return ', '.join( + '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state)) + + +class ExamplesPerSecondHook(session_run_hook.SessionRunHook): + """Hook to print out examples per second. + + Total time is tracked and then divided by the total number of steps + to get the average step time and then batch_size is used to determine + the running average of examples per second. The examples per second for the + most recent interval is also logged. + """ + + def __init__( + self, + batch_size, + every_n_steps=100, + every_n_secs=None,): + """Initializer for ExamplesPerSecondHook. + + Args: + batch_size: Total batch size used to calculate examples/second from + global time. + every_n_steps: Log stats every n steps. + every_n_secs: Log stats every n seconds. 
+ """ + if (every_n_steps is None) == (every_n_secs is None): + raise ValueError('exactly one of every_n_steps' + ' and every_n_secs should be provided.') + self._timer = basic_session_run_hooks.SecondOrStepTimer( + every_steps=every_n_steps, every_secs=every_n_secs) + + self._step_train_time = 0 + self._total_steps = 0 + self._batch_size = batch_size + + def begin(self): + self._global_step_tensor = training_util.get_global_step() + if self._global_step_tensor is None: + raise RuntimeError( + 'Global step should be created to use StepCounterHook.') + + def before_run(self, run_context): # pylint: disable=unused-argument + return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor) + + def after_run(self, run_context, run_values): + _ = run_context + + global_step = run_values.results + if self._timer.should_trigger_for_step(global_step): + elapsed_time, elapsed_steps = self._timer.update_last_triggered_step( + global_step) + if elapsed_time is not None: + steps_per_sec = elapsed_steps / elapsed_time + self._step_train_time += elapsed_time + self._total_steps += elapsed_steps + + average_examples_per_sec = self._batch_size * ( + self._total_steps / self._step_train_time) + current_examples_per_sec = steps_per_sec * self._batch_size + # Average examples/sec followed by current examples/sec + logging.info('%s: %g (%g), step = %g', 'Average examples/sec', + average_examples_per_sec, current_examples_per_sec, + self._total_steps) + +def local_device_setter(num_devices=1, + ps_device_type='cpu', + worker_device='/cpu:0', + ps_ops=None, + ps_strategy=None): + if ps_ops == None: + ps_ops = ['Variable', 'VariableV2', 'VarHandleOp'] + + if ps_strategy is None: + ps_strategy = device_setter._RoundRobinStrategy(num_devices) + if not six.callable(ps_strategy): + raise TypeError("ps_strategy must be callable") + + def _local_device_chooser(op): + current_device = pydev.DeviceSpec.from_string(op.device or "") + + node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def + if node_def.op in ps_ops: + ps_device_spec = pydev.DeviceSpec.from_string( + '/{}:{}'.format(ps_device_type, ps_strategy(op))) + + ps_device_spec.merge_from(current_device) + return ps_device_spec.to_string() + else: + worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "") + worker_device_spec.merge_from(current_device) + return worker_device_spec.to_string() + return _local_device_chooser diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cmle_config.yaml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cmle_config.yaml new file mode 100644 index 00000000000..76f920534ef --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/cmle_config.yaml @@ -0,0 +1,6 @@ +trainingInput: + scaleTier: CUSTOM + masterType: complex_model_m_gpu + workerType: complex_model_m_gpu + parameterServerType: complex_model_m + workerCount: 1 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/generate_cifar10_tfrecords.py 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/generate_cifar10_tfrecords.py new file mode 100644 index 00000000000..409cee4eaec --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/generate_cifar10_tfrecords.py @@ -0,0 +1,114 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Read CIFAR-10 data from pickled numpy arrays and writes TFRecords. + +Generates tf.train.Example protos and writes them to TFRecord files from the +python version of the CIFAR-10 dataset downloaded from +https://www.cs.toronto.edu/~kriz/cifar.html. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + +import tarfile +from six.moves import cPickle as pickle +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +CIFAR_FILENAME = 'cifar-10-python.tar.gz' +CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME +CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py' + + +def download_and_extract(data_dir): + # download CIFAR-10 if not already downloaded. 
+ tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, + CIFAR_DOWNLOAD_URL) + tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), + 'r:gz').extractall(data_dir) + + +def _int64_feature(value): + return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) + + +def _bytes_feature(value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + +def _get_file_names(): + """Returns the file names expected to exist in the input_dir.""" + file_names = {} + file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)] + file_names['validation'] = ['data_batch_5'] + file_names['eval'] = ['test_batch'] + return file_names + + +def read_pickle_from_file(filename): + with tf.gfile.Open(filename, 'rb') as f: + data_dict = pickle.load(f) + return data_dict + + +def convert_to_tfrecord(input_files, output_file): + """Converts a file to TFRecords.""" + print('Generating %s' % output_file) + with tf.python_io.TFRecordWriter(output_file) as record_writer: + for input_file in input_files: + data_dict = read_pickle_from_file(input_file) + data = data_dict['data'] + labels = data_dict['labels'] + num_entries_in_batch = len(labels) + for i in range(num_entries_in_batch): + example = tf.train.Example(features=tf.train.Features( + feature={ + 'image': _bytes_feature(data[i].tobytes()), + 'label': _int64_feature(labels[i]) + })) + record_writer.write(example.SerializeToString()) + + +def main(data_dir): + print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL)) + download_and_extract(data_dir) + file_names = _get_file_names() + input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) + for mode, files in file_names.items(): + input_files = [os.path.join(input_dir, f) for f in files] + output_file = os.path.join(data_dir, mode + '.tfrecords') + try: + os.remove(output_file) + except OSError: + pass + # Convert to tf.train.Example and write the to TFRecords. + convert_to_tfrecord(input_files, output_file) + print('Done!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data-dir', + type=str, + default='', + help='Directory to download and extract CIFAR-10 to.') + + args = parser.parse_args() + main(args.data_dir) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/model_base.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/model_base.py new file mode 100644 index 00000000000..35e52b8355d --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.6.0/model_base.py @@ -0,0 +1,219 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""ResNet model. + +Related papers: +https://arxiv.org/pdf/1603.05027v2.pdf +https://arxiv.org/pdf/1512.03385v1.pdf +https://arxiv.org/pdf/1605.07146v1.pdf +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +class ResNet(object): + """ResNet model.""" + + def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon): + """ResNet constructor. + + Args: + is_training: if build training or inference model. + data_format: the data_format used during computation. + one of 'channels_first' or 'channels_last'. + """ + self._batch_norm_decay = batch_norm_decay + self._batch_norm_epsilon = batch_norm_epsilon + self._is_training = is_training + assert data_format in ('channels_first', 'channels_last') + self._data_format = data_format + + def forward_pass(self, x): + raise NotImplementedError( + 'forward_pass() is implemented in ResNet sub classes') + + def _residual_v1(self, + x, + kernel_size, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Residual unit with 2 sub layers, using Plan A for shortcut connection.""" + + del activate_before_residual + with tf.name_scope('residual_v1') as name_scope: + orig_x = x + + x = self._conv(x, kernel_size, out_filter, stride) + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, kernel_size, out_filter, 1) + x = self._batch_norm(x) + + if in_filter != out_filter: + orig_x = self._avg_pool(orig_x, stride, stride) + pad = (out_filter - in_filter) // 2 + if self._data_format == 'channels_first': + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = self._relu(tf.add(x, orig_x)) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _residual_v2(self, + x, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Residual unit with 2 sub layers with preactivation, plan A shortcut.""" + + with tf.name_scope('residual_v2') as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 3, out_filter, stride) + + x = self._batch_norm(x) + x = self._relu(x) + x = self._conv(x, 3, out_filter, [1, 1, 1, 1]) + + if in_filter != out_filter: + pad = (out_filter - in_filter) // 2 + orig_x = self._avg_pool(orig_x, stride, stride) + if self._data_format == 'channels_first': + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = tf.add(x, orig_x) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _bottleneck_residual_v2(self, + x, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Bottleneck residual unit with 3 sub layers, plan B shortcut.""" + + with tf.name_scope('bottle_residual_v2') as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + # pad when stride isn't unit + x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + x = 
self._conv(x, 1, out_filter, 1, is_atrous=True) + + if in_filter != out_filter: + orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True) + x = tf.add(x, orig_x) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _conv(self, x, kernel_size, filters, strides, is_atrous=False): + """Convolution.""" + + padding = 'SAME' + if not is_atrous and strides > 1: + pad = kernel_size - 1 + pad_beg = pad // 2 + pad_end = pad - pad_beg + if self._data_format == 'channels_first': + x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]]) + else: + x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + padding = 'VALID' + return tf.layers.conv2d( + inputs=x, + kernel_size=kernel_size, + filters=filters, + strides=strides, + padding=padding, + use_bias=False, + data_format=self._data_format) + + def _batch_norm(self, x): + if self._data_format == 'channels_first': + data_format = 'NCHW' + else: + data_format = 'NHWC' + return tf.contrib.layers.batch_norm( + x, + decay=self._batch_norm_decay, + center=True, + scale=True, + epsilon=self._batch_norm_epsilon, + is_training=self._is_training, + fused=True, + data_format=data_format) + + def _relu(self, x): + return tf.nn.relu(x) + + def _fully_connected(self, x, out_dim): + with tf.name_scope('fully_connected') as name_scope: + x = tf.layers.dense(x, out_dim) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _avg_pool(self, x, pool_size, stride): + with tf.name_scope('avg_pool') as name_scope: + x = tf.layers.average_pooling2d( + x, pool_size, stride, 'SAME', data_format=self._data_format) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _global_avg_pool(self, x): + with tf.name_scope('global_avg_pool') as name_scope: + assert x.get_shape().ndims == 4 + if self._data_format == 'channels_first': + x = tf.reduce_mean(x, [2, 3]) + else: + x = tf.reduce_mean(x, [1, 2]) + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md new file mode 100644 index 00000000000..66b4bc2d39e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md @@ -0,0 +1,542 @@ + + +(Copied from https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator) + +CIFAR-10 is a common benchmark in machine learning for image recognition. + +http://www.cs.toronto.edu/~kriz/cifar.html + +Code in this directory focuses on how to use TensorFlow Estimators to train and +evaluate a CIFAR-10 ResNet model on: + +* A single host with one CPU; +* A single host with multiple GPUs; +* Multiple hosts with CPU or multiple GPUs; + +Before trying to run the model we highly encourage you to read all the README. + +## Prerequisite + +1. [Install](https://www.tensorflow.org/install/) TensorFlow version 1.2.1 or +later. + +2. Download the CIFAR-10 dataset and generate TFRecord files using the provided +script. 
The script and associated command below will download the CIFAR-10 +dataset and then generate a TFRecord for the training, validation, and +evaluation datasets. + +```shell +python generate_cifar10_tfrecords.py --data-dir=${PWD}/cifar-10-data +``` + +After running the command above, you should see the following files in the +--data-dir (```ls -R cifar-10-data```): + +* train.tfrecords +* validation.tfrecords +* eval.tfrecords + + +## Training on a single machine with GPUs or CPU + +Run the training on CPU only. After training, it runs the evaluation. + +``` +python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ + --job-dir=/tmp/cifar10 \ + --num-gpus=0 \ + --train-steps=1000 +``` + +Run the model on 2 GPUs using CPU as parameter server. After training, it runs +the evaluation. +``` +python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ + --job-dir=/tmp/cifar10 \ + --num-gpus=2 \ + --train-steps=1000 +``` + +Run the model on 2 GPUs using GPU as parameter server. +It will run an experiment, which for local setting basically means it will run +stop training +a couple of times to perform evaluation. + +``` +python cifar10_main.py --data-dir=${PWD}/cifar-10-data \ + --job-dir=/tmp/cifar10 \ + --variable-strategy GPU \ + --num-gpus=2 \ +``` + +There are more command line flags to play with; run +`python cifar10_main.py --help` for details. + +## Run distributed training + +### (Optional) Running on Google Cloud Machine Learning Engine + +This example can be run on Google Cloud Machine Learning Engine (ML Engine), +which will configure the environment and take care of running workers, +parameters servers, and masters in a fault tolerant way. + +To install the command line tool, and set up a project and billing, see the +quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line). + +You'll also need a Google Cloud Storage bucket for the data. If you followed the +instructions above, you can just run: + +``` +MY_BUCKET=gs:// +gsutil cp -r ${PWD}/cifar-10-data $MY_BUCKET/ +``` + +Then run the following command from the `tutorials/image` directory of this +repository (the parent directory of this README): + +``` +gcloud ml-engine jobs submit training cifarmultigpu \ + --runtime-version 1.2 \ + --job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \ + --config cifar10_estimator/cmle_config.yaml \ + --package-path cifar10_estimator/ \ + --module-name cifar10_estimator.cifar10_main \ + -- \ + --data-dir=$MY_BUCKET/cifar-10-data \ + --num-gpus=4 \ + --train-steps=1000 +``` + + +### Set TF_CONFIG + +Considering that you already have multiple hosts configured, all you need is a +`TF_CONFIG` environment variable on each host. You can set up the hosts manually +or check [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) for +instructions about how to set up a Cluster. + +The `TF_CONFIG` will be used by the `RunConfig` to know the existing hosts and +their task: `master`, `ps` or `worker`. + +Here's an example of `TF_CONFIG`. + +```python +cluster = {'master': ['master-ip:8000'], + 'ps': ['ps-ip:8000'], + 'worker': ['worker-ip:8000']} + +TF_CONFIG = json.dumps( + {'cluster': cluster, + 'task': {'type': master, 'index': 0}, + 'model_dir': 'gs:///', + 'environment': 'cloud' + }) +``` + +*Cluster* + +A cluster spec, which is basically a dictionary that describes all of the tasks +in the cluster. More about it [here](https://www.tensorflow.org/deploy/distributed). + +In this cluster spec we are defining a cluster with 1 master, 1 ps and 1 worker. 
+ +* `ps`: saves the parameters among all workers. All workers can + read/write/update the parameters for model via ps. As some models are + extremely large the parameters are shared among the ps (each ps stores a + subset). + +* `worker`: does the training. + +* `master`: basically a special worker, it does training, but also restores and + saves checkpoints and do evaluation. + +*Task* + +The Task defines what is the role of the current node, for this example the node +is the master on index 0 on the cluster spec, the task will be different for +each node. An example of the `TF_CONFIG` for a worker would be: + +```python +cluster = {'master': ['master-ip:8000'], + 'ps': ['ps-ip:8000'], + 'worker': ['worker-ip:8000']} + +TF_CONFIG = json.dumps( + {'cluster': cluster, + 'task': {'type': worker, 'index': 0}, + 'model_dir': 'gs:///', + 'environment': 'cloud' + }) +``` + +*Model_dir* + +This is the path where the master will save the checkpoints, graph and +TensorBoard files. For a multi host environment you may want to use a +Distributed File System, Google Storage and DFS are supported. + +*Environment* + +By the default environment is *local*, for a distributed setting we need to +change it to *cloud*. + +### Running script + +Once you have a `TF_CONFIG` configured properly on each host you're ready to run +on distributed settings. + +#### Master +Run this on master: +Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for +40000 steps. It will run evaluation a couple of times during training. The +num_workers arugument is used only to update the learning rate correctly. Make +sure the model_dir is the same as defined on the TF_CONFIG. + +```shell +python cifar10_main.py --data-dir=gs://path/cifar-10-data \ + --job-dir=gs://path/model_dir/ \ + --num-gpus=4 \ + --train-steps=40000 \ + --sync \ + --num-workers=2 +``` + +*Output:* + +```shell +INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ +INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'master', '_is_chief': True, '_cluster_spec': , '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1 +gpu_options { +} +allow_soft_placement: true +, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'} +... +2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: +name: Tesla K80 +major: 3 minor: 7 memoryClockRate (GHz) 0.8235 +pciBusID 0000:00:04.0 +Total memory: 11.17GiB +Free memory: 11.09GiB +2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: +name: Tesla K80 +major: 3 minor: 7 memoryClockRate (GHz) 0.8235 +pciBusID 0000:00:05.0 +Total memory: 11.17GiB +Free memory: 11.10GiB +... 
+2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) +INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) +INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=1; total_num_replicas=1 +INFO:tensorflow:Create CheckpointSaverHook. +INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-0 +2017-08-01 19:59:37.560775: I tensorflow/core/distributed_runtime/master_session.cc:999] Start master session 156fcb55fe6648d6 with config: +intra_op_parallelism_threads: 1 +gpu_options { + per_process_gpu_memory_fraction: 1 +} +allow_soft_placement: true + +INFO:tensorflow:Saving checkpoints for 1 into gs://path/model_dir/model.ckpt. 
+INFO:tensorflow:loss = 1.20682, step = 1 +INFO:tensorflow:loss = 1.20682, learning_rate = 0.1 +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) +INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) +INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2 +INFO:tensorflow:Starting evaluation at 2017-08-01-20:00:14 +2017-08-01 20:00:15.745881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0) +2017-08-01 20:00:15.745949: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:00:05.0) +2017-08-01 20:00:15.745958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K80, pci bus id: 0000:00:06.0) +2017-08-01 20:00:15.745964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K80, pci bus id: 0000:00:07.0) +2017-08-01 20:00:15.745969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:4) -> (device: 4, name: Tesla K80, pci bus id: 0000:00:08.0) +2017-08-01 20:00:15.745975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:5) -> (device: 5, name: Tesla K80, pci bus id: 0000:00:09.0) +2017-08-01 20:00:15.745987: I 
tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:6) -> (device: 6, name: Tesla K80, pci bus id: 0000:00:0a.0) +2017-08-01 20:00:15.745997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:7) -> (device: 7, name: Tesla K80, pci bus id: 0000:00:0b.0) +INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-10023 +INFO:tensorflow:Evaluation [1/100] +INFO:tensorflow:Evaluation [2/100] +INFO:tensorflow:Evaluation [3/100] +INFO:tensorflow:Evaluation [4/100] +INFO:tensorflow:Evaluation [5/100] +INFO:tensorflow:Evaluation [6/100] +INFO:tensorflow:Evaluation [7/100] +INFO:tensorflow:Evaluation [8/100] +INFO:tensorflow:Evaluation [9/100] +INFO:tensorflow:Evaluation [10/100] +INFO:tensorflow:Evaluation [11/100] +INFO:tensorflow:Evaluation [12/100] +INFO:tensorflow:Evaluation [13/100] +... +INFO:tensorflow:Evaluation [100/100] +INFO:tensorflow:Finished evaluation at 2017-08-01-20:00:31 +INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step = 1, loss = 630.425 +``` + +#### Worker + +Run this on worker: +Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for +40000 steps. It will run evaluation a couple of times during training. Make sure +the model_dir is the same as defined on the TF_CONFIG. + +```shell +python cifar10_main.py --data-dir=gs://path/cifar-10-data \ + --job-dir=gs://path/model_dir/ \ + --num-gpus=4 \ + --train-steps=40000 \ + --sync +``` + +*Output:* + +```shell +INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ +INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, +'_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'worker', +'_is_chief': False, '_cluster_spec': +, +'_model_dir': 'gs:///model_dir/', +'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, +'_session_config': intra_op_parallelism_threads: 1 +gpu_options { +} +allow_soft_placement: true +, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, +'_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 + } +... +2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: +name: Tesla K80 +major: 3 minor: 7 memoryClockRate (GHz) 0.8235 +pciBusID 0000:00:04.0 +Total memory: 11.17GiB +Free memory: 11.09GiB +2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: +name: Tesla K80 +major: 3 minor: 7 memoryClockRate (GHz) 0.8235 +pciBusID 0000:00:05.0 +Total memory: 11.17GiB +Free memory: 11.10GiB +... 
+2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8) +INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64) +INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11) +INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2 +INFO:tensorflow:Create CheckpointSaverHook. 
+2017-07-31 22:38:04.629150: I +tensorflow/core/distributed_runtime/master.cc:209] CreateSession still waiting +for response from worker: /job:master/replica:0/task:0 +2017-07-31 22:38:09.263492: I +tensorflow/core/distributed_runtime/master_session.cc:999] Start master +session cc58f93b1e259b0c with config: +intra_op_parallelism_threads: 1 +gpu_options { +per_process_gpu_memory_fraction: 1 +} +allow_soft_placement: true +INFO:tensorflow:loss = 5.82382, step = 0 +INFO:tensorflow:loss = 5.82382, learning_rate = 0.8 +INFO:tensorflow:Average examples/sec: 1116.92 (1116.92), step = 10 +INFO:tensorflow:Average examples/sec: 1233.73 (1377.83), step = 20 +INFO:tensorflow:Average examples/sec: 1485.43 (2509.3), step = 30 +INFO:tensorflow:Average examples/sec: 1680.27 (2770.39), step = 40 +INFO:tensorflow:Average examples/sec: 1825.38 (2788.78), step = 50 +INFO:tensorflow:Average examples/sec: 1929.32 (2697.27), step = 60 +INFO:tensorflow:Average examples/sec: 2015.17 (2749.05), step = 70 +INFO:tensorflow:loss = 37.6272, step = 79 (19.554 sec) +INFO:tensorflow:loss = 37.6272, learning_rate = 0.8 (19.554 sec) +INFO:tensorflow:Average examples/sec: 2074.92 (2618.36), step = 80 +INFO:tensorflow:Average examples/sec: 2132.71 (2744.13), step = 90 +INFO:tensorflow:Average examples/sec: 2183.38 (2777.21), step = 100 +INFO:tensorflow:Average examples/sec: 2224.4 (2739.03), step = 110 +INFO:tensorflow:Average examples/sec: 2240.28 (2431.26), step = 120 +INFO:tensorflow:Average examples/sec: 2272.12 (2739.32), step = 130 +INFO:tensorflow:Average examples/sec: 2300.68 (2750.03), step = 140 +INFO:tensorflow:Average examples/sec: 2325.81 (2745.63), step = 150 +INFO:tensorflow:Average examples/sec: 2347.14 (2721.53), step = 160 +INFO:tensorflow:Average examples/sec: 2367.74 (2754.54), step = 170 +INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec) +... 
+``` + +#### PS + +Run this on ps: +The ps will not do training so most of the arguments won't affect the execution + +```shell +python cifar10_main.py --job-dir=gs://path/model_dir/ +``` + +*Output:* + +```shell +INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/ +INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'ps', '_is_chief': False, '_cluster_spec': , '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1 +gpu_options { +} +allow_soft_placement: true +, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'} +2017-07-31 22:54:58.928088: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-ip:8000} +2017-07-31 22:54:58.928153: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000} +2017-07-31 22:54:58.928160: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-ip:8000} +2017-07-31 22:54:58.929873: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +``` + +## Visualizing results with TensorBoard + +When using Estimators you can also visualize your data in TensorBoard, with no +changes in your code. You can use TensorBoard to visualize your TensorFlow +graph, plot quantitative metrics about the execution of your graph, and show +additional data like images that pass through it. + +You'll see something similar to this if you "point" TensorBoard to the +`job dir` parameter you used to train or evaluate your model. + +Check TensorBoard during training or after it. Just point TensorBoard to the +model_dir you chose on the previous step. 
+ +```shell +tensorboard --log-dir="" +``` + +## Warnings + +When runninng `cifar10_main.py` with `--sync` argument you may see an error +similar to: + +```python +File "cifar10_main.py", line 538, in + tf.app.run() +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run + _sys.exit(main(_sys.argv[:1] + flags_passthrough)) +File "cifar10_main.py", line 518, in main + hooks), run_config=config) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 210, in run + return _execute_schedule(experiment, schedule) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 47, in _execute_schedule + return task() +File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 501, in train_and_evaluate + hooks=self._eval_hooks) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 681, in _call_evaluate + hooks=hooks) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 292, in evaluate + name=name) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 638, in _evaluate_model + features, labels, model_fn_lib.ModeKeys.EVAL) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn + features=features, labels=labels, **kwargs) +File "cifar10_main.py", line 331, in _resnet_model_fn + gradvars, global_step=tf.train.get_global_step()) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 252, in apply_gradients + variables.global_variables()) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 170, in wrapped + return _add_should_use_warning(fn(*args, **kwargs)) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 139, in _add_should_use_warning + wrapped = TFShouldUseWarningWrapper(x) +File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 96, in __init__ + stack = [s.strip() for s in traceback.format_stack()] +``` + +This should not affect your training, and should be fixed on the next releases. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py new file mode 100644 index 00000000000..6903e8d93de --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py @@ -0,0 +1,113 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""CIFAR-10 data set. + +See http://www.cs.toronto.edu/~kriz/cifar.html. +""" +import os + +import tensorflow as tf + +HEIGHT = 32 +WIDTH = 32 +DEPTH = 3 + + +class Cifar10DataSet(object): + """Cifar10 data set. + + Described by http://www.cs.toronto.edu/~kriz/cifar.html. + """ + + def __init__(self, data_dir, subset='train', use_distortion=True): + self.data_dir = data_dir + self.subset = subset + self.use_distortion = use_distortion + + def get_filenames(self): + if self.subset in ['train', 'validation', 'eval']: + return [os.path.join(self.data_dir, self.subset + '.tfrecords')] + else: + raise ValueError('Invalid data subset "%s"' % self.subset) + + def parser(self, serialized_example): + """Parses a single tf.Example into image and label tensors.""" + # Dimensions of the images in the CIFAR-10 dataset. + # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the + # input format. + features = tf.parse_single_example( + serialized_example, + features={ + 'image': tf.FixedLenFeature([], tf.string), + 'label': tf.FixedLenFeature([], tf.int64), + }) + image = tf.decode_raw(features['image'], tf.uint8) + image.set_shape([DEPTH * HEIGHT * WIDTH]) + + # Reshape from [depth * height * width] to [depth, height, width]. + image = tf.cast( + tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), + tf.float32) + label = tf.cast(features['label'], tf.int32) + + # Custom preprocessing. + image = self.preprocess(image) + + return image, label + + def make_batch(self, batch_size): + """Read the images and labels from 'filenames'.""" + filenames = self.get_filenames() + # Repeat infinitely. + dataset = tf.data.TFRecordDataset(filenames).repeat() + + # Parse records. + dataset = dataset.map( + self.parser) + + # Potentially shuffle records. + if self.subset == 'train': + min_queue_examples = int( + Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4) + # Ensure that the capacity is sufficiently large to provide good random + # shuffling. + dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size) + + # Batch it up. 
+ dataset = dataset.batch(batch_size) + iterator = dataset.make_one_shot_iterator() + image_batch, label_batch = iterator.get_next() + + return image_batch, label_batch + + def preprocess(self, image): + """Preprocess a single image in [height, width, depth] layout.""" + if self.subset == 'train' and self.use_distortion: + # Pad 4 pixels on each dimension of feature map, done in mini-batch + image = tf.image.resize_image_with_crop_or_pad(image, 40, 40) + image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) + image = tf.image.random_flip_left_right(image) + return image + + @staticmethod + def num_examples_per_epoch(subset='train'): + if subset == 'train': + return 45000 + elif subset == 'validation': + return 5000 + elif subset == 'eval': + return 10000 + else: + raise ValueError('Invalid data subset "%s"' % subset) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py new file mode 100644 index 00000000000..51da6b94fa2 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py @@ -0,0 +1,521 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""ResNet model for classifying images from CIFAR-10 dataset. + +Support single-host training with one or multiple devices. + +ResNet as proposed in: +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun +Deep Residual Learning for Image Recognition. arXiv:1512.03385 + +CIFAR-10 as in: +http://www.cs.toronto.edu/~kriz/cifar.html + + +""" +from __future__ import division +from __future__ import print_function + +import argparse +import functools +import itertools +import os + +import cifar10 +import cifar10_model +import cifar10_utils +import numpy as np +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +tf.logging.set_verbosity(tf.logging.INFO) + + +def get_model_fn(num_gpus, variable_strategy, num_workers): + """Returns a function that will build the resnet model.""" + + def _resnet_model_fn(features, labels, mode, params): + """Resnet model body. + + Support single host, one or more GPU training. Parameter distribution can + be either one of the following scheme. + 1. CPU is the parameter server and manages gradient updates. + 2. Parameters are distributed evenly across all GPUs, and the first GPU + manages gradient updates. 
+ + Args: + features: a list of tensors, one for each tower + labels: a list of tensors, one for each tower + mode: ModeKeys.TRAIN or EVAL + params: Hyperparameters suitable for tuning + Returns: + A EstimatorSpec object. + """ + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + weight_decay = params.weight_decay + momentum = params.momentum + + tower_features = features + tower_labels = labels + tower_losses = [] + tower_gradvars = [] + tower_preds = [] + + # channels first (NCHW) is normally optimal on GPU and channels last (NHWC) + # on CPU. The exception is Intel MKL on CPU which is optimal with + # channels_last. + data_format = params.data_format + if not data_format: + if num_gpus == 0: + data_format = 'channels_last' + else: + data_format = 'channels_first' + + if num_gpus == 0: + num_devices = 1 + device_type = 'cpu' + else: + num_devices = num_gpus + device_type = 'gpu' + + for i in range(num_devices): + worker_device = '/{}:{}'.format(device_type, i) + if variable_strategy == 'CPU': + device_setter = cifar10_utils.local_device_setter( + worker_device=worker_device) + elif variable_strategy == 'GPU': + device_setter = cifar10_utils.local_device_setter( + ps_device_type='gpu', + worker_device=worker_device, + ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy( + num_gpus, tf.contrib.training.byte_size_load_fn)) + with tf.variable_scope('resnet', reuse=bool(i != 0)): + with tf.name_scope('tower_%d' % i) as name_scope: + with tf.device(device_setter): + loss, gradvars, preds = _tower_fn( + is_training, weight_decay, tower_features[i], tower_labels[i], + data_format, params.num_layers, params.batch_norm_decay, + params.batch_norm_epsilon) + tower_losses.append(loss) + tower_gradvars.append(gradvars) + tower_preds.append(preds) + if i == 0: + # Only trigger batch_norm moving mean and variance update from + # the 1st tower. Ideally, we should grab the updates from all + # towers but these stats accumulate extremely fast so we can + # ignore the other stats from the other towers without + # significant detriment. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, + name_scope) + + # Now compute global loss and gradients. + gradvars = [] + with tf.name_scope('gradient_averaging'): + all_grads = {} + for grad, var in itertools.chain(*tower_gradvars): + if grad is not None: + all_grads.setdefault(var, []).append(grad) + for var, grads in six.iteritems(all_grads): + # Average gradients on the same device as the variables + # to which they apply. + with tf.device(var.device): + if len(grads) == 1: + avg_grad = grads[0] + else: + avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads)) + gradvars.append((avg_grad, var)) + + # Device that runs the ops to apply global gradient updates. 
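+    # With --variable-strategy=CPU the variables live on the CPU, which also
+    # applies the averaged gradients; with GPU, variables are spread across
+    # the GPUs and the update ops run on GPU 0.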
+ consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0' + with tf.device(consolidation_device): + # Suggested learning rate scheduling from + # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155 + num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch( + 'train') // (params.train_batch_size * num_workers) + boundaries = [ + num_batches_per_epoch * x + for x in np.array([82, 123, 300], dtype=np.int64) + ] + staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]] + + learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(), + boundaries, staged_lr) + + loss = tf.reduce_mean(tower_losses, name='loss') + + examples_sec_hook = cifar10_utils.ExamplesPerSecondHook( + params.train_batch_size, every_n_steps=10) + + tensors_to_log = {'learning_rate': learning_rate, 'loss': loss} + + logging_hook = tf.train.LoggingTensorHook( + tensors=tensors_to_log, every_n_iter=100) + + train_hooks = [logging_hook, examples_sec_hook] + + optimizer = tf.train.MomentumOptimizer( + learning_rate=learning_rate, momentum=momentum) + + if params.sync: + optimizer = tf.train.SyncReplicasOptimizer( + optimizer, replicas_to_aggregate=num_workers) + sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief) + train_hooks.append(sync_replicas_hook) + + # Create single grouped train op + train_op = [ + optimizer.apply_gradients( + gradvars, global_step=tf.train.get_global_step()) + ] + train_op.extend(update_ops) + train_op = tf.group(*train_op) + + predictions = { + 'classes': + tf.concat([p['classes'] for p in tower_preds], axis=0), + 'probabilities': + tf.concat([p['probabilities'] for p in tower_preds], axis=0) + } + stacked_labels = tf.concat(labels, axis=0) + metrics = { + 'accuracy': + tf.metrics.accuracy(stacked_labels, predictions['classes']) + } + + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + loss=loss, + train_op=train_op, + training_hooks=train_hooks, + eval_metric_ops=metrics) + + return _resnet_model_fn + + +def _tower_fn(is_training, weight_decay, feature, label, data_format, + num_layers, batch_norm_decay, batch_norm_epsilon): + """Build computation tower (Resnet). + + Args: + is_training: true if is training graph. + weight_decay: weight regularization strength, a float. + feature: a Tensor. + label: a Tensor. + data_format: channels_last (NHWC) or channels_first (NCHW). + num_layers: number of layers, an int. + batch_norm_decay: decay for batch normalization, a float. + batch_norm_epsilon: epsilon for batch normalization, a float. + + Returns: + A tuple with the loss for the tower, the gradients and parameters, and + predictions. 
+ + """ + model = cifar10_model.ResNetCifar10( + num_layers, + batch_norm_decay=batch_norm_decay, + batch_norm_epsilon=batch_norm_epsilon, + is_training=is_training, + data_format=data_format) + logits = model.forward_pass(feature, input_data_format='channels_last') + tower_pred = { + 'classes': tf.argmax(input=logits, axis=1), + 'probabilities': tf.nn.softmax(logits) + } + + tower_loss = tf.losses.sparse_softmax_cross_entropy( + logits=logits, labels=label) + tower_loss = tf.reduce_mean(tower_loss) + + model_params = tf.trainable_variables() + tower_loss += weight_decay * tf.add_n( + [tf.nn.l2_loss(v) for v in model_params]) + + tower_grad = tf.gradients(tower_loss, model_params) + + return tower_loss, zip(tower_grad, model_params), tower_pred + + +def input_fn(data_dir, + subset, + num_shards, + batch_size, + use_distortion_for_training=True): + """Create input graph for model. + + Args: + data_dir: Directory where TFRecords representing the dataset are located. + subset: one of 'train', 'validate' and 'eval'. + num_shards: num of towers participating in data-parallel training. + batch_size: total batch size for training to be divided by the number of + shards. + use_distortion_for_training: True to use distortions. + Returns: + two lists of tensors for features and labels, each of num_shards length. + """ + with tf.device('/cpu:0'): + use_distortion = subset == 'train' and use_distortion_for_training + dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion) + image_batch, label_batch = dataset.make_batch(batch_size) + if num_shards <= 1: + # No GPU available or only 1 GPU. + return [image_batch], [label_batch] + + # Note that passing num=batch_size is safe here, even though + # dataset.batch(batch_size) can, in some cases, return fewer than batch_size + # examples. This is because it does so only when repeating for a limited + # number of epochs, but our dataset repeats forever. + image_batch = tf.unstack(image_batch, num=batch_size, axis=0) + label_batch = tf.unstack(label_batch, num=batch_size, axis=0) + feature_shards = [[] for i in range(num_shards)] + label_shards = [[] for i in range(num_shards)] + for i in xrange(batch_size): + idx = i % num_shards + feature_shards[idx].append(image_batch[i]) + label_shards[idx].append(label_batch[i]) + feature_shards = [tf.parallel_stack(x) for x in feature_shards] + label_shards = [tf.parallel_stack(x) for x in label_shards] + return feature_shards, label_shards + + +def get_experiment_fn(data_dir, + num_gpus, + variable_strategy, + use_distortion_for_training=True): + """Returns an Experiment function. + + Experiments perform training on several workers in parallel, + in other words experiments know how to invoke train and eval in a sensible + fashion for distributed training. Arguments passed directly to this + function are not tunable, all other arguments should be passed within + tf.HParams, passed to the enclosed function. + + Args: + data_dir: str. Location of the data for input_fns. + num_gpus: int. Number of GPUs on each worker. + variable_strategy: String. CPU to use CPU as the parameter server + and GPU to use the GPUs as the parameter server. + use_distortion_for_training: bool. See cifar10.Cifar10DataSet. + Returns: + A function (tf.estimator.RunConfig, tf.contrib.training.HParams) -> + tf.contrib.learn.Experiment. + + Suitable for use by tf.contrib.learn.learn_runner, which will run various + methods on Experiment (train, evaluate) based on information + about the current runner in `run_config`. 
+ """ + + def _experiment_fn(run_config, hparams): + """Returns an Experiment.""" + # Create estimator. + train_input_fn = functools.partial( + input_fn, + data_dir, + subset='train', + num_shards=num_gpus, + batch_size=hparams.train_batch_size, + use_distortion_for_training=use_distortion_for_training) + + eval_input_fn = functools.partial( + input_fn, + data_dir, + subset='eval', + batch_size=hparams.eval_batch_size, + num_shards=num_gpus) + + num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval') + if num_eval_examples % hparams.eval_batch_size != 0: + raise ValueError( + 'validation set size must be multiple of eval_batch_size') + + train_steps = hparams.train_steps + eval_steps = num_eval_examples // hparams.eval_batch_size + + classifier = tf.estimator.Estimator( + model_fn=get_model_fn(num_gpus, variable_strategy, + run_config.num_worker_replicas or 1), + config=run_config, + params=hparams) + + # Create experiment. + return tf.contrib.learn.Experiment( + classifier, + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + train_steps=train_steps, + eval_steps=eval_steps) + + return _experiment_fn + + +def main(job_dir, data_dir, num_gpus, variable_strategy, + use_distortion_for_training, log_device_placement, num_intra_threads, + **hparams): + # The env variable is on deprecation path, default is set to off. + os.environ['TF_SYNC_ON_FINISH'] = '0' + os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' + + # Session configuration. + sess_config = tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=log_device_placement, + intra_op_parallelism_threads=num_intra_threads, + gpu_options=tf.GPUOptions(force_gpu_compatible=True)) + + config = cifar10_utils.RunConfig( + session_config=sess_config, model_dir=job_dir) + tf.contrib.learn.learn_runner.run( + get_experiment_fn(data_dir, num_gpus, variable_strategy, + use_distortion_for_training), + run_config=config, + hparams=tf.contrib.training.HParams( + is_chief=config.is_chief, + **hparams)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data-dir', + type=str, + required=True, + help='The directory where the CIFAR-10 input data is stored.') + parser.add_argument( + '--job-dir', + type=str, + required=True, + help='The directory where the model will be stored.') + parser.add_argument( + '--variable-strategy', + choices=['CPU', 'GPU'], + type=str, + default='CPU', + help='Where to locate variable operations') + parser.add_argument( + '--num-gpus', + type=int, + default=1, + help='The number of gpus used. Uses only CPU if set to 0.') + parser.add_argument( + '--num-layers', + type=int, + default=44, + help='The number of layers of the model.') + parser.add_argument( + '--train-steps', + type=int, + default=80000, + help='The number of steps to use for training.') + parser.add_argument( + '--train-batch-size', + type=int, + default=128, + help='Batch size for training.') + parser.add_argument( + '--eval-batch-size', + type=int, + default=100, + help='Batch size for validation.') + parser.add_argument( + '--momentum', + type=float, + default=0.9, + help='Momentum for MomentumOptimizer.') + parser.add_argument( + '--weight-decay', + type=float, + default=2e-4, + help='Weight decay for convolutions.') + parser.add_argument( + '--learning-rate', + type=float, + default=0.1, + help="""\ + This is the inital learning rate value. The learning rate will decrease + during training. 
For more details check the model_fn implementation in + this file.\ + """) + parser.add_argument( + '--use-distortion-for-training', + type=bool, + default=True, + help='If doing image distortion for training.') + parser.add_argument( + '--sync', + action='store_true', + default=False, + help="""\ + If present when running in a distributed environment will run on sync mode.\ + """) + parser.add_argument( + '--num-intra-threads', + type=int, + default=0, + help="""\ + Number of threads to use for intra-op parallelism. When training on CPU + set to 0 to have the system pick the appropriate number or alternatively + set it to the number of physical CPU cores.\ + """) + parser.add_argument( + '--num-inter-threads', + type=int, + default=0, + help="""\ + Number of threads to use for inter-op parallelism. If set to 0, the + system will pick an appropriate number.\ + """) + parser.add_argument( + '--data-format', + type=str, + default=None, + help="""\ + If not set, the data format best for the training device is used. + Allowed values: channels_first (NCHW) channels_last (NHWC).\ + """) + parser.add_argument( + '--log-device-placement', + action='store_true', + default=False, + help='Whether to log device placement.') + parser.add_argument( + '--batch-norm-decay', + type=float, + default=0.997, + help='Decay for batch norm.') + parser.add_argument( + '--batch-norm-epsilon', + type=float, + default=1e-5, + help='Epsilon for batch norm.') + args = parser.parse_args() + + if args.num_gpus > 0: + assert tf.test.is_gpu_available(), "Requested GPUs but none found." + if args.num_gpus < 0: + raise ValueError( + 'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.') + if args.num_gpus == 0 and args.variable_strategy == 'GPU': + raise ValueError('num-gpus=0, CPU must be used as parameter server. Set' + '--variable-strategy=CPU.') + if (args.num_layers - 2) % 6 != 0: + raise ValueError('Invalid --num-layers parameter.') + if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0: + raise ValueError('--train-batch-size must be multiple of --num-gpus.') + if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0: + raise ValueError('--eval-batch-size must be multiple of --num-gpus.') + + main(**vars(args)) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py new file mode 100644 index 00000000000..d67c233dbba --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py @@ -0,0 +1,80 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Model class for Cifar10 Dataset.""" +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +import model_base + + +class ResNetCifar10(model_base.ResNet): + """Cifar10 model with ResNetV1 and basic residual block.""" + + def __init__(self, + num_layers, + is_training, + batch_norm_decay, + batch_norm_epsilon, + data_format='channels_first'): + super(ResNetCifar10, self).__init__( + is_training, + data_format, + batch_norm_decay, + batch_norm_epsilon + ) + self.n = (num_layers - 2) // 6 + # Add one in case label starts with 1. No impact if label starts with 0. + self.num_classes = 10 + 1 + self.filters = [16, 16, 32, 64] + self.strides = [1, 2, 2] + + def forward_pass(self, x, input_data_format='channels_last'): + """Build the core model within the graph.""" + if self._data_format != input_data_format: + if input_data_format == 'channels_last': + # Computation requires channels_first. + x = tf.transpose(x, [0, 3, 1, 2]) + else: + # Computation requires channels_last. + x = tf.transpose(x, [0, 2, 3, 1]) + + # Image standardization. + x = x / 128 - 1 + + x = self._conv(x, 3, 16, 1) + x = self._batch_norm(x) + x = self._relu(x) + + # Use basic (non-bottleneck) block and ResNet V1 (post-activation). + res_func = self._residual_v1 + + # 3 stages of block stacking. + for i in range(3): + with tf.name_scope('stage'): + for j in range(self.n): + if j == 0: + # First block in a stage, filters and strides may change. + x = res_func(x, 3, self.filters[i], self.filters[i + 1], + self.strides[i]) + else: + # Following blocks in a stage, constant filters and unit stride. + x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1) + + x = self._global_avg_pool(x) + x = self._fully_connected(x, self.num_classes) + + return x diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py new file mode 100644 index 00000000000..fca661e9a25 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py @@ -0,0 +1,154 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import collections +import six + +import tensorflow as tf + +from tensorflow.python.platform import tf_logging as logging +from tensorflow.core.framework import node_def_pb2 +from tensorflow.python.framework import device as pydev +from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import session_run_hook +from tensorflow.python.training import training_util +from tensorflow.python.training import device_setter +from tensorflow.contrib.learn.python.learn import run_config + + +# TODO(b/64848083) Remove once uid bug is fixed +class RunConfig(tf.contrib.learn.RunConfig): + def uid(self, whitelist=None): + """Generates a 'Unique Identifier' based on all internal fields. + Caller should use the uid string to check `RunConfig` instance integrity + in one session use, but should not rely on the implementation details, which + is subject to change. + Args: + whitelist: A list of the string names of the properties uid should not + include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which + includes most properties user allowes to change. + Returns: + A uid string. + """ + if whitelist is None: + whitelist = run_config._DEFAULT_UID_WHITE_LIST + + state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')} + # Pop out the keys in whitelist. + for k in whitelist: + state.pop('_' + k, None) + + ordered_state = collections.OrderedDict( + sorted(state.items(), key=lambda t: t[0])) + # For class instance without __repr__, some special cares are required. + # Otherwise, the object address will be used. + if '_cluster_spec' in ordered_state: + ordered_state['_cluster_spec'] = collections.OrderedDict( + sorted(ordered_state['_cluster_spec'].as_dict().items(), + key=lambda t: t[0]) + ) + return ', '.join( + '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state)) + + +class ExamplesPerSecondHook(session_run_hook.SessionRunHook): + """Hook to print out examples per second. + + Total time is tracked and then divided by the total number of steps + to get the average step time and then batch_size is used to determine + the running average of examples per second. The examples per second for the + most recent interval is also logged. + """ + + def __init__( + self, + batch_size, + every_n_steps=100, + every_n_secs=None,): + """Initializer for ExamplesPerSecondHook. + + Args: + batch_size: Total batch size used to calculate examples/second from + global time. + every_n_steps: Log stats every n steps. + every_n_secs: Log stats every n seconds. 
+ """ + if (every_n_steps is None) == (every_n_secs is None): + raise ValueError('exactly one of every_n_steps' + ' and every_n_secs should be provided.') + self._timer = basic_session_run_hooks.SecondOrStepTimer( + every_steps=every_n_steps, every_secs=every_n_secs) + + self._step_train_time = 0 + self._total_steps = 0 + self._batch_size = batch_size + + def begin(self): + self._global_step_tensor = training_util.get_global_step() + if self._global_step_tensor is None: + raise RuntimeError( + 'Global step should be created to use StepCounterHook.') + + def before_run(self, run_context): # pylint: disable=unused-argument + return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor) + + def after_run(self, run_context, run_values): + _ = run_context + + global_step = run_values.results + if self._timer.should_trigger_for_step(global_step): + elapsed_time, elapsed_steps = self._timer.update_last_triggered_step( + global_step) + if elapsed_time is not None: + steps_per_sec = elapsed_steps / elapsed_time + self._step_train_time += elapsed_time + self._total_steps += elapsed_steps + + average_examples_per_sec = self._batch_size * ( + self._total_steps / self._step_train_time) + current_examples_per_sec = steps_per_sec * self._batch_size + # Average examples/sec followed by current examples/sec + logging.info('%s: %g (%g), step = %g', 'Average examples/sec', + average_examples_per_sec, current_examples_per_sec, + self._total_steps) + +def local_device_setter(num_devices=1, + ps_device_type='cpu', + worker_device='/cpu:0', + ps_ops=None, + ps_strategy=None): + if ps_ops == None: + ps_ops = ['Variable', 'VariableV2', 'VarHandleOp'] + + if ps_strategy is None: + ps_strategy = device_setter._RoundRobinStrategy(num_devices) + if not six.callable(ps_strategy): + raise TypeError("ps_strategy must be callable") + + def _local_device_chooser(op): + current_device = pydev.DeviceSpec.from_string(op.device or "") + + node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def + if node_def.op in ps_ops: + ps_device_spec = pydev.DeviceSpec.from_string( + '/{}:{}'.format(ps_device_type, ps_strategy(op))) + + ps_device_spec.merge_from(current_device) + return ps_device_spec.to_string() + else: + worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "") + worker_device_spec.merge_from(current_device) + return worker_device_spec.to_string() + return _local_device_chooser diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cmle_config.yaml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cmle_config.yaml new file mode 100644 index 00000000000..76f920534ef --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cmle_config.yaml @@ -0,0 +1,6 @@ +trainingInput: + scaleTier: CUSTOM + masterType: complex_model_m_gpu + workerType: complex_model_m_gpu + parameterServerType: complex_model_m + workerCount: 1 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py new file mode 100644 index 00000000000..409cee4eaec --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py @@ -0,0 +1,114 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Read CIFAR-10 data from pickled numpy arrays and writes TFRecords. + +Generates tf.train.Example protos and writes them to TFRecord files from the +python version of the CIFAR-10 dataset downloaded from +https://www.cs.toronto.edu/~kriz/cifar.html. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + +import tarfile +from six.moves import cPickle as pickle +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +CIFAR_FILENAME = 'cifar-10-python.tar.gz' +CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME +CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py' + + +def download_and_extract(data_dir): + # download CIFAR-10 if not already downloaded. 
+ tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, + CIFAR_DOWNLOAD_URL) + tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), + 'r:gz').extractall(data_dir) + + +def _int64_feature(value): + return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) + + +def _bytes_feature(value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + +def _get_file_names(): + """Returns the file names expected to exist in the input_dir.""" + file_names = {} + file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)] + file_names['validation'] = ['data_batch_5'] + file_names['eval'] = ['test_batch'] + return file_names + + +def read_pickle_from_file(filename): + with tf.gfile.Open(filename, 'rb') as f: + data_dict = pickle.load(f) + return data_dict + + +def convert_to_tfrecord(input_files, output_file): + """Converts a file to TFRecords.""" + print('Generating %s' % output_file) + with tf.python_io.TFRecordWriter(output_file) as record_writer: + for input_file in input_files: + data_dict = read_pickle_from_file(input_file) + data = data_dict['data'] + labels = data_dict['labels'] + num_entries_in_batch = len(labels) + for i in range(num_entries_in_batch): + example = tf.train.Example(features=tf.train.Features( + feature={ + 'image': _bytes_feature(data[i].tobytes()), + 'label': _int64_feature(labels[i]) + })) + record_writer.write(example.SerializeToString()) + + +def main(data_dir): + print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL)) + download_and_extract(data_dir) + file_names = _get_file_names() + input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) + for mode, files in file_names.items(): + input_files = [os.path.join(input_dir, f) for f in files] + output_file = os.path.join(data_dir, mode + '.tfrecords') + try: + os.remove(output_file) + except OSError: + pass + # Convert to tf.train.Example and write the to TFRecords. + convert_to_tfrecord(input_files, output_file) + print('Done!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data-dir', + type=str, + default='', + help='Directory to download and extract CIFAR-10 to.') + + args = parser.parse_args() + main(args.data_dir) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py new file mode 100644 index 00000000000..35e52b8355d --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/dockerfile/with-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py @@ -0,0 +1,219 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""ResNet model. + +Related papers: +https://arxiv.org/pdf/1603.05027v2.pdf +https://arxiv.org/pdf/1512.03385v1.pdf +https://arxiv.org/pdf/1605.07146v1.pdf +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +class ResNet(object): + """ResNet model.""" + + def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon): + """ResNet constructor. + + Args: + is_training: if build training or inference model. + data_format: the data_format used during computation. + one of 'channels_first' or 'channels_last'. + """ + self._batch_norm_decay = batch_norm_decay + self._batch_norm_epsilon = batch_norm_epsilon + self._is_training = is_training + assert data_format in ('channels_first', 'channels_last') + self._data_format = data_format + + def forward_pass(self, x): + raise NotImplementedError( + 'forward_pass() is implemented in ResNet sub classes') + + def _residual_v1(self, + x, + kernel_size, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Residual unit with 2 sub layers, using Plan A for shortcut connection.""" + + del activate_before_residual + with tf.name_scope('residual_v1') as name_scope: + orig_x = x + + x = self._conv(x, kernel_size, out_filter, stride) + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, kernel_size, out_filter, 1) + x = self._batch_norm(x) + + if in_filter != out_filter: + orig_x = self._avg_pool(orig_x, stride, stride) + pad = (out_filter - in_filter) // 2 + if self._data_format == 'channels_first': + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = self._relu(tf.add(x, orig_x)) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _residual_v2(self, + x, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Residual unit with 2 sub layers with preactivation, plan A shortcut.""" + + with tf.name_scope('residual_v2') as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 3, out_filter, stride) + + x = self._batch_norm(x) + x = self._relu(x) + x = self._conv(x, 3, out_filter, [1, 1, 1, 1]) + + if in_filter != out_filter: + pad = (out_filter - in_filter) // 2 + orig_x = self._avg_pool(orig_x, stride, stride) + if self._data_format == 'channels_first': + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = tf.add(x, orig_x) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _bottleneck_residual_v2(self, + x, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Bottleneck residual unit with 3 sub layers, plan B shortcut.""" + + with tf.name_scope('bottle_residual_v2') as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + # pad when stride isn't unit + x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + x = 
self._conv(x, 1, out_filter, 1, is_atrous=True) + + if in_filter != out_filter: + orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True) + x = tf.add(x, orig_x) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _conv(self, x, kernel_size, filters, strides, is_atrous=False): + """Convolution.""" + + padding = 'SAME' + if not is_atrous and strides > 1: + pad = kernel_size - 1 + pad_beg = pad // 2 + pad_end = pad - pad_beg + if self._data_format == 'channels_first': + x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]]) + else: + x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + padding = 'VALID' + return tf.layers.conv2d( + inputs=x, + kernel_size=kernel_size, + filters=filters, + strides=strides, + padding=padding, + use_bias=False, + data_format=self._data_format) + + def _batch_norm(self, x): + if self._data_format == 'channels_first': + data_format = 'NCHW' + else: + data_format = 'NHWC' + return tf.contrib.layers.batch_norm( + x, + decay=self._batch_norm_decay, + center=True, + scale=True, + epsilon=self._batch_norm_epsilon, + is_training=self._is_training, + fused=True, + data_format=data_format) + + def _relu(self, x): + return tf.nn.relu(x) + + def _fully_connected(self, x, out_dim): + with tf.name_scope('fully_connected') as name_scope: + x = tf.layers.dense(x, out_dim) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _avg_pool(self, x, pool_size, stride): + with tf.name_scope('avg_pool') as name_scope: + x = tf.layers.average_pooling2d( + x, pool_size, stride, 'SAME', data_format=self._data_format) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _global_avg_pool(self, x): + with tf.name_scope('global_avg_pool') as name_scope: + assert x.get_shape().ndims == 4 + if self._data_format == 'channels_first': + x = tf.reduce_mean(x, [2, 3]) + else: + x = tf.reduce_mean(x, [1, 2]) + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_distributed_tf_job_spec.json b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_distributed_tf_job_spec.json new file mode 100644 index 00000000000..021312192de --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_distributed_tf_job_spec.json @@ -0,0 +1,66 @@ + +{ + "name": "distributed-tf", + "version": "1.0.0", + "components": [ + { + "name": "master", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 2 + } + } + }, + "launch_command": "cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=2", + "number_of_containers": 1 + }, + { + "name": "worker", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 2 + } + } + }, + "launch_command": "cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --eval-batch-size=16 
--train-batch-size=16 --sync --num-gpus=2", + "number_of_containers": 1 + }, + { + "name": "ps", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "2048" + }, + "launch_command": "cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --num-gpus=0", + "number_of_containers": 1 + }, + { + "name": "tensorboard", + "dependencies": [], + "resource": { + "cpus": 4, + "memory": "8192" + }, + "launch_command": "export LC_ALL=C && tensorboard --logdir=hdfs://default/tmp/cifar-10-jobdir", + "number_of_containers": 1 + } + ], + "configuration": { + "properties": {}, + "env": { + } + }, + "kerberos_principal" : { + "principal_name" : "test-user@EXAMPLE.COM", + "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab" + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_standalone_tf_job_spec.json b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_standalone_tf_job_spec.json new file mode 100644 index 00000000000..fdf8c0fa947 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/example_standalone_tf_job_spec.json @@ -0,0 +1,41 @@ + +{ + "name": "distributed-tf", + "version": "1.0.0", + "components": [ + { + "name": "worker", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 1 + } + } + }, + "launch_command": "cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=1", + "number_of_containers": 1 + }, + { + "name": "tensorboard", + "dependencies": [], + "resource": { + "cpus": 4, + "memory": "8192" + }, + "launch_command": "export LC_ALL=C && tensorboard --logdir=hdfs://default/tmp/cifar-10-jobdir", + "number_of_containers": 1 + } + ], + "configuration": { + "properties": {}, + "env": { + } + }, + "kerberos_principal" : { + "principal_name" : "test-user@EXAMPLE.COM", + "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab" + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/presetup-tf.sh_template b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/presetup-tf.sh_template new file mode 100644 index 00000000000..da4b84dacce --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/presetup-tf.sh_template @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assumptions: +# 1. java is assumed to be installed under /usr/lib/jvm/java-8-openjdk-amd64 +# 2. Using bash shell + +export HADOOP_HDFS_HOME= + +# Intentionally leave HADOOP_HOME and HADOOP_YARN_HOME empty +export HADOOP_HOME= +export HADOOP_YARN_HOME= + +# Setup $HADOOP_CONF_DIR, by default submit_tf_job.py mounts cluster +# Hadoop configs to /etc/hadoop/conf +export HADOOP_CONF_DIR=/etc/hadoop/conf + +export JAVA_HOME= +export CLASSPATH=`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`:$HADOOP_CONF_DIR +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${JAVA_HOME}/jre/lib/amd64/server \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/submit_tf_job.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/submit_tf_job.py new file mode 100644 index 00000000000..e5da94219dd --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/main/tensorflow/scripts/submit_tf_job.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import json +import os + + +def get_component_array(name, count, hostname_suffix): + component = '\\"' + name + '\\":' + component_names = '[' + for i in xrange(0, count): + component_names = component_names + '\\' + '"' + name + "-" + str( + i) + hostname_suffix + '\\"' + if i != count - 1: + component_names = component_names + ',' + component_names = component_names + ']' + return component + component_names + + +def get_key_value_pair(name, keys, values, count): + block_name = '\\"' + name + '\\":' + block_values = '' + if count == 1: + block_values = block_values + '\\' + '"' + values[0] + '\\"' + return block_name + block_values + block_values = '{' + for i in xrange(0, count): + block_values = block_values + '\\' + '"' + keys[i] + '\\"' + ':' + \ + values[i] + if i != count - 1: + block_values = block_values + ',' + block_values = block_values + '}' + return block_name + block_values + +def handle_distributed_tf_config_env(tf_json, username, domain): + num_worker = -1 + num_ps = -1 + num_master = -1 + has_tensorboard = False + + # Do we need to generate tf_config? 
First get unique component names + for c in tf_json['components']: + name = c['name'] + if name == 'worker': + num_worker = int(c['number_of_containers']) + elif name == 'ps': + num_ps = int(c['number_of_containers']) + elif name == 'master': + num_master = int(c['number_of_containers']) + elif name == 'tensorboard': + has_tensorboard = True + + + if num_worker < 0 or num_ps < 0 or num_master != 1: + return + + print "Detected workers / ps / master, generating TF_CONFIG automatically..." + + if username is None or username == '': + raise Exception("Empty username specified, please double check") + if domain is None or domain == '': + raise Exception("Empty domain name specified, please double check") + + tensorflow_common_prefix = "." + tf_json[ + 'name'] + "." + username + "." + domain + hostname_suffix = tensorflow_common_prefix + ":8000" + cluster = '{' + '\\"cluster' + '\\":{' + master = get_component_array("master", 1, hostname_suffix) + "," + ps = get_component_array("ps", num_ps, hostname_suffix) + "," + worker = get_component_array("worker", num_worker, hostname_suffix) + "}," + component_name = '\\"' + "${COMPONENT_NAME}" + '\\"' + component_id = "${COMPONENT_ID}" + task = get_key_value_pair("task", ["type", "index"], + [component_name, component_id], 2) + "," + environment = get_key_value_pair("environment", "", ["cloud"], 1) + "}" + tf_config_op = cluster + master + ps + worker + task + environment + if "configuration" not in tf_json: + tf_json['configuration'] = {'env': { } } + if "env" not in tf_json['configuration']: + tf_json['configuration']['env'] = {} + tf_json['configuration']['env']['TF_CONFIG'] = tf_config_op + + if has_tensorboard: + tensorboard_link = "http://tensorboard-0" + tensorflow_common_prefix + ":6006" + print "Tensorboard will be available at: ", tensorboard_link + # added to quicklink + if "quicklinks" not in tf_json: + tf_json["quicklinks"] = { "Tensorboard": tensorboard_link } + else: + tf_json["quicklinks"]["Tensorboard"] = tensorboard_link + +if __name__ == "__main__": + # Instantiate the parser + parser = argparse.ArgumentParser( + description='Submit Tensorflow job to YARN.') + + # Required positional argument + parser.add_argument('--remote_conf_path', type=str, + help='Remote Configuration path to run TF job' + ' should include core-site.xml/hdfs-site.xml' + '/presetup-tf.sh, etc. By default it uses hdfs:///etc/tf-configs', + required=False) + parser.add_argument('--input_spec', type=str, + help='Yarnfile specification for TF job.', + required=True) + parser.add_argument('--docker_image', type=str, + help='Docker image name for TF job.', required=False) + parser.add_argument('--env', type=str, + help='Environment variables needed for TF job in' + ' key=value format.', + required=False) + parser.add_argument('--dry_run', action='store_true', + help='When this is not specified (default behavior), ' + 'YARN service will be automatically submitted. ' + 'When this is specified, generated YARN service' + ' spec will be printed to stdout') + parser.add_argument('--job_name', type=str, + help='Specify job name of the Tensorflow job, which ' + 'will overwrite the one specified in input spec ' + 'file', + required=False) + parser.add_argument('--user', type=str, + help='Specify user name if it is different from $USER ' + '(e.g. 
kinit user)', + required=False) + parser.add_argument('--domain', type=str, + help='Cluster domain name, which should be same as ' + 'hadoop.registry.dns.domain-name in yarn-site.xml' + ', required for distributed Tensorflow', + required=False) + parser.add_argument('--kerberos', action='store_true', + help='Is this a kerberos-enabled cluster or not') + parser.add_argument('--verbose', action='store_true', + help='Print debug information') + args = parser.parse_args() + + verbose = args.verbose + + if hasattr(args, "remote_conf_path") and args.remote_conf_path is not None and args.remote_conf_path != '': + remote_path = args.remote_conf_path + else: + remote_path = "hdfs:///etc/tf-configs" + if verbose: + print "Using ", remote_conf_path, " as --remote_conf_path" + + input_json_spec = args.input_spec + do_dry_run = args.dry_run + envs_array = [] + if hasattr(args, 'env'): + envs = args.env + if envs is not None: + envs_array = envs.split(',') + if hasattr(args, 'user'): + user = args.user + if user is None: + user = os.environ['USER'] + if hasattr(args, 'domain'): + domain = args.domain + + if domain is None or domain == '': + # Hard coded logic + domain = "hwxgpu.site" + if hasattr(args, 'job_name'): + job_name = args.job_name + docker_image = None + if hasattr(args, 'docker_image'): + docker_image = args.docker_image + kerberos = args.kerberos + + # Only print when verbose + if verbose: + print "remote_path=", remote_path + print "input_spec_file=", input_json_spec + print "do_dry_run=", do_dry_run + print "user=", user + + with open(input_json_spec) as json_file: + data = json_file.read() + tf_json = json.loads(data) + + if job_name is not None: + tf_json['name'] = job_name + else: + # Otherwise, read from json file + job_name = tf_json['name'] + + # Updating per-component commands with presetup-tf.sh + for component in tf_json['components']: + # Append presetup-tf.sh to launch command + launch_cmd = '. presetup-tf.sh && ' + component['launch_command'] + component['launch_command'] = launch_cmd + + if verbose: + print "New launch command = ", launch_cmd + + if docker_image is not None and len(docker_image) > 0: + artifact = component.get('artifact') + if artifact is None or artifact.get('id') is None: + component['artifact'] = {} + component['artifact']['id'] = docker_image + component['artifact']['type'] = "DOCKER" + + if verbose: + print "Using docker image=", docker_image + + artifact = component.get('artifact') + if artifact is None or artifact.get('id') is None: + raise Exception("Docker image for components doesn't set, please" + " either set it in input spec or by passing " + "--docker-image commandline") + + # handle TF_CONFIG if needed + handle_distributed_tf_config_env(tf_json, user, domain) + + # Update conf files to mount in files section. 
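+    # core-site.xml and hdfs-site.xml (plus krb5.conf on secure clusters) are
+    # localized from remote_path and bind-mounted into the container via
+    # YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS; presetup-tf.sh is localized so the
+    # launch command can source it.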
+ spec_envs = tf_json['configuration']['env'] + docker_mounts = '' + + if spec_envs is not None and \ + spec_envs.get('YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS') is not None: + docker_mounts = spec_envs['YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS'] + + srcfiles, destfiles = [], [] + srcfiles.append(remote_path + '/core-site.xml') + srcfiles.append(remote_path + '/hdfs-site.xml') + srcfiles.append(remote_path + '/presetup-tf.sh') + destfiles.append("core-site.xml") + destfiles.append("hdfs-site.xml") + destfiles.append("presetup-tf.sh") + + if len(docker_mounts) > 0: + docker_mounts = docker_mounts + "," + docker_mounts = docker_mounts + \ + "core-site.xml:/etc/hadoop/conf/core-site.xml:ro," \ + "hdfs-site.xml:/etc/hadoop/conf/hdfs-site.xml:ro" + + if kerberos: + srcfiles.append(remote_path + '/krb5.conf') + destfiles.append('krb5.conf') + docker_mounts = docker_mounts + ",krb5.conf:/etc/krb5.conf:ro" + + docker_mounts = docker_mounts + ",/etc/passwd:/etc/passwd:ro" + \ + ",/etc/group:/etc/group:ro,/etc/docker_resolv.conf:/etc/resolv.conf:ro" + file_envs = [{"type": "STATIC", "dest_file": d, "src_file": s} for d, s in + zip(destfiles, srcfiles)] + tf_json['configuration']['files'] = file_envs + + envs_array.append('YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS=' + docker_mounts) + + # Fetch all envs passed and update in common configuration section + for env in envs_array: + if verbose: + print "Setting env=", env + key_value = env.split('=') + tf_json['configuration']['env'][key_value[0]] = key_value[1] + + jstr = json.dumps(tf_json, sort_keys=False, indent=2) + + if verbose: + print ("============= Begin of generated YARN file ==============") + print(jstr) + print ("============= End of generated YARN file ==============") + + # submit to YARN + if do_dry_run: + print("Skip submit job to YARN.") + else: + print("Submitting job to YARN.") + filename = "/tmp/tensor-flow-yarn-spec-" + user + "-" + str(time.time()) + ".json" + f = open(filename, "w") + f.write(jstr) + f.close() + cmd = "yarn app -launch " + job_name + " " + filename + print("Executing '" + cmd + "'") + os.system("yarn app -launch " + job_name + " " + filename) \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Dockerfile.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Dockerfile.md new file mode 100644 index 00000000000..6758c05b41e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Dockerfile.md @@ -0,0 +1,105 @@ + + +# Creating Docker Images for Running Tensorflow on YARN + +## How to create docker images to run Tensorflow on YARN + +Dockerfile to run Tensorflow on YARN need two part: + +**Base libraries which Tensorflow depends on** + +1) OS base image, for example ```ubuntu:16.04``` + +2) Tensorflow depended libraries and packages. For example ```python```, ```scipy```. For GPU support, need ```cuda```, ```cudnn```, etc. + +3) Tensorflow package. 
+ +**Libraries to access HDFS** + +1) JDK + +2) Hadoop + +Here's an example of a base image (w/o GPU support) to install Tensorflow: +``` +FROM ubuntu:16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl +``` + +On top of above image, add files, install packages to access HDFS +``` +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz +``` + +Build and push to your own docker registry: Use ```docker build ... ``` and ```docker push ...``` to finish this step. + +## Use examples to build your own Tensorflow docker images + +We provided following examples for you to build tensorflow docker images. + +For Tensorflow 1.3.0 (Precompiled to CUDA 8.x) + +- *base/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0*: Tensorflow 1.3.0 supports CPU only. +- *with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.3.0*: Tensorflow 1.3.0 supports CPU only, and included models +- *base/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0*: Tensorflow 1.3.0 supports GPU, which is prebuilt to CUDA8. +- *with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.3.0*: Tensorflow 1.3.0 supports GPU, which is prebuilt to CUDA8, with models. + +For Tensorflow 1.8.0 (Precompiled to CUDA 9.x) + +- *base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only. +- *with-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only, and included models +- *base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9. +- *with-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9, with models. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Overview.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Overview.md new file mode 100644 index 00000000000..6fe90290b34 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/Overview.md @@ -0,0 +1,25 @@ + + +# Deep Learning Frameworks Running on YARN + +## Contents +Yarn Service framework provides first class support to host long running services natively in YARN. This provides an easier way for users to run Deep Learning workloads on top of YARN. This document summarizes steps and utilities available in YARN to ease the complexities in running Machine Learning workloads. 
A few of the supported Deep Learning frameworks are listed below:
+
+### Running Tensorflow on YARN
+ * [Tutorials: Running Tensorflow on YARN](TensorflowOnYarnTutorial.html)
+ * [How to write a Dockerfile to run Tensorflow on YARN](Dockerfile.html)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingHelperScript.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingHelperScript.md
new file mode 100644
index 00000000000..3e635e165d7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingHelperScript.md
@@ -0,0 +1,237 @@
+
+
+# Run Tensorflow Jobs Using Helper Script
+
+## Prerequisites
+1) Sufficient permissions (authentication and authorization, e.g. via `kinit`) are needed for the user on a secure cluster to run the `submit_tf_job.py` script, since it can also submit the Tensorflow service directly to YARN.
+
+2) Use the `--input_spec` argument to point at a spec file that serves as a template. This package ships a sample spec file, `example_tf_job_spec.json`, which can be edited to specify per-component resources (such as memory, cpu and gpu) and the Kerberos specification if needed.
+
+## Set up the presetup-tf.sh template
+1) Rename `presetup-tf.sh_template` to `presetup-tf.sh`.
+
+2) In `presetup-tf.sh`:
+
+   - Update `HADOOP_HDFS_HOME` to a valid value. This should point to `HADOOP_HDFS_HOME` **inside the docker image**.
+
+   - Update `JAVA_HOME` as per the environment setup. This should point to `JAVA_HOME` **inside the docker image**.
+
+3) Place `presetup-tf.sh` in HDFS under `hdfs://host:port//`.
+
+4) Ensure that `` is accessible with correct permissions for the user.
+
+5) Upload core-site.xml and hdfs-site.xml to ``.
+
+6) (when security is enabled) Upload krb5.conf to ``.
+
+## Run `submit_tf_job.py` to submit a Tensorflow job to YARN
+
+Run the command below to submit a Tensorflow job to YARN or to generate a valid Yarnfile for the job.
+
+`python submit_tf_job.py --remote_conf_path --input_spec --docker_image --env --job_name --user --domain --kerberos`
+
+Detailed argument summary for the `submit_tf_job.py` command:
+
+```
+optional arguments:
+  -h, --help            show this help message and exit
+  --remote_conf_path REMOTE_CONF_PATH
+                        Remote Configuration path to run TF job should include
+                        core-site.xml/hdfs-site.xml/presetup-tf.sh, etc. By
+                        default it uses hdfs:///etc/tf-configs
+  --input_spec INPUT_SPEC
+                        Yarnfile specification for TF job.
+  --docker_image DOCKER_IMAGE
+                        Docker image name for TF job.
+  --env ENV             Environment variables needed for TF job in key=value
+                        format.
+  --dry_run             When this is not specified (default behavior), YARN
+                        service will be automatically submitted. When this is
+                        specified, generated YARN service spec will be printed
+                        to stdout
+  --job_name JOB_NAME   Specify job name of the Tensorflow job, which will
+                        overwrite the one specified in input spec file
+  --user USER           Specify user name if it is different from $USER (e.g.
+ kinit user) + --domain DOMAIN Cluster domain name, which should be same as + hadoop.registry.dns.domain-name in yarn-site.xml, + required for distributed Tensorflow + --kerberos Is this a kerberos-enabled cluster or not + --verbose Print debug information +``` + +Example: +`python submit_tf_job.py --input_spec example_tf_job_spec.json --docker_image tf-gpu:ubuntu-xyz --job_name distributed-tf --user ambari-qa --domain --remote_conf_path hdfs:///tf-job-conf/configs` + +## Provide `input-spec` file to run Tensorflow jobs + +### Run standalone TF job. + +``` + +{ + "name": "standalone-tf", + "version": "1.0.0", + "components": [ + { + "name": "worker", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 1 + } + } + }, + "launch_command": "python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=1 --eval-batch-size=16 --train-batch-size=16 --sync", + "number_of_containers": 1 + } + ], + "configuration": { + "properties": {}, + "env": { + } + }, + "kerberos_principal" : { + "principal_name" : "test-user@EXAMPLE.COM", + "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab" + } +} + +``` + +Notes: + +- `hdfs://host:port//presetup-tf.sh` will be automatically downloaded and mounted to the docker container, it will be executed before invoking `launch_command` of components specified in the spec. +- Component name can be customized (In above example it uses `worker`) +- In `resource` section, you can specify cpu/memory/gpu if you needed. +- Additional environment variables can be specified under `env`. This will be passed to launched docker container process. + +### Run distributed TF job. + +``` +{ + "name": "distributed-tf", + "version": "1.0.0", + "components": [ + { + "name": "master", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 1 + } + } + }, + "launch_command": "cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=1 --eval-batch-size=16 --train-batch-size=16 --sync", + "number_of_containers": 1 + }, + { + "name": "worker", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 1 + } + } + }, + "launch_command": "cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=1 --eval-batch-size=16 --train-batch-size=16 --sync", + "number_of_containers": 1 + }, + { + "name": "ps", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "2048", + "additional" : { + "yarn.io/gpu" : { + "value" : 1 + } + } + }, + "launch_command": "cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --num-gpus=1", + "number_of_containers": 1 + } + ], + "configuration": { + "properties": {}, + "env": { + } + }, + "kerberos_principal" : { + "principal_name" : "test-user@EXAMPLE.COM", + "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab" + } +} +``` + +Notes: (In addition to standalone TF spec) + +- For distributed Tensorflow launch spec, `master`, `worker`, `ps` components are mandatory. +- Different value of `num_of_containers` can be specified for `worker` and `ps`. 
+- `TF_CONFIG` will be automatically generated and inserted into the spec when launching a distributed training job, so you don't need to set it yourself.
+
+### Additional information about writing a Yarnfile
+
+#### Security
+
+YARN Native Services provides an easy way to run services in a secure cluster by supplying the keytab and Kerberos principal as simple JSON input parameters.
+```
+"kerberos_principal" : {
+  "principal_name" : ,
+  "keytab" : 
+}
+```
+Users can define the Kerberos principal name (e.g. test-user@EXAMPLE.COM) and a keytab file path on HDFS or the local file system. Given this information, the service can be run from a secured shell.
+
+#### Choosing correct Docker images for the application
+```
+"artifact" : {
+  "id" : ,
+  "type" : "DOCKER"
+}
+```
+Under each service component, the user needs to provide a docker image name so that the native service uses this image to launch containers.
+`--docker_image` can be used to override whatever is defined in the input job spec.
+
+## General Guidelines
+
+1) For many env configurations, extra escape characters are used so that the native service can export the correct environment variables. An improvement to this is being tracked in YARN-8257.
+
+2) In secure clusters, the user needs to ensure that the app is launched from a secure shell (e.g. with a proper Kerberos token).
+
+## End-to-end example
+
+### Run Cifar10 distributed Tensorflow training on a GPU/security-enabled cluster
+
+#### Launch Command
+```
+python submit_tf_job.py --remote_conf_path hdfs:///tf-job-conf --input_spec example_tf_job_spec.json --docker_image gpu.cuda_9.0.tf_1.8.0 --job_name distributed-tf-gpu --user tf-user --domain tensorflow.site --kerberos
+```
+
+- The Dockerfile for `docker_image` can be found under `tensorflow/dockerfile/with-models/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0` in the Hadoop codebase; this example assumes a docker image named `gpu.cuda_9.0.tf_1.8.0` has been built from it.
+- The `input_spec` file can be found under `tensorflow/scripts/example_tf_job_spec.json` in the Hadoop codebase; edit it as needed.
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingNativeServiceSpec.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingNativeServiceSpec.md
new file mode 100644
index 00000000000..36c6fd3fc54
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/RunTensorflowJobUsingNativeServiceSpec.md
@@ -0,0 +1,247 @@
+
+
+## Using a raw YARN native service spec to run a job
+
+A Yarnfile is a normal JSON file. Typically, save the Yarnfile to a local file and launch it with the following command:
+
+```
+yarn app -launch distributed-tf /path/to/Yarnfile
+```
+
+Alternatively, use curl to POST the Yarnfile to the ResourceManager:
+
+```
+hadoop fs -rmr /tmp/cifar-10-jobdir;
+yarn application -destroy distributed-tf;
+curl --negotiate -u: -H "Content-Type: application/json" \
+  -X POST http://:8088/app/v1/services -d '... content of Yarnfile...'
+```
+
+## Example Yarnfiles
+
+### Generate Dockerfiles
+
+Please refer to [Dockerfile for running Tensorflow on YARN](Dockerfile.html) for more details.
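+### Submitting the spec programmatically (optional)
+
+For reference, the submission can also be scripted against the same REST endpoint used by the curl example above. Below is a minimal Python 2 sketch (not part of the shipped helper scripts); it assumes an unsecured cluster (no SPNEGO/Kerberos negotiation), and the ResourceManager host and Yarnfile path are illustrative placeholders.
+
+```
+import json
+import urllib2
+
+RM_HOST = "rm-host.example.com"        # illustrative ResourceManager host
+YARNFILE = "/tmp/distributed-tf.json"  # illustrative path to a saved Yarnfile
+
+# Fail fast if the Yarnfile is not valid JSON.
+with open(YARNFILE) as f:
+    spec = json.load(f)
+
+# POST the spec to the YARN services REST API shown above.
+req = urllib2.Request(
+    url="http://" + RM_HOST + ":8088/app/v1/services",
+    data=json.dumps(spec),
+    headers={"Content-Type": "application/json"})
+
+print urllib2.urlopen(req).read()
+```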
+ +### Single node Tensorflow (with GPU and access Kerberorized HDFS) + +``` +{ + "name": "single-node-tensorflow", + "version": "1.0.0", + "components": [ + { + "artifact" : { + "id" : , + "type" : "DOCKER" + }, + "name": "worker", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096", + "additional" : { + "yarn.io/gpu" : { + "value" : 2 + } + } + }, + "launch_command": "export HADOOP_HDFS_HOME=/hadoop-3.1.0; export HADOOP_HOME=; export HADOOP_YARN_HOME=; export HADOOP_CONF_DIR=/etc/hadoop/conf; export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/; export CLASSPATH=\\`\\$HADOOP_HDFS_HOME/bin/hadoop classpath --glob\\`; export LD_LIBRARY_PATH=\\$LD_LIBRARY_PATH:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/; cd /test/cifar10_estimator && ls -l && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --num-gpus=1 --train-batch-size=16 --train-steps=40000", + "number_of_containers": 1, + "run_privileged_container": false + } + ], + "kerberos_principal" : { + "principal_name" : "test-user@EXAMPLE.COM", + "keytab" : "file:///etc/security/keytabs/test-user.headless.keytab" + } +} +``` + +### Distributed Tensorflow (with CPU and access non-Kerberorized HDFS) + +#### Generate TF_CONFIG + +```TF_CONFIG``` is an environment variable which passes training parameters to Tensorflow. It is widely used to run distributed Tensorflow training. + +Here's an example of ```TF_CONFIG``` + +``` +{ + "cluster":{ + "master":[ + ":" + ], + "ps":[ + ":", + ":", + ":" + ... + ], + "worker":[ + ":", + ":", + ":" + ... + ] + }, + "task":{ + "type": "worker", + "index": 0 + }, + "environment":"cloud" +} +``` + +It includes two parts, the first is ```cluster```. ```cluster``` is a collection of endpoints of all roles of a Tensorflow job. Roles include: + +- ```ps```: saves the parameters among all workers. All workers can read/write/update the parameters for model via ps. As some models are extremely large the parameters are shared among the ps (each ps stores a subset). +- ```worker```: does the training. +- ```master```: basically a special worker, it does training, but also restores and saves checkpoints and do evaluation. + +```cluster``` part is identical to all roles of a given Tensorflow job. + +(Description of these roles copied from https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator) + +The second is ```task```, which describes role of the launched process, which is different for different roles. For example, if ```task``` is specified to: +``` + "task":{ + "type": "worker", + "index": 0 + } +``` +The launched instance will use: ```:``` as endpoint. 
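+To make the expected structure concrete, here is a minimal sketch (not one of the shipped helper scripts) that assembles a ```TF_CONFIG``` value directly with the ```json``` module. The ```<component>-<index>.<service>.<user>.<domain>:8000``` hostname pattern and port 8000 follow the YARN registry DNS names shown in the sample outputs of this tutorial; unlike the generator script below, which leaves the task type and index as ```${COMPONENT_NAME}```/```${COMPONENT_ID}``` placeholders to be substituted per container, this sketch takes them as explicit arguments, and all concrete values are illustrative.
+
+```
+import json
+
+def build_tf_config(user, domain, service, num_workers, num_ps,
+                    task_type, task_index, port=8000):
+    # Hostnames follow the YARN registry DNS pattern:
+    #   <component>-<index>.<service>.<user>.<domain>:<port>
+    def hosts(component, count):
+        return ["{0}-{1}.{2}.{3}.{4}:{5}".format(component, i, service,
+                                                 user, domain, port)
+                for i in range(count)]
+
+    return json.dumps({
+        "cluster": {
+            "master": hosts("master", 1),
+            "worker": hosts("worker", num_workers),
+            "ps": hosts("ps", num_ps),
+        },
+        "task": {"type": task_type, "index": task_index},
+        "environment": "cloud",
+    })
+
+# Example: TF_CONFIG for worker 0 of a service named distributed-tf.
+print build_tf_config("tf-user", "tensorflow.site", "distributed-tf",
+                      num_workers=2, num_ps=1,
+                      task_type="worker", task_index=0)
+```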
+ +Following script can be used to generate ```TF_CONFIG```: + +``` +import sys +def get_component_array(name, count, hostname_suffix): + component = "\\\\" + '\\"' + name + "\\\\" + '\\":' + component_names = '[' + for i in xrange(0, count): + component_names = component_names + "\\\\" + '\\' + '"' + name + "-" + str(i) + hostname_suffix + "\\\\" + '\\"' + if i != count - 1: + component_names = component_names + ',' + component_names = component_names + ']' + return component + component_names +def get_key_value_pair(name, keys, values, count): + block_name = "\\\\" + '\\"' + name + "\\\\" + '\\":' + block_values = '' + if count == 1: + block_values = block_values + '\\' + "\\\\" + '"' + values[0] + "\\\\" + '\\"' + return block_name + block_values + block_values = '{' + for i in xrange(0, count): + block_values = block_values + "\\\\" + '\\' + '"' + keys[i] + "\\\\" + '\\"' + ':' + values[i] + if i != count - 1: + block_values = block_values + ',' + block_values = block_values + '}' + return block_name + block_values +# Generate TF_CONFIG from username and domain name. Use this to create an ENV variable which could be used as env in native service spec. +if len (sys.argv) != 6 : + print "Usage: python generate_tf_config.py " + sys.exit (1) +username = sys.argv[1] +domain = sys.argv[2] +servicename = sys.argv[3] +num_worker = int(sys.argv[4]) +num_ps = int(sys.argv[5]) +hostname_suffix = "." + servicename + "."+ username + "." + domain + ":8000" +cluster = '"{' + "\\\\" + '\\"cluster' + "\\\\" + '\\":{' +master = get_component_array("master", 1, hostname_suffix) + "," +ps = get_component_array("ps", num_ps, hostname_suffix) + "," +worker = get_component_array("worker", num_worker, hostname_suffix) + "}," +component_name = "\\\\" + '\\"' + "${COMPONENT_NAME}" + "\\\\" + '\\"' +component_id = "${COMPONENT_ID}" +task = get_key_value_pair("task", ["type", "index"], [component_name, component_id], 2) + "," +env = get_key_value_pair("environment", "", ["cloud"], 1) + '}"' +print '"{}"'.format("TF_CONFIG"), ":" , cluster, master, ps, worker, task, env +``` + +Running + +``` +python path/to/saved/python-file example.com distributed-tf 10 3 +``` + +Generates ```TF_CONFIG``` for given user_name, domain name at example.com (which is same as ```hadoop.registry.dns.domain-name``` in ```yarn-site.xml```), service name is ```distributed-tf```. 10 workers (exclude master), and 3 parameter servers. The python script can be tailored according to your cluster environment. 
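+On the consuming side, ```tf.estimator```-based programs such as the ```cifar10_estimator``` example read ```TF_CONFIG``` from the environment automatically. To check what a launched container actually received, an illustrative snippet is:
+
+```
+import json
+import os
+
+# Print the cluster layout and this task's role as seen inside the container.
+tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
+print "cluster:", tf_config.get("cluster")
+print "task:", tf_config.get("task")
+```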
+ +#### Yarnfile +``` +{ + "name": "distributed-tf", + "version": "1.0.0", + "components": [ + { + "name": "master", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096" + }, + "artifact" : { + "id" : , + "type" : "DOCKER" + }, + "launch_command": "export HADOOP_HDFS_HOME=/hadoop-3.1.0; export HADOOP_HOME=; export HADOOP_YARN_HOME=; export CLASSPATH=\\`\\$HADOOP_HDFS_HOME/bin/hadoop classpath --glob\\`; export LD_LIBRARY_PATH=\\$LD_LIBRARY_PATH:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/; cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=0 --eval-batch-size=16 --train-batch-size=16 --sync", + "number_of_containers": 1, + "run_privileged_container": false + }, + { + "name": "worker", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "4096" + }, + "artifact" : { + "id" : , + "type" : "DOCKER" + }, + "launch_command": "export HADOOP_HDFS_HOME=/hadoop-3.1.0; export HADOOP_HOME=; export HADOOP_YARN_HOME=; export CLASSPATH=\\`\\$HADOOP_HDFS_HOME/bin/hadoop classpath --glob\\`; export LD_LIBRARY_PATH=\\$LD_LIBRARY_PATH:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/; cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --train-steps=10000 --num-gpus=0 --eval-batch-size=16 --train-batch-size=16 --sync", + "number_of_containers": 1, + "run_privileged_container": false + }, + { + "name": "ps", + "dependencies": [], + "resource": { + "cpus": 1, + "memory": "2048" + }, + "artifact" : { + "id" : , + "type" : "DOCKER" + }, + "launch_command": "export HADOOP_HDFS_HOME=/hadoop-3.1.0; export HADOOP_HOME=; export HADOOP_YARN_HOME=; export CLASSPATH=\\`\\$HADOOP_HDFS_HOME/bin/hadoop classpath --glob\\`; export LD_LIBRARY_PATH=\\$LD_LIBRARY_PATH:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/; cd /test/cifar10_estimator && python cifar10_main.py --data-dir=hdfs://default/tmp/cifar-10-data --job-dir=hdfs://default/tmp/cifar-10-jobdir --num-gpus=0", + "number_of_containers": 1, + "run_privileged_container": false + } + ], + "configuration": { + "properties": {}, + "env": { + "TF_CONFIG" : , + "HADOOP_CONF_DIR" : "/etc/hadoop/conf", + "JAVA_HOME" : "/usr/lib/jvm/java-8-openjdk-amd64/jre/", + "YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK": "bridge" + } + } +} +``` diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/TensorflowOnYarnTutorial.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/TensorflowOnYarnTutorial.md new file mode 100644 index 00000000000..a5c27f38b82 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/markdown/TensorflowOnYarnTutorial.md @@ -0,0 +1,137 @@ + +# Tutorial: Running Tensorflow on YARN + +## Prepare data for training + +CIFAR-10 is a common benchmark in machine learning for image recognition. Below example is based on CIFAR-10 dataset. 
+ +1) Checkout https://github.com/tensorflow/models/: +``` +git clone https://github.com/tensorflow/models/ +``` + +2) Go to `models/tutorials/image/cifar10_estimator` + +3) Generate data by using following command: (required Tensorflow installed) + +``` +python generate_cifar10_tfrecords.py --data-dir=/tmp/cifar-10-data +``` + +4) Upload data to HDFS + +``` +hadoop fs -put /tmp/cifar-10-data/ /tmp/ +``` + +**Please note that:** + +a. All following examples are using ```/tmp/cifar-10-jobdir``` as snapshot directory for training, so suggest to run: +``` +hadoop fs -rmr /tmp/cifar-10-jobdir +``` +to cleanup snapshot between runs. + +b. YARN service doesn't allow multiple services with the same name, so please run following command +``` +yarn application -destroy +``` +to delete services if you want to reuse the same service name. + +## Run Tensorflow jobs + +### Use auto generated Yarnfile and run job + +With the help of custom python scripts, user can run Tensorflow jobs easily by auto creating Yarnfile. + +Please refer to [Run Tensorflow Job Using Helper Script](RunTensorflowJobUsingHelperScript.html) for more details. This is also recommended approach. + +### Use raw YARN service spec to run job + +This is useful if you want to understand things happening behind-the-scene. But this is not recommended for end-user to use. + +Please refer to [Run Tensorflow Job Using Raw Native Service Spec](RunTensorflowJobUsingNativeServiceSpec.html) for more details. + +## Outputs + +Sample output of master: +``` +... +allow_soft_placement: true +, '_tf_random_seed': None, '_task_type': u'master', '_environment': u'cloud', '_is_chief': True, '_cluster_spec': , '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +... +2018-05-06 22:29:14.656022: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> localhost:8000} +2018-05-06 22:29:14.656097: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:29:14.656112: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:29:14.659359: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +... +INFO:tensorflow:Restoring parameters from hdfs://default/tmp/cifar-10-jobdir/model.ckpt-0 +INFO:tensorflow:Evaluation [1/625] +INFO:tensorflow:Evaluation [2/625] +INFO:tensorflow:Evaluation [3/625] +INFO:tensorflow:Evaluation [4/625] +INFO:tensorflow:Evaluation [5/625] +INFO:tensorflow:Evaluation [6/625] +... 
+INFO:tensorflow:Validation (step 1): loss = 1220.6445, global_step = 1, accuracy = 0.1 +INFO:tensorflow:loss = 6.3980675, step = 0 +INFO:tensorflow:loss = 6.3980675, learning_rate = 0.1 +INFO:tensorflow:global_step/sec: 2.34092 +INFO:tensorflow:Average examples/sec: 1931.22 (1931.22), step = 100 +INFO:tensorflow:Average examples/sec: 354.236 (38.6479), step = 110 +INFO:tensorflow:Average examples/sec: 211.096 (38.7693), step = 120 +INFO:tensorflow:Average examples/sec: 156.533 (38.1633), step = 130 +INFO:tensorflow:Average examples/sec: 128.6 (38.7372), step = 140 +INFO:tensorflow:Average examples/sec: 111.533 (39.0239), step = 150 +``` + +Sample output of worker: +``` +, '_tf_random_seed': None, '_task_type': u'worker', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': , '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +... +2018-05-06 22:28:45.807936: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:28:45.808040: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:28:45.808064: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> localhost:8000} +2018-05-06 22:28:45.809919: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +... +INFO:tensorflow:loss = 5.319096, step = 0 +INFO:tensorflow:loss = 5.319096, learning_rate = 0.1 +INFO:tensorflow:Average examples/sec: 49.2338 (49.2338), step = 10 +INFO:tensorflow:Average examples/sec: 52.117 (55.3589), step = 20 +INFO:tensorflow:Average examples/sec: 53.2754 (55.7541), step = 30 +INFO:tensorflow:Average examples/sec: 53.8388 (55.6028), step = 40 +INFO:tensorflow:Average examples/sec: 54.1082 (55.2134), step = 50 +INFO:tensorflow:Average examples/sec: 54.3141 (55.3676), step = 60 +``` + +Sample output of ps: +``` +... +, '_tf_random_seed': None, '_task_type': u'ps', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': , '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +... 
+2018-05-06 22:28:42.562316: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:28:42.562408: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000} +2018-05-06 22:28:42.562433: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:28:42.564242: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +``` \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/site.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/site.xml new file mode 100644 index 00000000000..a52d1638b55 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-deep-learning-frameworks/src/site/site.xml @@ -0,0 +1,28 @@ + + + + + org.apache.maven.skins + maven-stylus-skin + ${maven-stylus-skin.version} + + + + + + + + + \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml index 490e9ad5b93..71b2bc3ae7d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml @@ -37,6 +37,7 @@ hadoop-yarn-applications-distributedshell hadoop-yarn-applications-unmanaged-am-launcher hadoop-yarn-services + hadoop-yarn-deep-learning-frameworks