From f7b006691844d0db68ddca6f40f0e4d39dfe3a55 Mon Sep 17 00:00:00 2001
From: Manthan Thakker
Date: Thu, 13 Aug 2020 16:56:12 -0700
Subject: [PATCH] docker images for spark3.0

Docker images to deploy an AZTK Batch account cluster with Spark 3.0
---
 .../anaconda/spark3.0.0/base/Dockerfile | 155 ++++++++++++++++++
 docker-image/base/spark3.0.0/Dockerfile | 127 ++++++++++++++
 2 files changed, 282 insertions(+)
 create mode 100644 docker-image/anaconda/spark3.0.0/base/Dockerfile
 create mode 100644 docker-image/base/spark3.0.0/Dockerfile

diff --git a/docker-image/anaconda/spark3.0.0/base/Dockerfile b/docker-image/anaconda/spark3.0.0/base/Dockerfile
new file mode 100644
index 00000000..8dc53b7a
--- /dev/null
+++ b/docker-image/anaconda/spark3.0.0/base/Dockerfile
@@ -0,0 +1,155 @@
+## TODO (AztkTeam): publish the base image to the official aztk Docker Hub and pull the published image
+########################################################################################
+# Ubuntu 16.04 (Xenial)
+FROM ubuntu:16.04
+
+# set AZTK version compatibility
+ENV AZTK_DOCKER_IMAGE_VERSION 0.1.0
+
+# set version of python required for aztk
+ENV AZTK_PYTHON_VERSION=3.5.2
+
+# modify these ENV values at build time to specify your desired versions of Spark/Hadoop
+ENV SPARK_VERSION_KEY 3.0.0
+ENV SPARK_FULL_VERSION spark-${SPARK_VERSION_KEY}-bin-without-hadoop
+ENV HADOOP_VERSION 2.7.4
+ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+# set env vars
+ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
+ENV SPARK_HOME /home/spark-current
+ENV PATH $SPARK_HOME/bin:$PATH
+
+RUN apt-get clean \
+    && apt-get update -y \
+    # install dependency packages
+    && apt-get install -y --no-install-recommends \
+        make \
+        build-essential \
+        zlib1g-dev \
+        libssl-dev \
+        libbz2-dev \
+        libreadline-dev \
+        libsqlite3-dev \
+        maven \
+        wget \
+        curl \
+        llvm \
+        git \
+        libncurses5-dev \
+        libncursesw5-dev \
+        python3-pip \
+        python3-venv \
+        xz-utils \
+        tk-dev \
+    && apt-get update -y \
+    && apt-get autoclean \
+    # install [software-properties-common]
+    # so we can use [apt-add-repository] to add the repository [ppa:webupd8team/java];
+    # Java 8 itself is installed below from Ubuntu's default-jdk
+    && apt-get install -y --no-install-recommends software-properties-common \
+    && apt-add-repository ppa:webupd8team/java -y \
+    && apt-get update -y
+
+# install java
+RUN apt-get install -y --no-install-recommends default-jdk
+
+# set up user python and aztk python
+RUN ln -s /usr/bin/python3.5 /usr/bin/python \
+    && /usr/bin/python -m pip install --upgrade pip setuptools wheel \
+    && apt-get remove -y python3-pip
+
+# build and install spark
+RUN git clone https://github.com/apache/spark.git \
+    && cd spark \
+    && git checkout tags/v${SPARK_VERSION_KEY} \
+    && export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \
+    && ./dev/make-distribution.sh --name custom-spark --pip --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \
+    && tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \
+    && ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \
+    && rm -rf /spark
+
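+# The pom.xml written below is a throwaway Maven project used purely for dependency
+# resolution: `mvn dependency:copy-dependencies` pulls hadoop-azure, mssql-jdbc and
+# azure-storage (plus their transitive jars, with conflicting ones excluded) into
+# ${SPARK_HOME}/jars so Spark can read Azure Storage (wasb://) out of the box.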
+ "" \ + "com.microsoft.azure" \ + "azure-storage" \ + "2.2.0" \ + "" \ + "" \ + "com.fasterxml.jackson.core" \ + "jackson-core" \ + "" \ + "" \ + "org.apache.commons" \ + "commons-lang3" \ + "" \ + "" \ + "org.slf4j" \ + "slf4j-api" \ + "" \ + "" \ + "" \ + "" \ + "" > /tmp/pom.xml \ + && cd /tmp \ + && mvn dependency:copy-dependencies -DoutputDirectory="${SPARK_HOME}/jars/" + # cleanup + && apt-get --purge autoremove -y maven python3-pip \ + && apt-get autoremove -y \ + && apt-get autoclean -y \ + && rm -rf /tmp/* \ + && rm -rf /root/.cache \ + && rm -rf /root/.m2 \ + && rm -rf /var/lib/apt/lists/* + +CMD ["/bin/bash"] +######################################################################################## + +## Install Anaconda & Jupyter + +ARG ANACONDA_VERSION=Anaconda3-5.1.0 + +ENV PATH /opt/conda/bin:$PATH +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 + +RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ + libglib2.0-0 libxext6 libsm6 libxrender1 \ + git mercurial subversion \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ + && /bin/bash ~/anaconda.sh -b -p /opt/conda \ + && rm ~/anaconda.sh \ + && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ + && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ + # reset default python to 3.5 + && rm /usr/bin/python \ + && ln -s /usr/bin/python3.5 /usr/bin/python + +CMD ["/bin/bash"] + diff --git a/docker-image/base/spark3.0.0/Dockerfile b/docker-image/base/spark3.0.0/Dockerfile new file mode 100644 index 00000000..6a756549 --- /dev/null +++ b/docker-image/base/spark3.0.0/Dockerfile @@ -0,0 +1,127 @@ +# Ubuntu 16.04 (Xenial) +FROM ubuntu:16.04 + +# set AZTK version compatibility +ENV AZTK_DOCKER_IMAGE_VERSION 0.1.0 + +# set version of python required for aztk +ENV AZTK_PYTHON_VERSION=3.5.2 + +# modify these ARGs on build time to specify your desired versions of Spark/Hadoop +ENV SPARK_VERSION_KEY 3.0.0 +ENV SPARK_FULL_VERSION spark-${SPARK_VERSION_KEY}-bin-without-hadoop +ENV HADOOP_VERSION 2.7.4 +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 + +# set env vars +ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 +ENV SPARK_HOME /home/spark-current +ENV PATH $SPARK_HOME/bin:$PATH + +RUN apt-get clean \ + && apt-get update -y \ + # install dependency packages + && apt-get install -y --no-install-recommends \ + make \ + build-essential \ + zlib1g-dev \ + libssl-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + maven \ + wget \ + curl \ + llvm \ + git \ + libncurses5-dev \ + libncursesw5-dev \ + python3-pip \ + python3-venv \ + xz-utils \ + tk-dev \ + && apt-get update -y \ + && apt-get autoclean \ + # install [software-properties-common] + # so we can use [apt-add-repository] to add the repository [ppa:webupd8team/java] + # from which we install Java8 + && apt-get install -y --no-install-recommends software-properties-common \ + && apt-add-repository ppa:webupd8team/java -y \ + && apt-get update -y + # install java +RUN apt-get install -y --no-install-recommends default-jdk + # set up user python and aztk python +RUN ln -s /usr/bin/python3.5 /usr/bin/python \ + && /usr/bin/python -m pip install --upgrade pip setuptools wheel \ + && apt-get remove -y python3-pip + +# build and install spark +RUN git clone https://github.com/apache/spark.git \ + && cd spark \ + && git checkout tags/v${SPARK_VERSION_KEY} \ + && export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \ + && 
+# build and install spark
+RUN git clone https://github.com/apache/spark.git \
+    && cd spark \
+    && git checkout tags/v${SPARK_VERSION_KEY} \
+    && export MAVEN_OPTS="-Xmx3g -XX:ReservedCodeCacheSize=1024m" \
+    && ./dev/make-distribution.sh --name custom-spark --pip --tgz -Pnetlib-lgpl -Phive -Phive-thriftserver -Dhadoop.version=${HADOOP_VERSION} -DskipTests \
+    && tar -xvzf /spark/spark-${SPARK_VERSION_KEY}-bin-custom-spark.tgz --directory=/home \
+    && ln -s "/home/spark-${SPARK_VERSION_KEY}-bin-custom-spark" /home/spark-current \
+    && rm -rf /spark
+
+# copy azure storage jars and dependencies to $SPARK_HOME/jars
+RUN echo "<project>" \
+    "<modelVersion>4.0.0</modelVersion>" \
+    "<groupId>groupId</groupId>" \
+    "<artifactId>artifactId</artifactId>" \
+    "<version>1.0</version>" \
+    "<dependencies>" \
+    "<dependency>" \
+    "<groupId>org.apache.hadoop</groupId>" \
+    "<artifactId>hadoop-azure</artifactId>" \
+    "<version>${HADOOP_VERSION}</version>" \
+    "<exclusions>" \
+    "<exclusion>" \
+    "<groupId>org.apache.hadoop</groupId>" \
+    "<artifactId>hadoop-common</artifactId>" \
+    "</exclusion>" \
+    "<exclusion>" \
+    "<groupId>com.fasterxml.jackson.core</groupId>" \
+    "<artifactId>jackson-core</artifactId>" \
+    "</exclusion>" \
+    "</exclusions>" \
+    "</dependency>" \
+    "<dependency>" \
+    "<groupId>com.microsoft.sqlserver</groupId>" \
+    "<artifactId>mssql-jdbc</artifactId>" \
+    "<version>6.4.0.jre8</version>" \
+    "</dependency>" \
+    "<dependency>" \
+    "<groupId>com.microsoft.azure</groupId>" \
+    "<artifactId>azure-storage</artifactId>" \
+    "<version>2.2.0</version>" \
+    "<exclusions>" \
+    "<exclusion>" \
+    "<groupId>com.fasterxml.jackson.core</groupId>" \
+    "<artifactId>jackson-core</artifactId>" \
+    "</exclusion>" \
+    "<exclusion>" \
+    "<groupId>org.apache.commons</groupId>" \
+    "<artifactId>commons-lang3</artifactId>" \
+    "</exclusion>" \
+    "<exclusion>" \
+    "<groupId>org.slf4j</groupId>" \
+    "<artifactId>slf4j-api</artifactId>" \
+    "</exclusion>" \
+    "</exclusions>" \
+    "</dependency>" \
+    "</dependencies>" \
+    "</project>" > /tmp/pom.xml \
+    && cd /tmp \
+    && mvn dependency:copy-dependencies -DoutputDirectory="${SPARK_HOME}/jars/" \
+    # cleanup
+    && apt-get --purge autoremove -y maven python3-pip \
+    && apt-get autoremove -y \
+    && apt-get autoclean -y \
+    && rm -rf /tmp/* \
+    && rm -rf /root/.cache \
+    && rm -rf /root/.m2 \
+    && rm -rf /var/lib/apt/lists/*
+
+CMD ["/bin/bash"]
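
For reference, a minimal sketch of how these images might be built once the patch is applied (run from the repository root; the image tags are illustrative assumptions, not part of this patch):

    # Spark 3.0.0 base image
    docker build -t aztk/spark:v3.0.0-base docker-image/base/spark3.0.0
    # Spark 3.0.0 + Anaconda image (a standalone Dockerfile, not layered on the base)
    docker build -t aztk/spark:v3.0.0-anaconda-base docker-image/anaconda/spark3.0.0/base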