# // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# // SPDX-License-Identifier: MIT-0
#
# Multi-stage build of a Spark benchmark image:
#   1. tpc-toolkit : compiles the TPC-DS data generation kit.
#   2. sbt         : builds the spark-sql-perf library and the benchmark assembly jar.
#   3. final       : starts from SPARK_BASE_IMAGE, copies in the artifacts of the
#                    first two stages, installs Maven, and builds the Remote
#                    Shuffle Service (RSS) client jar into ${SPARK_HOME}/jars.

# Declared before the first FROM so it can be used by the final stage's FROM line.
ARG SPARK_BASE_IMAGE=ghcr.io/datapunchorg/spark:spark-3.2.1-1643336295

# ---------------------------------------------------------------------------
# Stage 1: compile tpcds kit
# ---------------------------------------------------------------------------
FROM amazonlinux:2 AS tpc-toolkit

RUN yum update -y \
    && yum group install -y "Development Tools" \
    && git clone https://github.com/databricks/tpcds-kit.git -b master /tmp/tpcds-kit

WORKDIR /tmp/tpcds-kit/tools
RUN make OS=LINUX

# ---------------------------------------------------------------------------
# Stage 2: build the benchmark jars with sbt
# ---------------------------------------------------------------------------
FROM mozilla/sbt:8u292_1.5.4 AS sbt

# Selects the emr-on-eks-benchmark branch to build (spark-<SPARK_VERSION>).
ARG SPARK_VERSION=3.2.0

# Build the Databricks SQL perf library
RUN git clone -b "spark-${SPARK_VERSION}" --single-branch \
        https://github.com/aws-samples/emr-on-eks-benchmark.git /tmp/emr-on-eks-benchmark

WORKDIR /tmp/emr-on-eks-benchmark/spark-sql-perf
RUN sbt +package

# Use the compiled Databricks SQL perf library to build benchmark utility
WORKDIR /tmp/emr-on-eks-benchmark/benchmark
RUN mkdir /tmp/emr-on-eks-benchmark/benchmark/libs \
    && cp /tmp/emr-on-eks-benchmark/spark-sql-perf/target/scala-2.12/*.jar \
          /tmp/emr-on-eks-benchmark/benchmark/libs \
    && sbt assembly

# ---------------------------------------------------------------------------
# Stage 3: final Spark image
# ---------------------------------------------------------------------------
FROM ${SPARK_BASE_IMAGE}

ARG MAVEN_VERSION=3.8.6
# archive.apache.org keeps every released version; regular mirrors typically
# only carry current releases, which breaks a pinned MAVEN_VERSION over time.
ARG BASE_URL=https://archive.apache.org/dist/maven/maven-3/${MAVEN_VERSION}/binaries
# Home directory used to derive MAVEN_CONFIG. Without a declaration here the
# variable expands to an empty string unless the base image already sets it
# (an ENV of the same name takes precedence over this default).
ARG USER_HOME_DIR="/root"

# Root is required for the package installation and for writing into SPARK_HOME.
USER root

COPY --from=tpc-toolkit /tmp/tpcds-kit/tools /opt/tpcds-kit/tools
COPY --from=sbt /tmp/emr-on-eks-benchmark/benchmark/target/scala-2.12/*jar ${SPARK_HOME}/examples/jars/

# # upgrade JDK to corretto11, still not compatible with EMR on EKS
# RUN apt-get update && apt-get install -y curl git
#     && curl -fsSL -o /usr/lib/amazon-corretto-11-x64-linux-jdk.tar.gz https://corretto.aws/downloads/latest/amazon-corretto-11-x64-linux-jdk.tar.gz \
#     && tar -xzf /usr/lib/amazon-corretto-11-x64-linux-jdk.tar.gz \
#     && rm -f /usr/lib/amazon-corretto-11-x64-linux-jdk.tar.gz
# RUN update-alternatives --install "/usr/bin/java" "java" "/usr/lib/amazon-corretto-11.0.16.9.1-linux-x64/bin/java" 2000
# ENV JAVA_HOME /usr/lib/java
# ENV PATH $PATH:$JAVA_HOME/bin

# install maven
# ca-certificates is listed explicitly because --no-install-recommends would
# otherwise skip it, and both curl and git need it for HTTPS.
# The apt package lists are removed in the same layer so they do not persist
# in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/share/maven /usr/share/maven/ref \
    && curl -fsSL -o /tmp/apache-maven.tar.gz "${BASE_URL}/apache-maven-${MAVEN_VERSION}-bin.tar.gz" \
    && tar -xzf /tmp/apache-maven.tar.gz -C /usr/share/maven --strip-components=1 \
    && rm -f /tmp/apache-maven.tar.gz \
    && ln -s /usr/share/maven/bin/mvn /usr/bin/mvn

ENV MAVEN_HOME=/usr/share/maven \
    MAVEN_CONFIG="${USER_HOME_DIR}/.m2"

# compile RSS client
RUN git clone -b k8s-spark-3.2 --single-branch \
        https://github.com/datapunchorg/RemoteShuffleService.git /tmp/RemoteShuffleService

WORKDIR /tmp/RemoteShuffleService
# Build only the client profile, drop the unshaded and sources jars so the
# glob below matches just the client jar, then install it into Spark's jars.
RUN mvn clean package -Pclient -DskipTests -Dmaven.javadoc.skip=true \
    && rm target/original-remote-shuffle-service-*.jar \
    && rm target/remote-shuffle-service-*-sources.jar \
    && mv target/remote-shuffle-service-client-*.jar "${SPARK_HOME}/jars/"

WORKDIR /home/hadoop

# Use hadoop user and group
USER hadoop:hadoop