FROM 137112412989.dkr.ecr.us-west-2.amazonaws.com/amazonlinux:2

ARG REGION
ENV AWS_REGION ${REGION}

RUN yum clean all \
    && yum update -y \
    && yum install -y awscli bigtop-utils curl gcc gzip unzip zip gunzip tar wget liblapack* libblas* libopencv* libopenblas*

# Install python 3.9
ARG PYTHON_BASE_VERSION=3.9
ARG PYTHON_WITH_BASE_VERSION=python${PYTHON_BASE_VERSION}
ARG PIP_WITH_BASE_VERSION=pip${PYTHON_BASE_VERSION}
ARG PYTHON_VERSION=${PYTHON_BASE_VERSION}.12
RUN yum -y groupinstall 'Development Tools' \
    && yum -y install openssl-devel bzip2-devel libffi-devel sqlite-devel xz-devel \
    && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
    && tar xzf Python-${PYTHON_VERSION}.tgz \
    && cd Python-*/ \
    && ./configure --enable-optimizations \
    && make altinstall \
    && echo -e 'alias python3=python3.9\nalias pip3=pip3.9' >> ~/.bashrc \
    && ln -s $(which ${PYTHON_WITH_BASE_VERSION}) /usr/local/bin/python3 \
    && ln -s $(which ${PIP_WITH_BASE_VERSION}) /usr/local/bin/pip3 \
    && cd .. \
    && rm Python-${PYTHON_VERSION}.tgz \
    && rm -rf Python-${PYTHON_VERSION}

# Install nginx. amazonlinux:2.0.20200304.0 does not ship nginx, so install epel-release first.
RUN wget https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
RUN yum install -y epel-release-latest-7.noarch.rpm
RUN yum install -y nginx
RUN rm -rf /var/cache/yum

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# Install EMR Spark/Hadoop
ENV HADOOP_HOME /usr/lib/hadoop
ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop
ENV SPARK_HOME /usr/lib/spark

COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo

# Install hadoop / spark dependencies from EMR's yum repository for Spark optimizations.
# Replace the placeholder with the region in the repository URL.
RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo

RUN adduser -N hadoop

# These packages are a subset of what EMR installs in a cluster with the
# "hadoop", "spark", and "hive" applications.
# They include EMR-optimized libraries and extras.
RUN yum install -y aws-hm-client \
    aws-java-sdk \
    aws-sagemaker-spark-sdk \
    emr-goodies \
    emr-ruby \
    emr-scripts \
    emr-s3-select \
    emrfs \
    hadoop \
    hadoop-client \
    hadoop-hdfs \
    hadoop-hdfs-datanode \
    hadoop-hdfs-namenode \
    hadoop-httpfs \
    hadoop-kms \
    hadoop-lzo \
    hadoop-yarn \
    hadoop-yarn-nodemanager \
    hadoop-yarn-proxyserver \
    hadoop-yarn-resourcemanager \
    hadoop-yarn-timelineserver \
    hive \
    hive-hcatalog \
    hive-hcatalog-server \
    hive-jdbc \
    hive-server2 \
    s3-dist-cp \
    spark-core \
    spark-datanucleus \
    spark-external \
    spark-history-server \
    spark-python

# Point Spark at the proper python binary
ENV PYSPARK_PYTHON=/usr/local/bin/python3.9

# Set up Spark/YARN/HDFS users as root
ENV PATH="/usr/bin:/opt/program:${PATH}"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"

# Remove the JndiLookup class from log4j-core to mitigate CVE-2021-44228 (Log4Shell)
RUN zip -q -d /lib/hive/lib/log4j-core-2.10.0.jar org/apache/logging/log4j/core/lookup/JndiLookup.class

# Set up bootstrapping program and Spark configuration
COPY hadoop-config /opt/hadoop-config
COPY nginx-config /opt/nginx-config
COPY aws-config /opt/aws-config
COPY Pipfile Pipfile.lock setup.py *.whl /opt/program/
ENV PIPENV_PIPFILE=/opt/program/Pipfile

# Use the --system flag so pipenv installs all packages into the system python
# and not into a virtualenv, since docker containers do not need virtualenvs.
# pipenv > 2022.4.8 fails to build smspark.
RUN /usr/local/bin/python3.9 -m pip install pipenv==2022.4.8 \
    && pipenv install --system \
    && /usr/local/bin/python3.9 -m pip install /opt/program/*.whl

# Set up the container bootstrapper
COPY container-bootstrap-config /opt/container-bootstrap-config
RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh \
    && /opt/container-bootstrap-config/bootstrap.sh

# With SPARK_NO_DAEMONIZE set, the Spark history server runs in the foreground;
# otherwise no server process would remain and the container would terminate immediately.
ENV SPARK_NO_DAEMONIZE TRUE

WORKDIR $SPARK_HOME

# Install the SageMaker Feature Store Spark connector
# https://docs.aws.amazon.com/sagemaker/latest/dg/batch-ingestion-spark-connector-setup.html
RUN /usr/local/bin/python3.9 -m pip install sagemaker-feature-store-pyspark-3.1==1.1.2 --no-binary :all:

ENTRYPOINT ["smspark-submit"]
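
# A minimal sketch of how this image might be built locally, assuming the base image
# can be pulled after ECR authentication. The tag "smspark" and the region value are
# illustrative and not part of this repository; any arguments given to "docker run"
# after the image name are forwarded to the smspark-submit entrypoint.
#
#   aws ecr get-login-password --region us-west-2 \
#       | docker login --username AWS --password-stdin 137112412989.dkr.ecr.us-west-2.amazonaws.com
#   docker build --build-arg REGION=us-west-2 -t smspark .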