# SageMaker training image for Mozilla Voice STT (DeepSpeech).
# The steps follow the official upstream training template, Dockerfile.train.tmpl:
# https://github.com/mozilla/DeepSpeech/blob/master/Dockerfile.train.tmpl
FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/tensorflow-training:1.15.4-gpu-py37-cu100-ubuntu18.04

# Source repository and revision of Mozilla Voice STT to build.
# NOTE(review): "origin/master" is a moving target, so two builds of this file
# can produce different images. Pin a tag or commit SHA for reproducible builds.
ENV MOZILLA_VOICE_STT_REPO="https://github.com/mozilla/STT"
ENV MOZILLA_VOICE_STT_SHA="origin/master"

# System packages, installed in ONE layer so that:
#   - `apt-get update` can never be served from a stale cached layer while a
#     later `apt-get install` runs against outdated package indexes;
#   - the apt list cleanup actually shrinks the image (deleting files in a
#     later layer does not reclaim space from earlier layers).
# Sub-steps, in the original order:
#   1. build toolchain and utilities
#   2. sox with MP3 support (installed with recommended packages, as before)
#   3. purge python3-xdg
#      NOTE(review): presumably it conflicts with the pip/setuptools steps
#      further down -- confirm against the upstream template.
#   4. libopus0 / libsndfile1: dependencies for audio augmentation
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        apt-utils \
        bash-completion \
        build-essential \
        cmake \
        curl \
        git \
        libboost-all-dev \
        libbz2-dev \
        locales \
        python3-venv \
        unzip \
        wget \
 && apt-get install -y sox libsox-fmt-mp3 \
 && apt-get purge -y python3-xdg \
 && apt-get install -y --no-install-recommends libopus0 libsndfile1 \
 && rm -rf /var/lib/apt/lists/*

# Clone the STT sources straight into the SageMaker code directory.
# `mkdir -p` keeps the step working whether or not /opt/ml already exists.
# The echo prints the expanded command into the build log.
RUN mkdir -p /opt/ml \
 && echo "git clone $MOZILLA_VOICE_STT_REPO" \
 && git clone "$MOZILLA_VOICE_STT_REPO" /opt/ml/code

# Working directory used by SageMaker.
WORKDIR /opt/ml/code

# Check out the requested revision.
RUN echo "git checkout $MOZILLA_VOICE_STT_SHA" \
 && git checkout "$MOZILLA_VOICE_STT_SHA"

# Build the CTC decoder bindings and install the resulting wheel.
# This deliberately happens before pip/wheel/setuptools are re-pinned below,
# matching the original step order.
RUN make -C native_client/ctcdecode NUM_PROCESSES="$(nproc)" bindings
RUN pip3 install --no-cache-dir --upgrade native_client/ctcdecode/dist/*.whl

# Prepare deps: pin the packaging toolchain.
RUN pip3 install --no-cache-dir --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0

# Install Mozilla Voice STT in editable mode.
# - DS_NODECODER=y: no need for the decoder, it was installed above.
# - DS_NOTENSORFLOW=y: the base image already ships the correct TensorFlow GPU
#   build, so it must not be reinstalled.
RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --no-cache-dir --upgrade -e .

# Tool to convert the output graph for inference.
RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \
        --artifact convert_graphdef_memmapped_format --target .
# Build KenLM, pinned to a fixed commit, so that new scorers can be generated.
# Any previous kenlm checkout in native_client is removed first.
WORKDIR /opt/ml/code/native_client
RUN rm -rf kenlm \
 && git clone https://github.com/kpu/kenlm \
 && cd kenlm \
 && git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec \
 && mkdir -p build \
 && cd build \
 && cmake .. \
 && make -j "$(nproc)"

# Remove the repository's requirement* files and setup.py from the code directory.
# NOTE(review): presumably this stops SageMaker from trying to (re)install the
# package from the submit directory at job start -- confirm.
RUN rm -rf /opt/ml/code/requirement* /opt/ml/code/setup.py

# Training entry point; the trailing slash makes the destination an explicit directory.
COPY start.py /opt/ml/code/

# Make the code directory (and start.py) reachable on PATH.
ENV PATH="/opt/ml/code:${PATH}"
WORKDIR /opt/ml/code/

# TensorFlow runtime settings.
# TF_FORCE_GPU_ALLOW_GROWTH makes TensorFlow grow GPU memory on demand instead
# of reserving it all up front.
ENV TF_CUDNN_RESET_RND_GEN_STATE=1 \
    TF_FORCE_GPU_ALLOW_GROWTH=True

# SAGEMAKER_SUBMIT_DIRECTORY is used by the SageMaker training container to
# determine our user code directory; SAGEMAKER_PROGRAM names the script to run.
ENV SAGEMAKER_SUBMIT_DIRECTORY=/opt/ml/code \
    SAGEMAKER_PROGRAM=start.py