{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Spark on Kubernetes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Preparing the notebook https://towardsdatascience.com/make-kubeflow-into-your-own-data-science-workspace-cc8162969e29" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run Spark inside Notebook" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "https://www.sicara.ai/blog/2017-05-02-get-started-pyspark-jupyter-notebook-3-minutes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import findspark\n", "findspark.init()\n", "\n", "import pyspark\n", "import random\n", "\n", "num_samples = 100000\n", "\n", "def inside(p):\n", "    \"\"\"Return True when a random point of the unit square lies inside the unit circle.\n", "\n", "    `p` is the RDD element handed in by `filter`; it is not used.\n", "    \"\"\"\n", "    x, y = random.random(), random.random()\n", "    return x*x + y*y < 1\n", "\n", "sc = pyspark.SparkContext(appName=\"Pi\")\n", "try:\n", "    sc.setLogLevel(\"INFO\")\n", "    count = sc.parallelize(range(0, num_samples)).filter(inside).count()\n", "\n", "    pi = 4 * count / num_samples\n", "    print(pi)\n", "finally:\n", "    # Stop the context even when the job fails, so that this cell can be re-run.\n", "    sc.stop()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup service account permissions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "https://github.com/kubeflow/kubeflow/issues/4306 issue with launching spark-operator from jupyter notebook" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run these commands in your shell (not in the notebook)\n", "\n", "```shell\n", "export NAMESPACE=spark  # namespace used by the Spark jobs in this notebook\n", "kubectl create serviceaccount spark -n ${NAMESPACE}\n", "kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=${NAMESPACE}:spark --namespace=${NAMESPACE}\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Python version\n", "\n", "> Note: Make sure the Python versions of your driver and executors match.\n", "> Otherwise, you will see an error message like the one below.\n", "\n", "Exception: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with 
different minor versions.Please check environment variables `PYSPARK_PYTHON` and `PYSPARK_DRIVER_PYTHON` are correctly set." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "print(sys.version)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Client Mode" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import random\n", "import socket\n", "\n", "import findspark\n", "\n", "# findspark.init() has to run before pyspark is imported: it makes the Spark installation's PySpark importable.\n", "findspark.init()\n", "\n", "import pyspark\n", "from pyspark import SparkContext, SparkConf\n", "from pyspark.sql import SparkSession\n", "\n", "# In client mode the driver is this notebook process, so its address is passed as spark.driver.host.\n", "localIpAddress = socket.gethostbyname(socket.gethostname())\n", "\n", "conf = SparkConf().setAppName('sparktest1')\n", "conf.setMaster('k8s://https://kubernetes.default.svc:443')\n", "conf.set(\"spark.submit.deployMode\", \"client\")\n", "conf.set(\"spark.executor.instances\", \"2\")\n", "conf.set(\"spark.driver.host\", localIpAddress)\n", "conf.set(\"spark.driver.port\", \"7778\")\n", "conf.set(\"spark.kubernetes.container.image\", \"seedjeffwan/spark-py:v3.0.0\")\n", "conf.set(\"spark.kubernetes.pyspark.pythonVersion\", \"3\")\n", "# Namespace and service account must match the ones created in the setup section.\n", "conf.set(\"spark.kubernetes.namespace\", \"spark\")\n", "conf.set(\"spark.kubernetes.authenticate.driver.serviceAccountName\", \"spark\")\n", "conf.set(\"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\", \"false\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sc = SparkContext.getOrCreate(conf=conf)\n", "\n", "# following works as well\n", "# spark = SparkSession.builder.config(conf=conf).getOrCreate()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "num_samples = 100000\n", "\n", "def inside(p):\n", "    \"\"\"Return True when a random point of the unit square lies inside the unit circle.\n", "\n", "    `p` is the RDD element handed in by `filter`; it is not used.\n", "    \"\"\"\n", "    x, y = random.random(), random.random()\n", "    return x*x + y*y < 1\n", "\n", "count = sc.parallelize(range(0, num_samples)).filter(inside).count()\n", "\n", "pi = 4 * count / num_samples\n", "print(pi)" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sc.stop()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cluster Mode" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Java" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "\n", "# local:// points at a file inside the container image, so the examples jar name\n", "# must match the Spark release of the image (v3.0.0 here).\n", "/opt/spark/bin/spark-submit --master \"k8s://https://kubernetes.default.svc:443\" \\\n", "--deploy-mode cluster \\\n", "--name spark-pi-java \\\n", "--class org.apache.spark.examples.SparkPi \\\n", "--conf spark.executor.instances=2 \\\n", "--conf spark.kubernetes.namespace=spark \\\n", "--conf spark.kubernetes.driver.annotation.sidecar.istio.io/inject=false \\\n", "--conf spark.kubernetes.executor.annotation.sidecar.istio.io/inject=false \\\n", "--conf spark.kubernetes.container.image=seedjeffwan/spark:v3.0.0 \\\n", "--conf spark.kubernetes.driver.pod.name=spark-pi-java \\\n", "--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \\\n", "local:///opt/spark/examples/jars/spark-examples_2.12-3.0.0.jar 1000" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "\n", "# local:// marks pi.py as a file that already exists inside the container image.\n", "/opt/spark/bin/spark-submit --master \"k8s://https://kubernetes.default.svc:443\" \\\n", "--deploy-mode cluster \\\n", "--name spark-pi \\\n", "--conf spark.executor.instances=2 \\\n", "--conf spark.kubernetes.container.image=seedjeffwan/spark-py:v3.0.0 \\\n", "--conf spark.kubernetes.driver.pod.name=spark-pi \\\n", "--conf spark.kubernetes.namespace=spark \\\n", "--conf spark.kubernetes.driver.annotation.sidecar.istio.io/inject=false \\\n", "--conf spark.kubernetes.executor.annotation.sidecar.istio.io/inject=false \\\n", "--conf spark.kubernetes.pyspark.pythonVersion=3 \\\n", "--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \\\n", "local:///opt/spark/examples/src/main/python/pi.py 1000" ] } ], "metadata": { 
"kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }