## Running Spark History Server: 
#### (Note: this feature only works in a local development environment with Docker installed, or on a SageMaker notebook instance. It does not currently work in SageMaker Studio.)

In [None]:
# import packages
import json
import ast
import sagemaker
from sagemaker.spark.processing import PySparkProcessor
from sagemaker.network import NetworkConfig

# Session object handed to the processor below.
sagemaker_session = sagemaker.Session()

# Pipeline parameters: job names, role, instance settings, KMS keys, network ids and S3 locations.
with open("../ml_pipeline/params/pipeline_params.json", "r") as f:
    pipeline_params = json.load(f)

# S3 output location for the pre-process Spark UI logs; the template is filled with the "trial" value.
process_spark_ui_log_output = pipeline_params["process_spark_ui_log_output"].format(
    pipeline_params["trial"]
)

# Arguments forwarded to the processing script.
process_args = [
    "--input_table", pipeline_params["pyspark_process_data_input"],
    "--output_table", pipeline_params["pyspark_process_data_output"],
]

# Spark config used in the pipeline run.
# json.load already returns native Python objects, so the result is used as-is.
# Round-tripping it through json.dumps + ast.literal_eval added nothing and raised
# ValueError whenever the file contained JSON-only literals (true / false / null).
with open("../src/spark_configuration/configuration.json", "r") as f:
    spark_conf = json.load(f)

# Network configuration: encrypted inter-container traffic inside the configured subnets / security groups.
network_config = NetworkConfig(
    encrypt_inter_container_traffic=True,
    security_group_ids=pipeline_params["network_security_group_ids"],
    subnets=pipeline_params["network_subnet_ids"],
)

# Create Spark Processor
spark_processor = PySparkProcessor(
    base_job_name=pipeline_params["pyspark_process_name"],
    framework_version=pipeline_params["pyspark_framework_version"],
    role=pipeline_params["pipeline_role"],
    instance_count=pipeline_params["pyspark_process_instance_count"],
    instance_type=pipeline_params["pyspark_process_instance_type"],
    sagemaker_session=sagemaker_session,
    volume_kms_key=pipeline_params["pyspark_process_volume_kms"],
    output_kms_key=pipeline_params["pyspark_process_output_kms"],
    network_config=network_config,
)

# Submit the job; Spark event logs are written to process_spark_ui_log_output
# so that the history server can read them afterwards.
spark_processor.run(
    submit_app=pipeline_params["pyspark_process_code"],
    submit_py_files=[pipeline_params["pyspark_helper_code"]],
    arguments=process_args,
    spark_event_logs_s3_uri=process_spark_ui_log_output,
    logs=False,
    kms_key=pipeline_params["pyspark_process_volume_kms"],
    configuration=spark_conf,
)

# Run spark history server to show Spark UI
spark_processor.start_history_server(spark_event_logs_s3_uri=process_spark_ui_log_output)

#### Starting Spark UI
Let's start a history server to visualize your logs

In [None]:
# Launch the Spark history server, pointing it at the event logs in process_spark_ui_log_output
spark_processor.start_history_server(
    spark_event_logs_s3_uri=process_spark_ui_log_output,
)

#### The Spark UI output can be visualized at:
`https://<notebook-name>.notebook.<region>.sagemaker.aws/proxy/15050`

In [None]:
# Public import location; importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# TODO: replace values with correct info
notebook_name = "test-spark-ui"
region = "us-east-1"

# Spark UI address behind the notebook instance proxy (port 15050).
# The placeholders are required: without them .format() silently drops both
# arguments and no link is rendered.
spark_ui_url = "https://{}.notebook.{}.sagemaker.aws/proxy/15050".format(
    notebook_name, region
)

display(HTML('<a href="{}">Review Spark UI</a>'.format(spark_ui_url)))

#### Terminating Spark History Server
Remember to terminate your server once you are done with your analysis.

In [None]:
# Terminate the Spark history server that was started with spark_processor.start_history_server()
spark_processor.terminate_history_server()