#!/bin/bash

path="$(cd "$(dirname "$0")"; pwd)"
cd "${path}"

export HADOOP_CLASSPATH=$(hadoop classpath)

function usage {
    echo "Usage: tpcds-setup.sh scale_factor fs db_suffix mode"
    exit 1
}

function runcommand {
    echo "$1"
    $1
}

# Get the parameters.
SCALE=$1
FS=$2
DB_SUFFIX=$3
MODE=$4

if [[ "X$DEBUG_SCRIPT" != "X" ]]; then
    set -x
fi

# Sanity checking.
if [[ X"$SCALE" = "X" ]]; then
    usage
fi
if [[ X"$FS" = "X" ]]; then
    FS=hdfs://
fi
if [[ X"$DB_SUFFIX" = "X" ]]; then
    # Default the database suffix to the filesystem scheme, e.g. "hdfs" for "hdfs://".
    DB_SUFFIX=$(echo ${FS} | grep -o -P '[a-z0-9]+(?=:)')
fi
DIR=${FS}/tmp/tpcds-generate

if [[ ${SCALE} -eq 1 ]]; then
    echo "Scale factor must be greater than 1"
    exit 1
fi

sudo /usr/lib/spark/sbin/stop-thriftserver.sh

# Do the actual data load.
hadoop fs -mkdir -p ${DIR}
hadoop fs -ls ${DIR}/${SCALE} > /dev/null
if [[ $? -ne 0 ]]; then
    echo "Generating data at scale factor $SCALE."
    (hadoop jar target/tpcds-gen-1.0-SNAPSHOT.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
fi
hadoop fs -ls ${DIR}/${SCALE} > /dev/null
if [[ $? -ne 0 ]]; then
    echo "Data generation failed, exiting."
    exit 1
fi
hadoop fs -chmod -R 777 ${DIR}/${SCALE}
echo "TPC-DS text data generation complete."

SPARK_SQL="spark-sql"
if [[ "X$SPARK_HOME" != "X" ]]; then
    SPARK_SQL="$SPARK_HOME/bin/spark-sql"
fi

# Size the Spark SQL executors from the YARN cluster's reported resources.
CORE_NODES=$(yarn node -list -showDetails | grep -i 'Total Nodes' | sed -r 's/.*:([[:digit:]]+).*/\1/')
VCORES=$(yarn node -list -showDetails | grep "Configured Resources" | head -1 | sed -r 's/.*vCores:([[:digit:]]+).*/\1/')
TOTAL_MEM=$(yarn node -list -showDetails | grep "Configured Resources" | head -1 | sed -r 's/.*memory:([[:digit:]]+).*/\1/')
EXEC_MEM=$(printf "%.0f" $(bc <<< "${TOTAL_MEM} * 0.8"))
echo "core nodes: ${CORE_NODES}, vcores: ${VCORES}, total mem: ${TOTAL_MEM}m, exec mem: ${EXEC_MEM}m"

SPARK_SQL="${SPARK_SQL} --master yarn \
    --executor-memory ${EXEC_MEM}m \
    --executor-cores ${VCORES} \
    --num-executors ${CORE_NODES} \
    --hiveconf hive.optimize.sort.dynamic.partition=true \
    --hiveconf hive.exec.max.dynamic.partitions=100000 \
    --hiveconf hive.exec.max.dynamic.partitions.pernode=3000 \
    --hiveconf hive.exec.dynamic.partition.mode=nonstrict \
    --conf spark.sql.parquet.writeLegacyFormat=true "

hive -e "show databases;" >/dev/null 2>&1
if [[ $? -ne 0 ]]; then
    echo "Failed to connect to Hive."
    exit 1
fi

set -e

# Create the text/flat tables as external tables. These will later be converted to Parquet and ORC.
echo "Create external text file tables:" runcommand "hive -f ${path}/scripts/external.sql --hivevar db=tpcds_text_${SCALE}_${DB_SUFFIX} --hivevar location=${DIR}/${SCALE}" echo "Create Parquet tables:" runcommand "hive -f ${path}/scripts/parquet.sql --hivevar db_parquet=tpcds_parquet_${SCALE}_${DB_SUFFIX} --hivevar db_txt=tpcds_text_${SCALE}_${DB_SUFFIX} --hivevar location=${FS}/tmp/tpcds-parquet/${SCALE}" # echo "Create Parquet no partition tables:" # runcommand "hive -f ${path}/scripts/parquet_no_partition.sql --hivevar db_parquet=tpcds_parquet_no_partition_${SCALE}_${DB_SUFFIX} --hivevar db_txt=tpcds_text_${SCALE}_${DB_SUFFIX} --hivevar location=${FS}/tmp/tpcds-parquet-no-partition/${SCALE}" echo "Create orc tables:" runcommand "hive -f ${path}/scripts/orc.sql --hivevar db_orc=tpcds_orc_${SCALE}_${DB_SUFFIX} --hivevar db_txt=tpcds_text_${SCALE}_${DB_SUFFIX} --hivevar location=${FS}/tmp/tpcds-orc/${SCALE}" # echo "Create orc no partition tables:" # runcommand "hive -f ${path}/scripts/orc_no_partition.sql --hivevar db_orc=tpcds_orc_no_partition_${SCALE}_${DB_SUFFIX} --hivevar db_txt=tpcds_text_${SCALE}_${DB_SUFFIX} --hivevar location=${FS}/tmp/tpcds-orc-no-partition/${SCALE}" if [[ "$MODE" = "LOAD" ]]; then echo "Load Parquet tables:" runcommand "$SPARK_SQL -f ${path}/scripts/insert.sql --hivevar db_parquet=tpcds_parquet_${SCALE}_${DB_SUFFIX} --hivevar db_txt=tpcds_text_${SCALE}_${DB_SUFFIX}" # echo "Load Parquet no partition tables:" # runcommand "$SPARK_SQL -f ${path}/scripts/insert_parquet_no_partition.sql --hivevar db_parquet=tpcds_parquet_no_partition_${SCALE}_${DB_SUFFIX} --hivevar db_txt=tpcds_text_${SCALE}_${DB_SUFFIX}" echo "Load orc tables:" runcommand "$SPARK_SQL -f ${path}/scripts/insert_orc.sql --hivevar db_orc=tpcds_orc_${SCALE}_${DB_SUFFIX} --hivevar db_txt=tpcds_text_${SCALE}_${DB_SUFFIX}" # echo "Load orc no partition tables:" # runcommand "$SPARK_SQL -f ${path}/scripts/insert_orc_no_partition.sql --hivevar db_orc=tpcds_orc_no_partition_${SCALE}_${DB_SUFFIX} --hivevar db_txt=tpcds_text_${SCALE}_${DB_SUFFIX}" else echo "Repair parquet tables:" hive --database tpcds_parquet_${SCALE}_${DB_SUFFIX} -e "msck repair table catalog_returns; msck repair table catalog_sales; msck repair table inventory; msck repair table store_returns; msck repair table store_sales; msck repair table web_returns; msck repair table web_sales;" echo "Repair orc tables:" hive --database tpcds_orc_${SCALE}_${DB_SUFFIX} -e "msck repair table catalog_returns; msck repair table catalog_sales; msck repair table inventory; msck repair table store_returns; msck repair table store_sales; msck repair table web_returns; msck repair table web_sales;" fi