package com.amazonaws.emr.titanic;

import org.apache.spark.ml.Model;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.RandomForestClassifier;
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator;
import org.apache.spark.ml.feature.OneHotEncoder;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.tuning.CrossValidator;
import org.apache.spark.ml.tuning.ParamGridBuilder;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.SparkSession$;
import org.apache.spark.sql.functions$;
import org.apache.spark.sql.types.IntegerType$;
import scala.Array$;
import scala.Predef$;
import scala.StringContext;
import scala.Tuple2;
import scala.collection.mutable.StringBuilder;
import scala.reflect.ClassTag$;
import scala.runtime.BoxesRunTime;

/* compiled from: Titanic.scala */
/* loaded from: input_file:com/amazonaws/emr/titanic/Titanic$.class */
/**
 * Singleton driver (decompiled Scala {@code object Titanic}) that trains a random-forest
 * classifier on a CSV passenger file with Spark ML, prints train/test accuracy for a plain
 * pipeline and for a cross-validated pipeline, and writes the test-split predictions as CSV.
 */
public final class Titanic$ {
    /** The single instance, mirroring the Scala module field. */
    public static final Titanic$ MODULE$ = new Titanic$();

    private Titanic$() {
    }

    /**
     * Entry point.
     *
     * @param strArr {@code strArr[0]} is the input CSV path (read with a header row and schema
     *               inference); {@code strArr[1]} is the output path handed to
     *               {@link #generateOutputFile}. With fewer than two arguments a message is
     *               printed and the JVM exits with status -1.
     */
    public void main(String[] strArr) {
        if (strArr.length < 2) {
            Predef$.MODULE$.println("File path must be passed. " + strArr.length);
            System.exit(-1);
            return;
        }
        String inputPath = strArr[0];
        String outputPath = strArr[1];

        SparkSession spark = SparkSession$.MODULE$.builder().appName("example").getOrCreate();
        try {
            spark.sparkContext().setLogLevel("ERROR");

            Dataset<Row> passengers = spark.read()
                    .option("header", "true")
                    .option("inferSchema", "true")
                    .csv(inputPath);
            passengers.printSchema();

            // Missing ages are replaced by the mean age of the whole input before splitting.
            double meanAge = columnMean(passengers, "Age");
            Dataset<Row>[] parts = passengers.na()
                    .fill(meanAge, new String[]{"Age"})
                    .randomSplit(new double[]{0.7d, 0.3d});
            // Only the training part gets the "label" column; the test part keeps "Survived".
            Dataset<Row> training = parts[0].withColumnRenamed("Survived", "label");
            Dataset<Row> test = parts[1];

            VectorAssembler assembler = new VectorAssembler()
                    .setInputCols(new String[]{
                        "Sex_onehot", "Embarked_onehot", "Pclass_onehot", "SibSp", "Parch", "Age", "Fare"})
                    .setOutputCol("features");
            RandomForestClassifier forest = new RandomForestClassifier();

            // Stage order matters: index/encode each categorical column, assemble, then classify.
            Pipeline pipeline = new Pipeline().setStages(concatStages(
                    handleCategorical("Sex"),
                    handleCategorical("Embarked"),
                    handleCategorical("Pclass"),
                    new PipelineStage[]{assembler, forest}));

            PipelineModel pipelineModel = pipeline.fit(training);
            Predef$.MODULE$.println("train accuracy with pipeline: "
                    + accuracyScore(pipelineModel.transform(training), "label", "prediction"));
            Predef$.MODULE$.println("test accuracy with pipeline: "
                    + accuracyScore(pipelineModel.transform(test), "Survived", "prediction"));

            ParamMap[] grid = new ParamGridBuilder()
                    .addGrid(forest.impurity(), Predef$.MODULE$.wrapRefArray(new String[]{"gini", "entropy"}))
                    .addGrid(forest.maxDepth(), new int[]{1, 2})
                    .addGrid(forest.minInstancesPerNode(), new int[]{1, 2})
                    .build();
            Model<?> tunedModel = crossValidation(pipeline, grid, training);
            Predef$.MODULE$.println("train accuracy with cross validation: "
                    + accuracyScore(tunedModel.transform(training), "label", "prediction"));
            Predef$.MODULE$.println("test accuracy with cross validation: "
                    + accuracyScore(tunedModel.transform(test), "Survived", "prediction"));

            // Column name now matches the "Age" spelling used everywhere else (was "age").
            Dataset<Row> scoringInput = test.na()
                    .fill(meanAge, new String[]{"Age"})
                    .na()
                    .fill(columnMean(passengers, "Fare"), new String[]{"Fare"});
            generateOutputFile(scoringInput, outputPath, tunedModel);
        } finally {
            // Release the session even when a stage fails.
            spark.stop();
        }
    }

    /**
     * Applies {@code model} to {@code dataset}, prints the resulting schema, and writes the
     * columns {@code PassengerId}, {@code prediction} (cast to integer, named
     * {@code predicted}) and {@code Survived} (named {@code target}) as header-less CSV,
     * overwriting anything already at the destination.
     *
     * @param dataset rows to score; must contain the columns the model and the projection read
     * @param str     destination path passed to the CSV writer
     * @param model   fitted model used to produce the {@code prediction} column
     */
    public void generateOutputFile(Dataset<Row> dataset, String str, Model<?> model) {
        Dataset<Row> scored = model.transform(dataset);
        scored.printSchema();
        Dataset<Row> narrowed = scored.select(
                "PassengerId", Predef$.MODULE$.wrapRefArray(new String[]{"prediction", "Survived"}));
        Column[] outputColumns = new Column[]{
            narrowed.apply("PassengerId"),
            narrowed.apply("prediction").cast(IntegerType$.MODULE$).as("predicted"),
            narrowed.apply("Survived").as("target")
        };
        narrowed.select(Predef$.MODULE$.wrapRefArray(outputColumns))
                .write()
                .format("csv")
                .option("header", "false")
                .mode(SaveMode.Overwrite)
                .save(str);
    }

    /**
     * Fits a 5-fold {@link CrossValidator} over {@code pipeline} using a
     * {@link BinaryClassificationEvaluator} with its default settings.
     *
     * @param pipeline    estimator to tune
     * @param paramMapArr parameter combinations to evaluate
     * @param dataset     training data
     * @return the fitted cross-validator model
     */
    public Model<?> crossValidation(Pipeline pipeline, ParamMap[] paramMapArr, Dataset<Row> dataset) {
        CrossValidator validator = new CrossValidator()
                .setEstimator(pipeline)
                .setEvaluator(new BinaryClassificationEvaluator())
                .setEstimatorParamMaps(paramMapArr)
                .setNumFolds(5);
        return validator.fit(dataset);
    }

    /**
     * Builds the two stages that encode one categorical column: a {@link StringIndexer}
     * writing {@code <str>_index} (invalid values handled with {@code "skip"}) followed by a
     * {@link OneHotEncoder} writing {@code <str>_onehot}.
     *
     * @param str name of the categorical input column
     * @return the indexer and the encoder, in that order
     */
    public PipelineStage[] handleCategorical(String str) {
        String indexCol = str + "_index";
        String oneHotCol = str + "_onehot";
        return new PipelineStage[]{
            new StringIndexer().setInputCol(str).setOutputCol(indexCol).setHandleInvalid("skip"),
            new OneHotEncoder().setInputCol(indexCol).setOutputCol(oneHotCol)
        };
    }

    /**
     * Computes accuracy via {@link MulticlassMetrics} over the two selected columns.
     *
     * @param dataset rows containing both columns
     * @param str     name of the label column
     * @param str2    name of the prediction column (selected first)
     * @return the accuracy reported by {@code MulticlassMetrics}
     */
    public double accuracyScore(Dataset<Row> dataset, String str, String str2) {
        Dataset<Row> pairs = dataset.select(str2, Predef$.MODULE$.wrapRefArray(new String[]{str}));
        // Titanic$$anonfun$1 is the compiled row-to-tuple mapper; it is defined outside this file.
        return new MulticlassMetrics(
                pairs.rdd().map(new Titanic$$anonfun$1(), ClassTag$.MODULE$.apply(Tuple2.class))).accuracy();
    }

    /** Returns the mean of {@code column} over {@code data}, read as a double from the aggregate row. */
    private static double columnMean(Dataset<Row> data, String column) {
        Dataset<Row> aggregated = data.agg(
                functions$.MODULE$.mean(data.apply(column)), Predef$.MODULE$.wrapRefArray(new Column[0]));
        return aggregated.first().getDouble(0);
    }

    /** Concatenates the given stage arrays into one array, preserving argument and element order. */
    private static PipelineStage[] concatStages(PipelineStage[]... groups) {
        int total = 0;
        for (PipelineStage[] group : groups) {
            total += group.length;
        }
        PipelineStage[] all = new PipelineStage[total];
        int offset = 0;
        for (PipelineStage[] group : groups) {
            System.arraycopy(group, 0, all, offset, group.length);
            offset += group.length;
        }
        return all;
    }
}
