# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""Spark ETL job: read CSV input, add a load-timestamp column, write Parquet.

Usage: spark-etl [input-folder] [output-folder]
"""

import sys
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import *


def main(argv):
    """Run the ETL job.

    Args:
        argv: Full argument vector in ``sys.argv`` layout, i.e.
            ``[program, input-folder, output-folder]``.

    Returns:
        Process exit status: 0 on success, 1 when the argument count is wrong.
    """
    if len(argv) != 3:
        # A usage error must not look like success to spark-submit or to a
        # step runner, so report it on stderr and return a non-zero status.
        print("Usage: spark-etl [input-folder] [output-folder]", file=sys.stderr)
        return 1

    input_path, output_path = argv[1], argv[2]

    spark = SparkSession.builder.appName("SparkETL").getOrCreate()
    try:
        # The first line of each CSV file is treated as the header, and
        # column types are inferred from the data instead of read as strings.
        ny_taxi = (
            spark.read
            .option("inferSchema", "true")
            .option("header", "true")
            .csv(input_path)
        )

        # datetime.now() is evaluated once here on the driver, so every row
        # receives the same literal timestamp in "current_date".
        updated_ny_taxi = ny_taxi.withColumn("current_date", lit(datetime.now()))

        updated_ny_taxi.printSchema()
        # show() prints the rows itself and returns None; wrapping it in
        # print() would emit a stray "None" line after the table.
        updated_ny_taxi.show()
        print(f"Total number of records: {updated_ny_taxi.count()}")

        updated_ny_taxi.write.parquet(output_path)
    finally:
        # Always release the session, including when the read or write fails.
        spark.stop()

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))