# -*- coding: utf-8 -*-
"""
author SparkByExamples.com

Demonstrates several ways of adding a column to a PySpark DataFrame:
``DataFrame.withColumn``, ``DataFrame.select`` and plain Spark SQL against
a temporary view. Every example prints its result with ``show()``; the
base DataFrame ``df`` itself is never modified.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, current_date, lit, when

spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: (firstname, lastname, gender, salary).
data = [
    ('James', 'Smith', 'M', 3000),
    ('Anna', 'Rose', 'F', 4100),
    ('Robert', 'Williams', 'M', 6200),
]
columns = ["firstname", "lastname", "gender", "salary"]

df = spark.createDataFrame(data=data, schema=columns)
df.show()

# Check whether a column exists before adding it.
if 'salary1' not in df.columns:
    print("aa")

# Add a new constant column.
df.withColumn("bonus_percent", lit(0.3)).show()

# Add a column derived from an existing column.
df.withColumn("bonus_amount", df.salary * 0.3).show()

# Add a column by concatenating existing columns.
df.withColumn("name", concat_ws(",", "firstname", 'lastname')).show()

# Add the current date.
df.withColumn("current_date", current_date()).show()

# Add a column whose value depends on a condition.
df.withColumn(
    "grade",
    when(df.salary < 4000, lit("A"))
    .when((df.salary >= 4000) & (df.salary <= 5000), lit("B"))
    .otherwise(lit("C")),
).show()

# Add a column using select.
df.select("firstname", "salary", lit(0.3).alias("bonus")).show()
# df.salary * 0.3 is already a Column, so it needs no lit() wrapper.
df.select(
    "firstname", "salary", (df.salary * 0.3).alias("bonus_amount")
).show()
df.select("firstname", "salary", current_date().alias("today_date")).show()

# Add columns using SQL.
df.createOrReplaceTempView("PER")
# NOTE: '0.3' is a quoted SQL literal, so this bonus column holds a string.
spark.sql("select firstname,salary, '0.3' as bonus from PER").show()
spark.sql(
    "select firstname,salary, salary * 0.3 as bonus_amount from PER"
).show()
spark.sql(
    "select firstname,salary, current_date() as today_date from PER"
).show()
# Searched CASE form (no operand after CASE): each WHEN is a boolean
# predicate. The simple form "case salary when salary < 4000" would
# instead test salary for equality with the boolean result.
spark.sql(
    "select firstname,salary, "
    "case when salary < 4000 then 'A' "
    "else 'B' END as grade from PER"
).show()