# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# First DataFrame of employee records.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
]
columns = ["employee_name", "department", "state", "salary", "age", "bonus"]
df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

# Second DataFrame with the same schema; the "James" and "Maria" rows
# duplicate rows already present in df.
simpleData2 = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]
columns2 = ["employee_name", "department", "state", "salary", "age", "bonus"]
df2 = spark.createDataFrame(data=simpleData2, schema=columns2)
df2.printSchema()
df2.show(truncate=False)

# union() appends the rows of df2 to df and, like SQL UNION ALL,
# keeps duplicate rows.
unionDF = df.union(df2)
unionDF.show(truncate=False)

# Chain distinct() to drop the duplicate rows after the union.
disDF = df.union(df2).distinct()
disDF.show(truncate=False)

# unionAll() behaves the same as union(); it has been a deprecated
# alias of union() since Spark 2.0.
unionAllDF = df.unionAll(df2)
unionAllDF.show(truncate=False)
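
# union() resolves columns by position, so two DataFrames with the same
# columns in a different order would be merged incorrectly. PySpark's
# unionByName() resolves columns by name instead; a minimal sketch using
# the DataFrames above (the reordered select is illustrative only):
reordered = df2.select("department", "employee_name", "state", "salary", "age", "bonus")
unionByNameDF = df.unionByName(reordered)
unionByNameDF.show(truncate=False)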