# -*- coding: utf-8 -*-
"""
author SparkByExamples.com

Example script: build a small employee DataFrame, print its schema and
rows, then show the total salary per state ordered by state.
"""
from pyspark.sql import SparkSession
# NOTE: `sum` and `max` imported here shadow the Python builtins of the same
# name for the rest of this module; they are the Spark column aggregates.
from pyspark.sql.functions import col,sum,avg,max

spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

# One tuple per employee, in the same order as the names in `schema`.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NV", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "DE", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "NV", 80000, 25, 18000),
    ("Kumar", "Marketing", "NJ", 91000, 50, 21000),
]

schema = ["employee_name", "department", "state", "salary", "age", "bonus"]

df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

# Sum salaries per state, then sort the aggregated rows. Sorting *before*
# groupBy (as the script originally did) does not carry through the
# aggregation, so the ordering has to be applied to the grouped result.
# Each state appears once after grouping, so `state` alone fully orders it.
dfSort = df.groupBy(df.state).agg(sum(df.salary)).sort("state")
dfSort.show()