# -*- coding: utf-8 -*- """ author SparkByExamples.com """ import pyspark from pyspark.sql import SparkSession spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() simpleData = (("James", "Sales", 3000), \ ("Michael", "Sales", 4600), \ ("Robert", "Sales", 4100), \ ("Maria", "Finance", 3000), \ ("James", "Sales", 3000), \ ("Scott", "Finance", 3300), \ ("Jen", "Finance", 3900), \ ("Jeff", "Marketing", 3000), \ ("Kumar", "Marketing", 2000),\ ("Saif", "Sales", 4100) \ ) columns= ["employee_name", "department", "salary"] df = spark.createDataFrame(data = simpleData, schema = columns) df.printSchema() df.show(truncate=False) from pyspark.sql.window import Window from pyspark.sql.functions import row_number windowSpec = Window.partitionBy("department").orderBy("salary") df.withColumn("row_number",row_number().over(windowSpec)) \ .show(truncate=False) from pyspark.sql.functions import rank df.withColumn("rank",rank().over(windowSpec)) \ .show() from pyspark.sql.functions import dense_rank df.withColumn("dense_rank",dense_rank().over(windowSpec)) \ .show() from pyspark.sql.functions import percent_rank df.withColumn("percent_rank",percent_rank().over(windowSpec)) \ .show() from pyspark.sql.functions import ntile df.withColumn("ntile",ntile(2).over(windowSpec)) \ .show() from pyspark.sql.functions import cume_dist df.withColumn("cume_dist",cume_dist().over(windowSpec)) \ .show() from pyspark.sql.functions import lag df.withColumn("lag",lag("salary",2).over(windowSpec)) \ .show() from pyspark.sql.functions import lead df.withColumn("lead",lead("salary",2).over(windowSpec)) \ .show() windowSpecAgg = Window.partitionBy("department") from pyspark.sql.functions import col,avg,sum,min,max,row_number df.withColumn("row",row_number().over(windowSpec)) \ .withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \ .withColumn("sum", sum(col("salary")).over(windowSpecAgg)) \ .withColumn("min", min(col("salary")).over(windowSpecAgg)) \ .withColumn("max", max(col("salary")).over(windowSpecAgg)) \ .where(col("row")==1).select("department","avg","sum","min","max") \ .show()