# -*- coding: utf-8 -*-
"""
author SparkByExamples.com

Demonstrates three ways to count distinct values in a PySpark DataFrame:

1. ``DataFrame.distinct().count()`` - number of distinct rows.
2. ``countDistinct(col, ...)``      - number of distinct column combinations.
3. Spark SQL                        - number of distinct rows via a temp view.
"""
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct

spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

# Sample data; ("James", "Sales", 3000) appears twice on purpose so that the
# distinct row count (9) differs from the total row count (10).
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]
columns = ["Name", "Dept", "Salary"]
df = spark.createDataFrame(data=data, schema=columns)

# Using distinct().count(): build the de-duplicated DataFrame once and reuse
# it for both the display and the count.
distinct_df = df.distinct()
distinct_df.show()
print("Distinct Count: " + str(distinct_df.count()))

# Using countDistinct(): counts distinct (Dept, Salary) combinations and
# returns a single-row, single-column DataFrame.
df2 = df.select(countDistinct("Dept", "Salary"))
df2.show()
# collect()[0][0] extracts the scalar from that single row/column.
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))

# Using Spark SQL: de-duplicate the rows in a subquery first, then count them.
# Applying DISTINCT to count(*) itself would only de-duplicate the single
# aggregated row and therefore report the total row count, not the distinct one.
df.createOrReplaceTempView("PERSON")
spark.sql(
    "select count(*) from (select distinct * from PERSON) as distinct_rows"
).show()