# -*- coding: utf-8 -*-
"""
author SparkByExamples.com

Demonstrates different ways of referring to DataFrame columns in PySpark:
columns whose name contains a dot, nested struct fields, and arithmetic /
comparison operators applied to columns. Every example prints its result
with ``show()``.
"""
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# ---------------------------------------------------------------------------
# Column whose name contains a dot
# ---------------------------------------------------------------------------
data = [("James", 23), ("Ann", 40)]
df = spark.createDataFrame(data).toDF("name.fname", "gender")
df.printSchema()
df.show()

# The dotted column name is wrapped in backticks everywhere it is referenced.
quoted_fname = "`name.fname`"

# Select it through col() first, then through DataFrame indexing.
for column_ref in (col(quoted_fname), df[quoted_fname]):
    df.select(column_ref).show()

# Add a column derived from the dotted column via substr(1, 2).
name_prefix = col(quoted_fname).substr(1, 2)
df.withColumn("new_col", name_prefix).show()

# Keep only the rows whose dotted column starts with "J".
starts_with_j = col(quoted_fname).startswith("J")
df.filter(starts_with_j).show()

# Rename every column, replacing dots with underscores.
new_cols = (name.replace('.', '_') for name in df.columns)
df2 = df.toDF(*new_cols)
df2.show()

# ---------------------------------------------------------------------------
# Referring to columns via the DataFrame object and via col()
# ---------------------------------------------------------------------------
# Order: DataFrame attribute, DataFrame indexing, backtick-quoted dotted name
# through the DataFrame, then the same two columns through col().
for column_ref in (
    df.gender,
    df["gender"],
    df[quoted_fname],
    col("gender"),
    col(quoted_fname),
):
    df.select(column_ref).show()

# ---------------------------------------------------------------------------
# Access struct column
# ---------------------------------------------------------------------------
data = [
    Row(name="James", prop=Row(hair="black", eye="blue")),
    Row(name="Ann", prop=Row(hair="grey", eye="black")),
]
df = spark.createDataFrame(data)
df.printSchema()

# Attribute access, DataFrame indexing and col() on the nested "hair" field,
# followed by the "prop.*" wildcard form.
for column_ref in (
    df.prop.hair,
    df["prop.hair"],
    col("prop.hair"),
    col("prop.*"),
):
    df.select(column_ref).show()

# ---------------------------------------------------------------------------
# Column operators
# ---------------------------------------------------------------------------
data = [(100, 2, 1), (200, 3, 4), (300, 4, 4)]
df = spark.createDataFrame(data).toDF("col1", "col2", "col3")

c1, c2, c3 = df.col1, df.col2, df.col3

# Arithmetic between col1 and col2, then comparisons between col2 and col3.
# The expressions are only evaluated by Spark when select(...).show() runs.
for expression in (
    c1 + c2,
    c1 - c2,
    c1 * c2,
    c1 / c2,
    c1 % c2,
    c2 > c3,
    c2 < c3,
    c2 == c3,
):
    df.select(expression).show()