{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "21528ae2-45bb-4074-95e5-286b2aa30216", "metadata": { "deletable": false, "editable": true, "id": "21528ae2-45bb-4074-95e5-286b2aa30216", "trusted": true }, "outputs": [], "source": [ "%session_id_prefix native-iceberg-dataframe-\n", "%glue_version 3.0\n", "%idle_timeout 60\n", "%%configure \n", "{\n", " \"--conf\": \"spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\",\n", " \"--datalake-formats\": \"iceberg\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "id": "9c175ad7-f103-47ec-9b81-6b045cc0bd40", "metadata": { "id": "9c175ad7-f103-47ec-9b81-6b045cc0bd40", "trusted": true }, "outputs": [], "source": [ "catalog_name = \"glue_catalog\"\n", "bucket_name = \"\"\n", "bucket_prefix = \"\"\n", "database_name = \"iceberg_dataframe\"\n", "table_name = \"product\"\n", "warehouse_path = f\"s3://{bucket_name}/{bucket_prefix}\"" ] }, { "cell_type": "markdown", "id": "c91cdff5", "metadata": {}, "source": [ "## Initialize SparkSession" ] }, { "cell_type": "code", "execution_count": null, "id": "d4819a61-8d81-4cec-80b2-fd6e3d029969", "metadata": { "id": "d4819a61-8d81-4cec-80b2-fd6e3d029969", "outputId": "953601c3-824a-4f9b-90f3-e692c776a7e6", "trusted": true }, "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "spark = SparkSession.builder \\\n", " .config(f\"spark.sql.catalog.{catalog_name}\", \"org.apache.iceberg.spark.SparkCatalog\") \\\n", " .config(f\"spark.sql.catalog.{catalog_name}.warehouse\", f\"{warehouse_path}\") \\\n", " .config(f\"spark.sql.catalog.{catalog_name}.catalog-impl\", \"org.apache.iceberg.aws.glue.GlueCatalog\") \\\n", " .config(f\"spark.sql.catalog.{catalog_name}.io-impl\", \"org.apache.iceberg.aws.s3.S3FileIO\") \\\n", " .config(\"spark.sql.extensions\",\"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\") \\\n", " .getOrCreate()" ] }, { "cell_type": "markdown", "id": "19d274a4-1610-43a6-a574-b18894a2f73b", "metadata": { "id": "19d274a4-1610-43a6-a574-b18894a2f73b" }, "source": [ "## Clean up existing resources" ] }, { "cell_type": "code", "execution_count": null, "id": "9ddef3b9-2b9f-42a8-a69b-ce4ecf637e49", "metadata": { "id": "9ddef3b9-2b9f-42a8-a69b-ce4ecf637e49", "outputId": "4b7cf09e-75fe-4d07-9c94-952942e99cc1", "trusted": true }, "outputs": [], "source": [ "query = f\"\"\"\n", "DROP TABLE IF EXISTS {catalog_name}.{database_name}.{table_name}\n", "\"\"\"\n", "spark.sql(query)" ] }, { "cell_type": "markdown", "id": "5078b110-be78-43fe-8ffe-93cd013c7b0c", "metadata": { "id": "5078b110-be78-43fe-8ffe-93cd013c7b0c" }, "source": [ "## Create Iceberg table with sample data" ] }, { "cell_type": "code", "execution_count": null, "id": "fba92930-daf3-4b20-ae73-25b0b6c82d90", "metadata": { "id": "fba92930-daf3-4b20-ae73-25b0b6c82d90", "outputId": "be3ab456-697d-4644-ef4a-cf0ebdc1679e", "trusted": true }, "outputs": [], "source": [ "query = f\"\"\"\n", "CREATE DATABASE IF NOT EXISTS {database_name}\n", "\"\"\"\n", "spark.sql(query)" ] }, { "cell_type": "code", "execution_count": null, "id": "fba92930-daf3-4b20-ae73-25b0b6c82d90", "metadata": { "id": "fba92930-daf3-4b20-ae73-25b0b6c82d90", "outputId": "be3ab456-697d-4644-ef4a-cf0ebdc1679e", "trusted": true }, "outputs": [], "source": [ "from pyspark.sql import Row\n", "import time\n", "\n", "ut = time.time()\n", "\n", "df_products = spark.createDataFrame(\n", " [\n", " (\"00001\", \"Heater\", 250, \"Electronics\", ut),\n", " (\"00002\", \"Thermostat\", 400, \"Electronics\", ut),\n", " (\"00003\", \"Television\", 600, \"Electronics\", ut),\n", " (\"00004\", \"Blender\", 100, \"Electronics\", ut),\n", " (\"00005\", \"Table\", 150, \"Furniture\", ut)\n", " ],\n", " [\"product_id\", \"product_name\", \"price\", \"category\", \"updated_at\"],\n", ")\n", "\n", "df_products.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "c08925d5-cdfb-4155-b337-472fe24f55d1", "metadata": { "id": "c08925d5-cdfb-4155-b337-472fe24f55d1", "outputId": "6cbc723e-539a-4293-8ef3-5e1411a3fadf", "trusted": true }, "outputs": [], "source": [ "df_products.sortWithinPartitions(\"category\") \\\n", " .writeTo(f\"{catalog_name}.{database_name}.{table_name}\") \\\n", " .create()" ] }, { "cell_type": "code", "execution_count": null, "id": "60a6395f-d63b-479e-b00e-3aa92e6a3210", "metadata": { "id": "60a6395f-d63b-479e-b00e-3aa92e6a3210", "trusted": true }, "outputs": [], "source": [ "spark.catalog.listTables(database_name)" ] }, { "cell_type": "markdown", "id": "48d42e63-fc96-4047-96fb-ad388f16a8bb", "metadata": { "id": "48d42e63-fc96-4047-96fb-ad388f16a8bb" }, "source": [ "## Read from Iceberg table" ] }, { "cell_type": "code", "execution_count": null, "id": "cb18a298-7934-46c4-a187-2960a77668d3", "metadata": { "id": "cb18a298-7934-46c4-a187-2960a77668d3", "outputId": "22b6e30c-e369-42f4-97f5-596bf06ae8bb", "trusted": true }, "outputs": [], "source": [ "spark.table(f\"{catalog_name}.{database_name}.{table_name}\") \\\n", " .show()" ] }, { "cell_type": "code", "execution_count": null, "id": "5bbf8e99-ad2e-4216-ae41-9e4e2558f51e", "metadata": { "id": "5bbf8e99-ad2e-4216-ae41-9e4e2558f51e", "outputId": "d32dd590-6cc9-4f06-d58c-d4c9ae7e2dba", "trusted": true }, "outputs": [], "source": [ "spark.table(f\"{catalog_name}.{database_name}.{table_name}.history\") \\\n", " .show()" ] }, { "cell_type": "markdown", "id": "edc9062b-d134-4e78-879e-d9c51e85fdad", "metadata": { "id": "edc9062b-d134-4e78-879e-d9c51e85fdad", "tags": [] }, "source": [ "## Append records into Iceberg table" ] }, { "cell_type": "code", "execution_count": null, "id": "92624355-7e93-4c65-950c-3d0ccd0110b6", "metadata": { "id": "92624355-7e93-4c65-950c-3d0ccd0110b6", "outputId": "bc5f26c1-f381-460b-ca81-b8cd0b8c1864", "trusted": true }, "outputs": [], "source": [ "ut = time.time()\n", "\n", "df_products_appends = spark.createDataFrame(\n", " [\n", " (\"00006\", \"Chair\", 50, \"Furniture\", ut), \n", " (\"00007\", \"Desk\", 350, \"Furniture\", ut),\n", " ],\n", " [\"product_id\", \"product_name\", \"price\", \"category\", \"updated_at\"],\n", ")\n", "\n", "df_products_appends.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "5f34efb1-b451-42e1-980d-8eb4b3f48e18", "metadata": { "id": "5f34efb1-b451-42e1-980d-8eb4b3f48e18", "outputId": "c3a91e23-5edf-422a-e4f1-2e6b899d8f11", "trusted": true }, "outputs": [], "source": [ "df_products_appends.writeTo(f\"{catalog_name}.{database_name}.{table_name}\").append()" ] }, { "cell_type": "code", "execution_count": null, "id": "dcc161d0-5ff9-4b99-b9d2-8e8faf669350", "metadata": { "id": "dcc161d0-5ff9-4b99-b9d2-8e8faf669350", "outputId": "b409efb4-1258-4a76-9866-711306090845", "trusted": true }, "outputs": [], "source": [ "spark.table(f\"{catalog_name}.{database_name}.{table_name}\") \\\n", " .show()" ] }, { "cell_type": "code", "execution_count": null, "id": "951e88f5-5d0a-444f-a85b-18f9736856c9", "metadata": { "id": "951e88f5-5d0a-444f-a85b-18f9736856c9", "outputId": "9afddf1c-acc4-473b-e998-a1058078710b", "trusted": true }, "outputs": [], "source": [ "spark.table(f\"{catalog_name}.{database_name}.{table_name}.history\") \\\n", " .show()" ] }, { "cell_type": "markdown", "id": "6dbba46c-ace4-4388-b17e-c3d12d50756c", "metadata": { "id": "6dbba46c-ace4-4388-b17e-c3d12d50756c" }, "source": [ "## Overwrite records into Iceberg table" ] }, { "cell_type": "code", "execution_count": null, "id": "555aba1f-8abb-4b28-9b93-93ed6937410a", "metadata": { "id": "555aba1f-8abb-4b28-9b93-93ed6937410a", "outputId": "e2fbfa24-a90f-42fb-9f0d-a3a18b61b140", "trusted": true }, "outputs": [], "source": [ "df_products = spark.table(f\"{catalog_name}.{database_name}.{table_name}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "018c1327-744f-4055-8d4f-cc666e67be03", "metadata": { "id": "018c1327-744f-4055-8d4f-cc666e67be03", "outputId": "75bbac08-f8cf-43b7-ceee-21e70e7cb355", "trusted": true }, "outputs": [], "source": [ "from pyspark.sql.functions import lit\n", "\n", "ut = time.time()\n", "\n", "df_products_overwrites = df_products \\\n", " .withColumn(\"price\", df_products.price*1.5) \\\n", " .withColumn(\"updated_at\", lit(ut))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "61ec5a97-f8e9-44fd-b2b8-5ddfdbcf0eac", "metadata": { "id": "61ec5a97-f8e9-44fd-b2b8-5ddfdbcf0eac", "outputId": "a30b5ad2-2ad7-4c94-bd04-cb5fee9088df", "trusted": true }, "outputs": [], "source": [ "df_products_overwrites.writeTo(f\"{catalog_name}.{database_name}.{table_name}\").overwritePartitions()" ] }, { "cell_type": "code", "execution_count": null, "id": "014ce305-aaaa-41f3-ab3f-02bd8508f63c", "metadata": { "id": "014ce305-aaaa-41f3-ab3f-02bd8508f63c", "outputId": "b2f3806a-8674-4636-a47a-3d5d7672c0f3", "trusted": true }, "outputs": [], "source": [ "spark.table(f\"{catalog_name}.{database_name}.{table_name}\") \\\n", " .show()" ] }, { "cell_type": "code", "execution_count": null, "id": "5bae8d97-76f3-4b4c-b863-0ffb87ce4ce8", "metadata": { "id": "5bae8d97-76f3-4b4c-b863-0ffb87ce4ce8", "outputId": "e9560b49-5524-47ea-d276-d4f07f64146c", "trusted": true }, "outputs": [], "source": [ "spark.table(f\"{catalog_name}.{database_name}.{table_name}.history\") \\\n", " .show()" ] }, { "cell_type": "markdown", "id": "6e114b41", "metadata": {}, "source": [ "## Stop Session" ] }, { "cell_type": "code", "execution_count": null, "id": "539c31ae-9c97-46da-8bb6-cdface9758a4", "metadata": { "id": "539c31ae-9c97-46da-8bb6-cdface9758a4", "outputId": "7e048cbf-f37a-45d2-87f0-221c3f61a529", "trusted": true }, "outputs": [], "source": [ "%stop_session" ] } ], "metadata": { "colab": { "name": "iceberg_df.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Glue PySpark", "language": "python", "name": "glue_pyspark" } }, "nbformat": 4, "nbformat_minor": 5 }