{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", "\n", "# 5 - Glue Catalog\n", "\n", "[awswrangler](https://github.com/aws/aws-sdk-pandas) makes heavy use of [Glue Catalog](https://aws.amazon.com/glue/) to store metadata of tables and connections." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import awswrangler as wr\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Enter your bucket name:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ············\n" ] } ], "source": [ "import getpass\n", "bucket = getpass.getpass()\n", "path = f\"s3://{bucket}/data/\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating a Pandas DataFrame" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamepricein_stock
01shoes50.3True
12tshirt10.5True
23ball20.0False
\n", "
" ], "text/plain": [ " id name price in_stock\n", "0 1 shoes 50.3 True\n", "1 2 tshirt 10.5 True\n", "2 3 ball 20.0 False" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " \"id\": [1, 2, 3],\n", " \"name\": [\"shoes\", \"tshirt\", \"ball\"],\n", " \"price\": [50.3, 10.5, 20.0],\n", " \"in_stock\": [True, True, False]\n", "})\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Checking Glue Catalog Databases" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Database Description\n", "0 aws_sdk_pandas AWS SDK for pandas Test Arena - Glue Database\n", "1 default Default Hive database\n" ] } ], "source": [ "databases = wr.catalog.databases()\n", "print(databases)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create the database awswrangler_test if not exists" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Database Description\n", "0 aws_sdk_pandas AWS SDK for pandas Test Arena - Glue Database\n", "1 awswrangler_test \n", "2 default Default Hive database\n" ] } ], "source": [ "if \"awswrangler_test\" not in databases.values:\n", " wr.catalog.create_database(\"awswrangler_test\")\n", " print(wr.catalog.databases())\n", "else:\n", " print(\"Database awswrangler_test already exists\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Checking the empty database" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatabaseTableDescriptionColumnsPartitions
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Database, Table, Description, Columns, Partitions]\n", "Index: []" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.tables(database=\"awswrangler_test\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Writing DataFrames to Data Lake (S3 + Parquet + Glue Catalog)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "desc = \"This is my product table.\"\n", "\n", "param = {\n", " \"source\": \"Product Web Service\",\n", " \"class\": \"e-commerce\"\n", "}\n", "\n", "comments = {\n", " \"id\": \"Unique product ID.\",\n", " \"name\": \"Product name\",\n", " \"price\": \"Product price (dollar)\",\n", " \"in_stock\": \"Is this product availaible in the stock?\"\n", "}\n", "\n", "res = wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/products/\",\n", " dataset=True,\n", " database=\"awswrangler_test\",\n", " table=\"products\",\n", " mode=\"overwrite\",\n", " glue_table_settings=wr.typing.GlueTableSettings(\n", " description=desc,\n", " parameters=param,\n", " columns_comments=comments\n", " ),\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Checking Glue Catalog (AWS Console)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![Glue Console](_static/glue_catalog_table_products.png \"Glue Console\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Looking Up for the new table!" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatabaseTableDescriptionColumnsPartitions
0awswrangler_testproductsThis is my product table.id, name, price, in_stock
\n", "
" ], "text/plain": [ " Database Table Description \\\n", "0 awswrangler_test products This is my product table. \n", "\n", " Columns Partitions \n", "0 id, name, price, in_stock " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.tables(name_contains=\"roduc\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatabaseTableDescriptionColumnsPartitions
0awswrangler_testproductsThis is my product table.id, name, price, in_stock
\n", "
" ], "text/plain": [ " Database Table Description \\\n", "0 awswrangler_test products This is my product table. \n", "\n", " Columns Partitions \n", "0 id, name, price, in_stock " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.tables(name_prefix=\"pro\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatabaseTableDescriptionColumnsPartitions
0awswrangler_testproductsThis is my product table.id, name, price, in_stock
\n", "
" ], "text/plain": [ " Database Table Description \\\n", "0 awswrangler_test products This is my product table. \n", "\n", " Columns Partitions \n", "0 id, name, price, in_stock " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.tables(name_suffix=\"ts\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatabaseTableDescriptionColumnsPartitions
0awswrangler_testproductsThis is my product table.id, name, price, in_stock
\n", "
" ], "text/plain": [ " Database Table Description \\\n", "0 awswrangler_test products This is my product table. \n", "\n", " Columns Partitions \n", "0 id, name, price, in_stock " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.tables(search_text=\"This is my\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Getting tables details" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Column NameTypePartitionComment
0idbigintFalseUnique product ID.
1namestringFalseProduct name
2pricedoubleFalseProduct price (dollar)
3in_stockbooleanFalseIs this product availaible in the stock?
\n", "
" ], "text/plain": [ " Column Name Type Partition Comment\n", "0 id bigint False Unique product ID.\n", "1 name string False Product name\n", "2 price double False Product price (dollar)\n", "3 in_stock boolean False Is this product availaible in the stock?" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.table(database=\"awswrangler_test\", table=\"products\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning Up the Database" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "for table in wr.catalog.get_tables(database=\"awswrangler_test\"):\n", " wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Delete Database" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "wr.catalog.delete_database('awswrangler_test')" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13 (main, Aug 2 2022, 15:07:42) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]" }, "vscode": { "interpreter": { "hash": "bd595004b250e5f4145a0d632609b0d8f97d1ccd278d58fafd6840c0467021f9" } } }, "nbformat": 4, "nbformat_minor": 4 }