{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[](https://github.com/aws/aws-sdk-pandas)\n",
"\n",
"# 5 - Glue Catalog\n",
"\n",
"[awswrangler](https://github.com/aws/aws-sdk-pandas) makes heavy use of [Glue Catalog](https://aws.amazon.com/glue/) to store metadata of tables and connections."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import awswrangler as wr\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enter your bucket name:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ············\n"
]
}
],
"source": [
"import getpass\n",
"bucket = getpass.getpass()\n",
"path = f\"s3://{bucket}/data/\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating a Pandas DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" name | \n",
" price | \n",
" in_stock | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" shoes | \n",
" 50.3 | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" tshirt | \n",
" 10.5 | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" ball | \n",
" 20.0 | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id name price in_stock\n",
"0 1 shoes 50.3 True\n",
"1 2 tshirt 10.5 True\n",
"2 3 ball 20.0 False"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" \"id\": [1, 2, 3],\n",
" \"name\": [\"shoes\", \"tshirt\", \"ball\"],\n",
" \"price\": [50.3, 10.5, 20.0],\n",
" \"in_stock\": [True, True, False]\n",
"})\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checking Glue Catalog Databases"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Database Description\n",
"0 aws_sdk_pandas AWS SDK for pandas Test Arena - Glue Database\n",
"1 default Default Hive database\n"
]
}
],
"source": [
"databases = wr.catalog.databases()\n",
"print(databases)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the database awswrangler_test if not exists"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Database Description\n",
"0 aws_sdk_pandas AWS SDK for pandas Test Arena - Glue Database\n",
"1 awswrangler_test \n",
"2 default Default Hive database\n"
]
}
],
"source": [
"if \"awswrangler_test\" not in databases.values:\n",
" wr.catalog.create_database(\"awswrangler_test\")\n",
" print(wr.catalog.databases())\n",
"else:\n",
" print(\"Database awswrangler_test already exists\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checking the empty database"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Database | \n",
" Table | \n",
" Description | \n",
" Columns | \n",
" Partitions | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Database, Table, Description, Columns, Partitions]\n",
"Index: []"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(database=\"awswrangler_test\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Writing DataFrames to Data Lake (S3 + Parquet + Glue Catalog)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"desc = \"This is my product table.\"\n",
"\n",
"param = {\n",
" \"source\": \"Product Web Service\",\n",
" \"class\": \"e-commerce\"\n",
"}\n",
"\n",
"comments = {\n",
" \"id\": \"Unique product ID.\",\n",
" \"name\": \"Product name\",\n",
" \"price\": \"Product price (dollar)\",\n",
" \"in_stock\": \"Is this product availaible in the stock?\"\n",
"}\n",
"\n",
"res = wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/products/\",\n",
" dataset=True,\n",
" database=\"awswrangler_test\",\n",
" table=\"products\",\n",
" mode=\"overwrite\",\n",
" glue_table_settings=wr.typing.GlueTableSettings(\n",
" description=desc,\n",
" parameters=param,\n",
" columns_comments=comments\n",
" ),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Checking Glue Catalog (AWS Console)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Looking Up for the new table!"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Database | \n",
" Table | \n",
" Description | \n",
" Columns | \n",
" Partitions | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" awswrangler_test | \n",
" products | \n",
" This is my product table. | \n",
" id, name, price, in_stock | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Database Table Description \\\n",
"0 awswrangler_test products This is my product table. \n",
"\n",
" Columns Partitions \n",
"0 id, name, price, in_stock "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(name_contains=\"roduc\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Database | \n",
" Table | \n",
" Description | \n",
" Columns | \n",
" Partitions | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" awswrangler_test | \n",
" products | \n",
" This is my product table. | \n",
" id, name, price, in_stock | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Database Table Description \\\n",
"0 awswrangler_test products This is my product table. \n",
"\n",
" Columns Partitions \n",
"0 id, name, price, in_stock "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(name_prefix=\"pro\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Database | \n",
" Table | \n",
" Description | \n",
" Columns | \n",
" Partitions | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" awswrangler_test | \n",
" products | \n",
" This is my product table. | \n",
" id, name, price, in_stock | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Database Table Description \\\n",
"0 awswrangler_test products This is my product table. \n",
"\n",
" Columns Partitions \n",
"0 id, name, price, in_stock "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(name_suffix=\"ts\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Database | \n",
" Table | \n",
" Description | \n",
" Columns | \n",
" Partitions | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" awswrangler_test | \n",
" products | \n",
" This is my product table. | \n",
" id, name, price, in_stock | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Database Table Description \\\n",
"0 awswrangler_test products This is my product table. \n",
"\n",
" Columns Partitions \n",
"0 id, name, price, in_stock "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.tables(search_text=\"This is my\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Getting tables details"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Column Name | \n",
" Type | \n",
" Partition | \n",
" Comment | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" id | \n",
" bigint | \n",
" False | \n",
" Unique product ID. | \n",
"
\n",
" \n",
" 1 | \n",
" name | \n",
" string | \n",
" False | \n",
" Product name | \n",
"
\n",
" \n",
" 2 | \n",
" price | \n",
" double | \n",
" False | \n",
" Product price (dollar) | \n",
"
\n",
" \n",
" 3 | \n",
" in_stock | \n",
" boolean | \n",
" False | \n",
" Is this product availaible in the stock? | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Column Name Type Partition Comment\n",
"0 id bigint False Unique product ID.\n",
"1 name string False Product name\n",
"2 price double False Product price (dollar)\n",
"3 in_stock boolean False Is this product availaible in the stock?"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.table(database=\"awswrangler_test\", table=\"products\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Up the Database"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"for table in wr.catalog.get_tables(database=\"awswrangler_test\"):\n",
" wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete Database"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"wr.catalog.delete_database('awswrangler_test')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13 (main, Aug 2 2022, 15:07:42) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
},
"vscode": {
"interpreter": {
"hash": "bd595004b250e5f4145a0d632609b0d8f97d1ccd278d58fafd6840c0467021f9"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}