{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"[](https://github.com/aws/aws-sdk-pandas)\n",
"\n",
"# 17 - Partition Projection\n",
"\n",
"https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import awswrangler as wr\n",
"import pandas as pd\n",
"from datetime import datetime\n",
"import getpass"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Enter your bucket name:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
" ···········································\n"
]
}
],
"source": [
"bucket = getpass.getpass()"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Integer projection"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" year | \n",
" month | \n",
" day | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2019 | \n",
" 10 | \n",
" 25 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 2020 | \n",
" 11 | \n",
" 26 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 2021 | \n",
" 12 | \n",
" 27 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value year month day\n",
"0 1 2019 10 25\n",
"1 2 2020 11 26\n",
"2 3 2021 12 27"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" \"value\": [1, 2, 3],\n",
" \"year\": [2019, 2020, 2021],\n",
" \"month\": [10, 11, 12],\n",
" \"day\": [25, 26, 27]\n",
"})\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/table_integer/\",\n",
" dataset=True,\n",
" partition_cols=[\"year\", \"month\", \"day\"],\n",
" database=\"default\",\n",
" table=\"table_integer\",\n",
" athena_partition_projection_settings={\n",
" \"projection_types\": {\n",
" \"year\": \"integer\",\n",
" \"month\": \"integer\",\n",
" \"day\": \"integer\"\n",
" },\n",
" \"projection_ranges\": {\n",
" \"year\": \"2000,2025\",\n",
" \"month\": \"1,12\",\n",
" \"day\": \"1,31\"\n",
" },\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" year | \n",
" month | \n",
" day | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3 | \n",
" 2021 | \n",
" 12 | \n",
" 27 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 2020 | \n",
" 11 | \n",
" 26 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 2019 | \n",
" 10 | \n",
" 25 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value year month day\n",
"0 3 2021 12 27\n",
"1 2 2020 11 26\n",
"2 1 2019 10 25"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.athena.read_sql_query(f\"SELECT * FROM table_integer\", database=\"default\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Enum projection"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" city | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" São Paulo | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Tokio | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Seattle | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value city\n",
"0 1 São Paulo\n",
"1 2 Tokio\n",
"2 3 Seattle"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" \"value\": [1, 2, 3],\n",
" \"city\": [\"São Paulo\", \"Tokio\", \"Seattle\"],\n",
"})\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/table_enum/\",\n",
" dataset=True,\n",
" partition_cols=[\"city\"],\n",
" database=\"default\",\n",
" table=\"table_enum\",\n",
" athena_partition_projection_settings={\n",
" \"projection_types\": {\n",
" \"city\": \"enum\",\n",
" },\n",
" \"projection_values\": {\n",
" \"city\": \"São Paulo,Tokio,Seattle\"\n",
" },\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" city | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" São Paulo | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" Seattle | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" Tokio | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value city\n",
"0 1 São Paulo\n",
"1 3 Seattle\n",
"2 2 Tokio"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.athena.read_sql_query(f\"SELECT * FROM table_enum\", database=\"default\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Date projection"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" dt | \n",
" ts | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2020-01-01 | \n",
" 2020-01-01 00:00:00 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 2020-01-02 | \n",
" 2020-01-01 00:00:01 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 2020-01-03 | \n",
" 2020-01-01 00:00:02 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value dt ts\n",
"0 1 2020-01-01 2020-01-01 00:00:00\n",
"1 2 2020-01-02 2020-01-01 00:00:01\n",
"2 3 2020-01-03 2020-01-01 00:00:02"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ts = lambda x: datetime.strptime(x, \"%Y-%m-%d %H:%M:%S\")\n",
"dt = lambda x: datetime.strptime(x, \"%Y-%m-%d\").date()\n",
"\n",
"df = pd.DataFrame({\n",
" \"value\": [1, 2, 3],\n",
" \"dt\": [dt(\"2020-01-01\"), dt(\"2020-01-02\"), dt(\"2020-01-03\")],\n",
" \"ts\": [ts(\"2020-01-01 00:00:00\"), ts(\"2020-01-01 00:00:01\"), ts(\"2020-01-01 00:00:02\")],\n",
"})\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/table_date/\",\n",
" dataset=True,\n",
" partition_cols=[\"dt\", \"ts\"],\n",
" database=\"default\",\n",
" table=\"table_date\",\n",
" athena_partition_projection_settings={\n",
" \"projection_types\": {\n",
" \"dt\": \"date\",\n",
" \"ts\": \"date\",\n",
" },\n",
" \"projection_ranges\": {\n",
" \"dt\": \"2020-01-01,2020-01-03\",\n",
" \"ts\": \"2020-01-01 00:00:00,2020-01-01 00:00:02\"\n",
" },\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" dt | \n",
" ts | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2020-01-01 | \n",
" 2020-01-01 00:00:00 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 2020-01-02 | \n",
" 2020-01-01 00:00:01 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 2020-01-03 | \n",
" 2020-01-01 00:00:02 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value dt ts\n",
"0 1 2020-01-01 2020-01-01 00:00:00\n",
"1 2 2020-01-02 2020-01-01 00:00:01\n",
"2 3 2020-01-03 2020-01-01 00:00:02"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.athena.read_sql_query(f\"SELECT * FROM table_date\", database=\"default\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Injected projection"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" uuid | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 761e2488-a078-11ea-bb37-0242ac130002 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" b89ed095-8179-4635-9537-88592c0f6bc3 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 87adc586-ce88-4f0a-b1c8-bf8e00d32249 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value uuid\n",
"0 1 761e2488-a078-11ea-bb37-0242ac130002\n",
"1 2 b89ed095-8179-4635-9537-88592c0f6bc3\n",
"2 3 87adc586-ce88-4f0a-b1c8-bf8e00d32249"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" \"value\": [1, 2, 3],\n",
" \"uuid\": [\"761e2488-a078-11ea-bb37-0242ac130002\", \"b89ed095-8179-4635-9537-88592c0f6bc3\", \"87adc586-ce88-4f0a-b1c8-bf8e00d32249\"],\n",
"})\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/table_injected/\",\n",
" dataset=True,\n",
" partition_cols=[\"uuid\"],\n",
" database=\"default\",\n",
" table=\"table_injected\",\n",
" athena_partition_projection_settings={\n",
" \"projection_types\": {\n",
" \"uuid\": \"injected\",\n",
" }\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" uuid | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" b89ed095-8179-4635-9537-88592c0f6bc3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value uuid\n",
"0 2 b89ed095-8179-4635-9537-88592c0f6bc3"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.athena.read_sql_query(\n",
" sql=f\"SELECT * FROM table_injected WHERE uuid='b89ed095-8179-4635-9537-88592c0f6bc3'\",\n",
" database=\"default\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Cleaning Up"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.delete_objects(f\"s3://{bucket}/table_integer/\")\n",
"wr.s3.delete_objects(f\"s3://{bucket}/table_enum/\")\n",
"wr.s3.delete_objects(f\"s3://{bucket}/table_date/\")\n",
"wr.s3.delete_objects(f\"s3://{bucket}/table_injected/\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.catalog.delete_table_if_exists(table=\"table_integer\", database=\"default\")\n",
"wr.catalog.delete_table_if_exists(table=\"table_enum\", database=\"default\")\n",
"wr.catalog.delete_table_if_exists(table=\"table_date\", database=\"default\")\n",
"wr.catalog.delete_table_if_exists(table=\"table_injected\", database=\"default\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.14",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.14"
}
},
"nbformat": 4,
"nbformat_minor": 4
}