{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", "\n", "# 12 - CSV Crawler\n", "\n", "[awswrangler](https://github.com/aws/aws-sdk-pandas) can extract only the metadata from a Pandas DataFrame and then add it can be added to Glue Catalog as a table." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import awswrangler as wr\n", "from datetime import datetime\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Enter your bucket name:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdin", "output_type": "stream", "text": [ " ············\n" ] } ], "source": [ "import getpass\n", "bucket = getpass.getpass()\n", "path = f\"s3://{bucket}/csv_crawler/\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating a Pandas DataFrame" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idstringfloatdatetimestampboolpar0par1
01foo1.02020-01-012020-01-01 00:00:00True1a
12NoneNaNNoneNaTNone1b
23boo2.02020-01-022020-01-02 00:00:01False2b
\n", "
" ], "text/plain": [ " id string float date timestamp bool par0 par1\n", "0 1 foo 1.0 2020-01-01 2020-01-01 00:00:00 True 1 a\n", "1 2 None NaN None NaT None 1 b\n", "2 3 boo 2.0 2020-01-02 2020-01-02 00:00:01 False 2 b" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ts = lambda x: datetime.strptime(x, \"%Y-%m-%d %H:%M:%S.%f\") # noqa\n", "dt = lambda x: datetime.strptime(x, \"%Y-%m-%d\").date() # noqa\n", "\n", "df = pd.DataFrame(\n", " {\n", " \"id\": [1, 2, 3],\n", " \"string\": [\"foo\", None, \"boo\"],\n", " \"float\": [1.0, None, 2.0],\n", " \"date\": [dt(\"2020-01-01\"), None, dt(\"2020-01-02\")],\n", " \"timestamp\": [ts(\"2020-01-01 00:00:00.0\"), None, ts(\"2020-01-02 00:00:01.0\")],\n", " \"bool\": [True, None, False],\n", " \"par0\": [1, 1, 2],\n", " \"par1\": [\"a\", \"b\", \"b\"],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extracting the metadata" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "columns_types, partitions_types = wr.catalog.extract_athena_types(\n", " df=df,\n", " file_format=\"csv\",\n", " index=False,\n", " partition_cols=[\"par0\", \"par1\"]\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 'bigint',\n", " 'string': 'string',\n", " 'float': 'double',\n", " 'date': 'date',\n", " 'timestamp': 'timestamp',\n", " 'bool': 'boolean'}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns_types" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'par0': 'bigint', 'par1': 'string'}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "partitions_types" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating the table" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "wr.catalog.create_csv_table(\n", " table=\"csv_crawler\",\n", " database=\"awswrangler_test\",\n", " path=path,\n", " partitions_types=partitions_types,\n", " columns_types=columns_types,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Checking" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Column NameTypePartitionComment
0idbigintFalse
1stringstringFalse
2floatdoubleFalse
3datedateFalse
4timestamptimestampFalse
5boolbooleanFalse
6par0bigintTrue
7par1stringTrue
\n", "
" ], "text/plain": [ " Column Name Type Partition Comment\n", "0 id bigint False \n", "1 string string False \n", "2 float double False \n", "3 date date False \n", "4 timestamp timestamp False \n", "5 bool boolean False \n", "6 par0 bigint True \n", "7 par1 string True " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.table(database=\"awswrangler_test\", table=\"csv_crawler\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## We can still using the extracted metadata to ensure all data types consistence to new data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idstringfloatdatetimestampboolpar0par1
01112020-01-012020-01-0211a
\n", "
" ], "text/plain": [ " id string float date timestamp bool par0 par1\n", "0 1 1 1 2020-01-01 2020-01-02 1 1 a" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"id\": [1],\n", " \"string\": [\"1\"],\n", " \"float\": [1],\n", " \"date\": [ts(\"2020-01-01 00:00:00.0\")],\n", " \"timestamp\": [dt(\"2020-01-02\")],\n", " \"bool\": [1],\n", " \"par0\": [1],\n", " \"par1\": [\"a\"],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "res = wr.s3.to_csv(\n", " df=df,\n", " path=path,\n", " index=False,\n", " dataset=True,\n", " database=\"awswrangler_test\",\n", " table=\"csv_crawler\",\n", " partition_cols=[\"par0\", \"par1\"],\n", " dtype=columns_types\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## You can also extract the metadata directly from the Catalog if you want" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "dtype = wr.catalog.get_table_types(database=\"awswrangler_test\", table=\"csv_crawler\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "res = wr.s3.to_csv(\n", " df=df,\n", " path=path,\n", " index=False,\n", " dataset=True,\n", " database=\"awswrangler_test\",\n", " table=\"csv_crawler\",\n", " partition_cols=[\"par0\", \"par1\"],\n", " dtype=dtype\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Checking out" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idstringfloatdatetimestampboolpar0par1
0111.0None2020-01-02True1a
1111.0None2020-01-02True1a
\n", "
" ], "text/plain": [ " id string float date timestamp bool par0 par1\n", "0 1 1 1.0 None 2020-01-02 True 1 a\n", "1 1 1 1.0 None 2020-01-02 True 1 a" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = wr.athena.read_sql_table(database=\"awswrangler_test\", table=\"csv_crawler\")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id Int64\n", "string string\n", "float float64\n", "date object\n", "timestamp datetime64[ns]\n", "bool boolean\n", "par0 Int64\n", "par1 string\n", "dtype: object" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning Up S3" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "wr.s3.delete_objects(path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning Up the Database" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=\"csv_crawler\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.14", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.14" } }, "nbformat": 4, "nbformat_minor": 4 }