{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", "\n", "# 10 - Parquet Crawler\n", "\n", "[awswrangler](https://github.com/aws/aws-sdk-pandas) can extract only the metadata from Parquet files and Partitions and then add it to the Glue Catalog." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import awswrangler as wr" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Enter your bucket name:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdin", "output_type": "stream", "text": [ " ············\n" ] } ], "source": [ "import getpass\n", "bucket = getpass.getpass()\n", "path = f\"s3://{bucket}/data/\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating a Parquet Table from the NOAA's CSV files\n", "\n", "[Reference](https://registry.opendata.aws/noaa-ghcn/)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddtelementvaluem_flagq_flags_flagobs_time
0AGE001350391890-01-01TMAX160NaNNaNENaN
1AGE001350391890-01-01TMIN30NaNNaNENaN
2AGE001350391890-01-01PRCP45NaNNaNENaN
3AGE001477051890-01-01TMAX140NaNNaNENaN
4AGE001477051890-01-01TMIN74NaNNaNENaN
...........................
29249753UZM000384571899-12-31PRCP16NaNNaNrNaN
29249754UZM000384571899-12-31TAVG-73NaNNaNrNaN
29249755UZM000386181899-12-31TMIN-76NaNNaNrNaN
29249756UZM000386181899-12-31PRCP0NaNNaNrNaN
29249757UZM000386181899-12-31TAVG-60NaNNaNrNaN
\n", "

29249758 rows × 8 columns

\n", "
" ], "text/plain": [ " id dt element value m_flag q_flag s_flag obs_time\n", "0 AGE00135039 1890-01-01 TMAX 160 NaN NaN E NaN\n", "1 AGE00135039 1890-01-01 TMIN 30 NaN NaN E NaN\n", "2 AGE00135039 1890-01-01 PRCP 45 NaN NaN E NaN\n", "3 AGE00147705 1890-01-01 TMAX 140 NaN NaN E NaN\n", "4 AGE00147705 1890-01-01 TMIN 74 NaN NaN E NaN\n", "... ... ... ... ... ... ... ... ...\n", "29249753 UZM00038457 1899-12-31 PRCP 16 NaN NaN r NaN\n", "29249754 UZM00038457 1899-12-31 TAVG -73 NaN NaN r NaN\n", "29249755 UZM00038618 1899-12-31 TMIN -76 NaN NaN r NaN\n", "29249756 UZM00038618 1899-12-31 PRCP 0 NaN NaN r NaN\n", "29249757 UZM00038618 1899-12-31 TAVG -60 NaN NaN r NaN\n", "\n", "[29249758 rows x 8 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols = [\"id\", \"dt\", \"element\", \"value\", \"m_flag\", \"q_flag\", \"s_flag\", \"obs_time\"]\n", "\n", "df = wr.s3.read_csv(\n", " path=\"s3://noaa-ghcn-pds/csv/by_year/189\",\n", " names=cols,\n", " parse_dates=[\"dt\", \"obs_time\"]) # Read 10 files from the 1890 decade (~1GB)\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddtelementvaluem_flagq_flags_flagobs_timeyear
0AGE001350391890-01-01TMAX160NaNNaNENaN1890
1AGE001350391890-01-01TMIN30NaNNaNENaN1890
2AGE001350391890-01-01PRCP45NaNNaNENaN1890
\n", "
" ], "text/plain": [ " id dt element value m_flag q_flag s_flag obs_time year\n", "0 AGE00135039 1890-01-01 TMAX 160 NaN NaN E NaN 1890\n", "1 AGE00135039 1890-01-01 TMIN 30 NaN NaN E NaN 1890\n", "2 AGE00135039 1890-01-01 PRCP 45 NaN NaN E NaN 1890" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"year\"] = df[\"dt\"].dt.year\n", "\n", "df.head(3)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "res = wr.s3.to_parquet(\n", " df=df,\n", " path=path,\n", " dataset=True,\n", " mode=\"overwrite\",\n", " partition_cols=[\"year\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['year=1890/06a519afcf8e48c9b08c8908f30adcfe.snappy.parquet',\n", " 'year=1891/5a99c28dbef54008bfc770c946099e02.snappy.parquet',\n", " 'year=1892/9b1ea5d1cfad40f78c920f93540ca8ec.snappy.parquet',\n", " 'year=1893/92259b49c134401eaf772506ee802af6.snappy.parquet',\n", " 'year=1894/c734469ffff944f69dc277c630064a16.snappy.parquet',\n", " 'year=1895/cf7ccde86aaf4d138f86c379c0817aa6.snappy.parquet',\n", " 'year=1896/ce02f4c2c554438786b766b33db451b6.snappy.parquet',\n", " 'year=1897/e04de04ad3c444deadcc9c410ab97ca1.snappy.parquet',\n", " 'year=1898/acb0e02878f04b56a6200f4b5a97be0e.snappy.parquet',\n", " 'year=1899/a269bdbb0f6a48faac55f3bcfef7df7a.snappy.parquet']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[ x.split(\"data/\", 1)[1] for x in wr.s3.list_objects(path)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Crawling!" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.81 s, sys: 528 ms, total: 2.33 s\n", "Wall time: 3.21 s\n" ] } ], "source": [ "%%time\n", "\n", "res = wr.s3.store_parquet_metadata(\n", " path=path,\n", " database=\"awswrangler_test\",\n", " table=\"crawler\",\n", " dataset=True,\n", " mode=\"overwrite\",\n", " dtype={\"year\": \"int\"}\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Checking" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Column NameTypePartitionComment
0idstringFalse
1dttimestampFalse
2elementstringFalse
3valuebigintFalse
4m_flagstringFalse
5q_flagstringFalse
6s_flagstringFalse
7obs_timestringFalse
8yearintTrue
\n", "
" ], "text/plain": [ " Column Name Type Partition Comment\n", "0 id string False \n", "1 dt timestamp False \n", "2 element string False \n", "3 value bigint False \n", "4 m_flag string False \n", "5 q_flag string False \n", "6 s_flag string False \n", "7 obs_time string False \n", "8 year int True " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.catalog.table(database=\"awswrangler_test\", table=\"crawler\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.52 s, sys: 811 ms, total: 4.33 s\n", "Wall time: 9.6 s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddtelementvaluem_flagq_flags_flagobs_timeyear
0USC001951451890-01-01TMIN-28<NA><NA>6<NA>1890
1USC001967701890-01-01PRCP0P<NA>6<NA>1890
2USC001967701890-01-01SNOW0<NA><NA>6<NA>1890
3USC001969151890-01-01PRCP0P<NA>6<NA>1890
4USC001969151890-01-01SNOW0<NA><NA>6<NA>1890
..............................
6139ASN000220061890-12-03PRCP0<NA><NA>a<NA>1890
6140ASN000220071890-12-03PRCP0<NA><NA>a<NA>1890
6141ASN000220081890-12-03PRCP0<NA><NA>a<NA>1890
6142ASN000220091890-12-03PRCP0<NA><NA>a<NA>1890
6143ASN000220111890-12-03PRCP0<NA><NA>a<NA>1890
\n", "

1276246 rows × 9 columns

\n", "
" ], "text/plain": [ " id dt element value m_flag q_flag s_flag obs_time \\\n", "0 USC00195145 1890-01-01 TMIN -28 6 \n", "1 USC00196770 1890-01-01 PRCP 0 P 6 \n", "2 USC00196770 1890-01-01 SNOW 0 6 \n", "3 USC00196915 1890-01-01 PRCP 0 P 6 \n", "4 USC00196915 1890-01-01 SNOW 0 6 \n", "... ... ... ... ... ... ... ... ... \n", "6139 ASN00022006 1890-12-03 PRCP 0 a \n", "6140 ASN00022007 1890-12-03 PRCP 0 a \n", "6141 ASN00022008 1890-12-03 PRCP 0 a \n", "6142 ASN00022009 1890-12-03 PRCP 0 a \n", "6143 ASN00022011 1890-12-03 PRCP 0 a \n", "\n", " year \n", "0 1890 \n", "1 1890 \n", "2 1890 \n", "3 1890 \n", "4 1890 \n", "... ... \n", "6139 1890 \n", "6140 1890 \n", "6141 1890 \n", "6142 1890 \n", "6143 1890 \n", "\n", "[1276246 rows x 9 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "wr.athena.read_sql_query(\"SELECT * FROM crawler WHERE year=1890\", database=\"awswrangler_test\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning Up S3" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "wr.s3.delete_objects(path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning Up the Database" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "for table in wr.catalog.get_tables(database=\"awswrangler_test\"):\n", " wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.14", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.14" }, "pycharm": { "stem_cell": { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [] } } }, "nbformat": 4, "nbformat_minor": 4 }