{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[](https://github.com/aws/aws-sdk-pandas)\n",
"\n",
"# 10 - Parquet Crawler\n",
"\n",
"[awswrangler](https://github.com/aws/aws-sdk-pandas) can extract only the metadata from Parquet files and Partitions and then add it to the Glue Catalog."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import awswrangler as wr"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enter your bucket name:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
" ············\n"
]
}
],
"source": [
"import getpass\n",
"bucket = getpass.getpass()\n",
"path = f\"s3://{bucket}/data/\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating a Parquet Table from the NOAA's CSV files\n",
"\n",
"[Reference](https://registry.opendata.aws/noaa-ghcn/)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" dt | \n",
" element | \n",
" value | \n",
" m_flag | \n",
" q_flag | \n",
" s_flag | \n",
" obs_time | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AGE00135039 | \n",
" 1890-01-01 | \n",
" TMAX | \n",
" 160 | \n",
" NaN | \n",
" NaN | \n",
" E | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" AGE00135039 | \n",
" 1890-01-01 | \n",
" TMIN | \n",
" 30 | \n",
" NaN | \n",
" NaN | \n",
" E | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" AGE00135039 | \n",
" 1890-01-01 | \n",
" PRCP | \n",
" 45 | \n",
" NaN | \n",
" NaN | \n",
" E | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" AGE00147705 | \n",
" 1890-01-01 | \n",
" TMAX | \n",
" 140 | \n",
" NaN | \n",
" NaN | \n",
" E | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" AGE00147705 | \n",
" 1890-01-01 | \n",
" TMIN | \n",
" 74 | \n",
" NaN | \n",
" NaN | \n",
" E | \n",
" NaN | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 29249753 | \n",
" UZM00038457 | \n",
" 1899-12-31 | \n",
" PRCP | \n",
" 16 | \n",
" NaN | \n",
" NaN | \n",
" r | \n",
" NaN | \n",
"
\n",
" \n",
" 29249754 | \n",
" UZM00038457 | \n",
" 1899-12-31 | \n",
" TAVG | \n",
" -73 | \n",
" NaN | \n",
" NaN | \n",
" r | \n",
" NaN | \n",
"
\n",
" \n",
" 29249755 | \n",
" UZM00038618 | \n",
" 1899-12-31 | \n",
" TMIN | \n",
" -76 | \n",
" NaN | \n",
" NaN | \n",
" r | \n",
" NaN | \n",
"
\n",
" \n",
" 29249756 | \n",
" UZM00038618 | \n",
" 1899-12-31 | \n",
" PRCP | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" r | \n",
" NaN | \n",
"
\n",
" \n",
" 29249757 | \n",
" UZM00038618 | \n",
" 1899-12-31 | \n",
" TAVG | \n",
" -60 | \n",
" NaN | \n",
" NaN | \n",
" r | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
29249758 rows × 8 columns
\n",
"
"
],
"text/plain": [
" id dt element value m_flag q_flag s_flag obs_time\n",
"0 AGE00135039 1890-01-01 TMAX 160 NaN NaN E NaN\n",
"1 AGE00135039 1890-01-01 TMIN 30 NaN NaN E NaN\n",
"2 AGE00135039 1890-01-01 PRCP 45 NaN NaN E NaN\n",
"3 AGE00147705 1890-01-01 TMAX 140 NaN NaN E NaN\n",
"4 AGE00147705 1890-01-01 TMIN 74 NaN NaN E NaN\n",
"... ... ... ... ... ... ... ... ...\n",
"29249753 UZM00038457 1899-12-31 PRCP 16 NaN NaN r NaN\n",
"29249754 UZM00038457 1899-12-31 TAVG -73 NaN NaN r NaN\n",
"29249755 UZM00038618 1899-12-31 TMIN -76 NaN NaN r NaN\n",
"29249756 UZM00038618 1899-12-31 PRCP 0 NaN NaN r NaN\n",
"29249757 UZM00038618 1899-12-31 TAVG -60 NaN NaN r NaN\n",
"\n",
"[29249758 rows x 8 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cols = [\"id\", \"dt\", \"element\", \"value\", \"m_flag\", \"q_flag\", \"s_flag\", \"obs_time\"]\n",
"\n",
"df = wr.s3.read_csv(\n",
" path=\"s3://noaa-ghcn-pds/csv/by_year/189\",\n",
" names=cols,\n",
" parse_dates=[\"dt\", \"obs_time\"]) # Read 10 files from the 1890 decade (~1GB)\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" dt | \n",
" element | \n",
" value | \n",
" m_flag | \n",
" q_flag | \n",
" s_flag | \n",
" obs_time | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AGE00135039 | \n",
" 1890-01-01 | \n",
" TMAX | \n",
" 160 | \n",
" NaN | \n",
" NaN | \n",
" E | \n",
" NaN | \n",
" 1890 | \n",
"
\n",
" \n",
" 1 | \n",
" AGE00135039 | \n",
" 1890-01-01 | \n",
" TMIN | \n",
" 30 | \n",
" NaN | \n",
" NaN | \n",
" E | \n",
" NaN | \n",
" 1890 | \n",
"
\n",
" \n",
" 2 | \n",
" AGE00135039 | \n",
" 1890-01-01 | \n",
" PRCP | \n",
" 45 | \n",
" NaN | \n",
" NaN | \n",
" E | \n",
" NaN | \n",
" 1890 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id dt element value m_flag q_flag s_flag obs_time year\n",
"0 AGE00135039 1890-01-01 TMAX 160 NaN NaN E NaN 1890\n",
"1 AGE00135039 1890-01-01 TMIN 30 NaN NaN E NaN 1890\n",
"2 AGE00135039 1890-01-01 PRCP 45 NaN NaN E NaN 1890"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"year\"] = df[\"dt\"].dt.year\n",
"\n",
"df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"res = wr.s3.to_parquet(\n",
" df=df,\n",
" path=path,\n",
" dataset=True,\n",
" mode=\"overwrite\",\n",
" partition_cols=[\"year\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['year=1890/06a519afcf8e48c9b08c8908f30adcfe.snappy.parquet',\n",
" 'year=1891/5a99c28dbef54008bfc770c946099e02.snappy.parquet',\n",
" 'year=1892/9b1ea5d1cfad40f78c920f93540ca8ec.snappy.parquet',\n",
" 'year=1893/92259b49c134401eaf772506ee802af6.snappy.parquet',\n",
" 'year=1894/c734469ffff944f69dc277c630064a16.snappy.parquet',\n",
" 'year=1895/cf7ccde86aaf4d138f86c379c0817aa6.snappy.parquet',\n",
" 'year=1896/ce02f4c2c554438786b766b33db451b6.snappy.parquet',\n",
" 'year=1897/e04de04ad3c444deadcc9c410ab97ca1.snappy.parquet',\n",
" 'year=1898/acb0e02878f04b56a6200f4b5a97be0e.snappy.parquet',\n",
" 'year=1899/a269bdbb0f6a48faac55f3bcfef7df7a.snappy.parquet']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[ x.split(\"data/\", 1)[1] for x in wr.s3.list_objects(path)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crawling!"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.81 s, sys: 528 ms, total: 2.33 s\n",
"Wall time: 3.21 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"res = wr.s3.store_parquet_metadata(\n",
" path=path,\n",
" database=\"awswrangler_test\",\n",
" table=\"crawler\",\n",
" dataset=True,\n",
" mode=\"overwrite\",\n",
" dtype={\"year\": \"int\"}\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checking"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Column Name | \n",
" Type | \n",
" Partition | \n",
" Comment | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" id | \n",
" string | \n",
" False | \n",
" | \n",
"
\n",
" \n",
" 1 | \n",
" dt | \n",
" timestamp | \n",
" False | \n",
" | \n",
"
\n",
" \n",
" 2 | \n",
" element | \n",
" string | \n",
" False | \n",
" | \n",
"
\n",
" \n",
" 3 | \n",
" value | \n",
" bigint | \n",
" False | \n",
" | \n",
"
\n",
" \n",
" 4 | \n",
" m_flag | \n",
" string | \n",
" False | \n",
" | \n",
"
\n",
" \n",
" 5 | \n",
" q_flag | \n",
" string | \n",
" False | \n",
" | \n",
"
\n",
" \n",
" 6 | \n",
" s_flag | \n",
" string | \n",
" False | \n",
" | \n",
"
\n",
" \n",
" 7 | \n",
" obs_time | \n",
" string | \n",
" False | \n",
" | \n",
"
\n",
" \n",
" 8 | \n",
" year | \n",
" int | \n",
" True | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Column Name Type Partition Comment\n",
"0 id string False \n",
"1 dt timestamp False \n",
"2 element string False \n",
"3 value bigint False \n",
"4 m_flag string False \n",
"5 q_flag string False \n",
"6 s_flag string False \n",
"7 obs_time string False \n",
"8 year int True "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.table(database=\"awswrangler_test\", table=\"crawler\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.52 s, sys: 811 ms, total: 4.33 s\n",
"Wall time: 9.6 s\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" dt | \n",
" element | \n",
" value | \n",
" m_flag | \n",
" q_flag | \n",
" s_flag | \n",
" obs_time | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" USC00195145 | \n",
" 1890-01-01 | \n",
" TMIN | \n",
" -28 | \n",
" <NA> | \n",
" <NA> | \n",
" 6 | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" 1 | \n",
" USC00196770 | \n",
" 1890-01-01 | \n",
" PRCP | \n",
" 0 | \n",
" P | \n",
" <NA> | \n",
" 6 | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" 2 | \n",
" USC00196770 | \n",
" 1890-01-01 | \n",
" SNOW | \n",
" 0 | \n",
" <NA> | \n",
" <NA> | \n",
" 6 | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" 3 | \n",
" USC00196915 | \n",
" 1890-01-01 | \n",
" PRCP | \n",
" 0 | \n",
" P | \n",
" <NA> | \n",
" 6 | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" 4 | \n",
" USC00196915 | \n",
" 1890-01-01 | \n",
" SNOW | \n",
" 0 | \n",
" <NA> | \n",
" <NA> | \n",
" 6 | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 6139 | \n",
" ASN00022006 | \n",
" 1890-12-03 | \n",
" PRCP | \n",
" 0 | \n",
" <NA> | \n",
" <NA> | \n",
" a | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" 6140 | \n",
" ASN00022007 | \n",
" 1890-12-03 | \n",
" PRCP | \n",
" 0 | \n",
" <NA> | \n",
" <NA> | \n",
" a | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" 6141 | \n",
" ASN00022008 | \n",
" 1890-12-03 | \n",
" PRCP | \n",
" 0 | \n",
" <NA> | \n",
" <NA> | \n",
" a | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" 6142 | \n",
" ASN00022009 | \n",
" 1890-12-03 | \n",
" PRCP | \n",
" 0 | \n",
" <NA> | \n",
" <NA> | \n",
" a | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
" 6143 | \n",
" ASN00022011 | \n",
" 1890-12-03 | \n",
" PRCP | \n",
" 0 | \n",
" <NA> | \n",
" <NA> | \n",
" a | \n",
" <NA> | \n",
" 1890 | \n",
"
\n",
" \n",
"
\n",
"
1276246 rows × 9 columns
\n",
"
"
],
"text/plain": [
" id dt element value m_flag q_flag s_flag obs_time \\\n",
"0 USC00195145 1890-01-01 TMIN -28 6 \n",
"1 USC00196770 1890-01-01 PRCP 0 P 6 \n",
"2 USC00196770 1890-01-01 SNOW 0 6 \n",
"3 USC00196915 1890-01-01 PRCP 0 P 6 \n",
"4 USC00196915 1890-01-01 SNOW 0 6 \n",
"... ... ... ... ... ... ... ... ... \n",
"6139 ASN00022006 1890-12-03 PRCP 0 a \n",
"6140 ASN00022007 1890-12-03 PRCP 0 a \n",
"6141 ASN00022008 1890-12-03 PRCP 0 a \n",
"6142 ASN00022009 1890-12-03 PRCP 0 a \n",
"6143 ASN00022011 1890-12-03 PRCP 0 a \n",
"\n",
" year \n",
"0 1890 \n",
"1 1890 \n",
"2 1890 \n",
"3 1890 \n",
"4 1890 \n",
"... ... \n",
"6139 1890 \n",
"6140 1890 \n",
"6141 1890 \n",
"6142 1890 \n",
"6143 1890 \n",
"\n",
"[1276246 rows x 9 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"wr.athena.read_sql_query(\"SELECT * FROM crawler WHERE year=1890\", database=\"awswrangler_test\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Up S3"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"wr.s3.delete_objects(path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Up the Database"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"for table in wr.catalog.get_tables(database=\"awswrangler_test\"):\n",
" wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.14",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.14"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}