{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n",
    "\n",
    "# 25 - Redshift - Loading Parquet files with Spectrum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the optional modules first\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
    "%pip install 'awswrangler[redshift]'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Enter your bucket name:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " ···········································\n"
     ]
    }
   ],
   "source": [
    "import getpass\n",
    "\n",
    "# getpass hides the typed value, so the bucket name is not echoed into the output\n",
    "bucket = getpass.getpass()\n",
    "PATH = f\"s3://{bucket}/files/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mocking some Parquet Files on S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col0</th>\n",
       "      <th>col1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>g</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>7</td>\n",
       "      <td>h</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>8</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>9</td>\n",
       "      <td>j</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col0 col1\n",
       "0     0    a\n",
       "1     1    b\n",
       "2     2    c\n",
       "3     3    d\n",
       "4     4    e\n",
       "5     5    f\n",
       "6     6    g\n",
       "7     7    h\n",
       "8     8    i\n",
       "9     9    j"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import awswrangler as wr\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.DataFrame({\n",
    "    \"col0\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n",
    "    \"col1\": [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\"],\n",
    "})\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the frame as a Parquet dataset under PATH, at most 2 rows per file\n",
    "wr.s3.to_parquet(df, PATH, max_rows_by_file=2, dataset=True, mode=\"overwrite\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Crawling the metadata and adding it to the Glue Catalog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'col0': 'bigint', 'col1': 'string'}, None, None)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the schema from the Parquet files under PATH and store it as Glue table aws_sdk_pandas.test\n",
    "wr.s3.store_parquet_metadata(\n",
    "    path=PATH,\n",
    "    database=\"aws_sdk_pandas\",\n",
    "    table=\"test\",\n",
    "    dataset=True,\n",
    "    mode=\"overwrite\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Running the CTAS query to load the data into Redshift storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# connection= names a Glue Catalog connection that describes the Redshift cluster\n",
    "con = wr.redshift.connect(connection=\"aws-sdk-pandas-redshift\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: assumes an external schema named aws_sdk_pandas_external already exists in Redshift\n",
    "# and points at the aws_sdk_pandas Glue database (it is not created in this notebook)\n",
    "query = \"CREATE TABLE public.test AS (SELECT * FROM aws_sdk_pandas_external.test)\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "with con.cursor() as cursor:\n",
    "    cursor.execute(query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Running an INSERT INTO query to load MORE data into Redshift storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_extra = pd.DataFrame({\n",
    "    \"col0\": [10, 11],\n",
    "    \"col1\": [\"k\", \"l\"],\n",
    "})\n",
    "# mode=\"overwrite\" replaces the files under PATH, so the external table now exposes only these rows\n",
    "wr.s3.to_parquet(df_extra, PATH, dataset=True, mode=\"overwrite\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"INSERT INTO public.test (SELECT * FROM aws_sdk_pandas_external.test)\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "with con.cursor() as cursor:\n",
    "    cursor.execute(query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Checking the result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col0</th>\n",
       "      <th>col1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6</td>\n",
       "      <td>g</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10</td>\n",
       "      <td>k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>4</td>\n",
       "      <td>e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>7</td>\n",
       "      <td>h</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>9</td>\n",
       "      <td>j</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>11</td>\n",
       "      <td>l</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col0 col1\n",
       "0      5    f\n",
       "1      1    b\n",
       "2      3    d\n",
       "3      6    g\n",
       "4      8    i\n",
       "5     10    k\n",
       "6      4    e\n",
       "7      0    a\n",
       "8      2    c\n",
       "9      7    h\n",
       "10     9    j\n",
       "11    11    l"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wr.redshift.read_sql_table(con=con, schema=\"public\", table=\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "con.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.14",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.14"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}