{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", "\n", "# 9 - Redshift - Append, Overwrite and Upsert\n", "\n", "awswrangler's `copy/to_sql` function has three different `mode` options for Redshift.\n", "\n", "1 - `append`\n", "\n", "2 - `overwrite`\n", "\n", "3 - `upsert`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install the optional modules first\n", "!pip install 'awswrangler[redshift]'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import awswrangler as wr\n", "import pandas as pd\n", "from datetime import date\n", "\n", "con = wr.redshift.connect(\"aws-sdk-pandas-redshift\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Enter your bucket name:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ···········································\n" ] } ], "source": [ "import getpass\n", "bucket = getpass.getpass()\n", "path = f\"s3://{bucket}/stage/\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Enter your IAM ROLE ARN:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ····················································································\n" ] } ], "source": [ "iam_role = getpass.getpass()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating the table (Overwriting if it exists)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idvaluedate
02boo2020-01-02
11foo2020-01-01
\n", "
" ], "text/plain": [ " id value date\n", "0 2 boo 2020-01-02\n", "1 1 foo 2020-01-01" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " \"id\": [1, 2],\n", " \"value\": [\"foo\", \"boo\"],\n", " \"date\": [date(2020, 1, 1), date(2020, 1, 2)]\n", "})\n", "\n", "wr.redshift.copy(\n", " df=df,\n", " path=path,\n", " con=con,\n", " schema=\"public\",\n", " table=\"my_table\",\n", " mode=\"overwrite\",\n", " iam_role=iam_role,\n", " primary_keys=[\"id\"]\n", ")\n", "\n", "wr.redshift.read_sql_table(table=\"my_table\", schema=\"public\", con=con)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Appending" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idvaluedate
01foo2020-01-01
12boo2020-01-02
23bar2020-01-03
\n", "
" ], "text/plain": [ " id value date\n", "0 1 foo 2020-01-01\n", "1 2 boo 2020-01-02\n", "2 3 bar 2020-01-03" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " \"id\": [3],\n", " \"value\": [\"bar\"],\n", " \"date\": [date(2020, 1, 3)]\n", "})\n", "\n", "wr.redshift.copy(\n", " df=df,\n", " path=path,\n", " con=con,\n", " schema=\"public\",\n", " table=\"my_table\",\n", " mode=\"append\",\n", " iam_role=iam_role,\n", " primary_keys=[\"id\"]\n", ")\n", "\n", "wr.redshift.read_sql_table(table=\"my_table\", schema=\"public\", con=con)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Upserting" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idvaluedate
01foo2020-01-01
12xoo2020-01-02
23bar2020-01-03
\n", "
" ], "text/plain": [ " id value date\n", "0 1 foo 2020-01-01\n", "1 2 xoo 2020-01-02\n", "2 3 bar 2020-01-03" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " \"id\": [2, 3],\n", " \"value\": [\"xoo\", \"bar\"],\n", " \"date\": [date(2020, 1, 2), date(2020, 1, 3)]\n", "})\n", "\n", "wr.redshift.copy(\n", " df=df,\n", " path=path,\n", " con=con,\n", " schema=\"public\",\n", " table=\"my_table\",\n", " mode=\"upsert\",\n", " iam_role=iam_role,\n", " primary_keys=[\"id\"]\n", ")\n", "\n", "wr.redshift.read_sql_table(table=\"my_table\", schema=\"public\", con=con)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning Up" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "with con.cursor() as cursor:\n", " cursor.execute(\"DROP TABLE public.my_table\")\n", "con.close()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.14", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.14" }, "pycharm": { "stem_cell": { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [] } } }, "nbformat": 4, "nbformat_minor": 4 }