"""Amazon Redshift Module."""

import logging
import uuid
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import boto3
import botocore
import pandas as pd
import pyarrow as pa
import redshift_connector

from awswrangler import _data_types
from awswrangler import _databases as _db_utils
from awswrangler import _utils, exceptions, s3
from awswrangler._config import apply_configs

_logger: logging.Logger = logging.getLogger(__name__)

_RS_DISTSTYLES: List[str] = ["AUTO", "EVEN", "ALL", "KEY"]
_RS_SORTSTYLES: List[str] = ["COMPOUND", "INTERLEAVED"]


def _validate_connection(con: redshift_connector.Connection) -> None:
    if not isinstance(con, redshift_connector.Connection):
        raise exceptions.InvalidConnection(
            "Invalid 'con' argument, please pass a redshift_connector.Connection object. "
            "Use redshift_connector.connect() to use credentials directly or "
            "wr.redshift.connect() to fetch it from the Glue Catalog."
        )


def _drop_table(cursor: redshift_connector.Cursor, schema: Optional[str], table: str) -> None:
    schema_str = f'"{schema}".' if schema else ""
    sql = f'DROP TABLE IF EXISTS {schema_str}"{table}"'
    _logger.debug("Drop table query: %s", sql)
    cursor.execute(sql)


def _get_primary_keys(cursor: redshift_connector.Cursor, schema: str, table: str) -> List[str]:
    cursor.execute(f"SELECT indexdef FROM pg_indexes WHERE schemaname = '{schema}' AND tablename = '{table}'")
    result: str = cursor.fetchall()[0][0]
    rfields: List[str] = result.split("(")[1].strip(")").split(",")
    return [field.strip().strip('"') for field in rfields]


def _does_table_exist(cursor: redshift_connector.Cursor, schema: Optional[str], table: str) -> bool:
    schema_str = f"TABLE_SCHEMA = '{schema}' AND" if schema else ""
    cursor.execute(
        "SELECT true WHERE EXISTS ("
        "SELECT * FROM INFORMATION_SCHEMA.TABLES "
        f"WHERE {schema_str} TABLE_NAME = '{table}'"
        ");"
    )
    return len(cursor.fetchall()) > 0


def _make_s3_auth_string(
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None,
    iam_role: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    if aws_access_key_id is not None and aws_secret_access_key is not None:
        auth_str: str = f"ACCESS_KEY_ID '{aws_access_key_id}'\nSECRET_ACCESS_KEY '{aws_secret_access_key}'\n"
        if aws_session_token is not None:
            auth_str += f"SESSION_TOKEN '{aws_session_token}'\n"
    elif iam_role is not None:
        auth_str = f"IAM_ROLE '{iam_role}'\n"
    else:
        credentials: botocore.credentials.ReadOnlyCredentials = _utils.get_credentials_from_session(
            boto3_session=boto3_session
        )
        if credentials.access_key is None or credentials.secret_key is None:
            raise exceptions.InvalidArgument(
                "One of IAM Role or AWS ACCESS_KEY_ID and SECRET_ACCESS_KEY must be given. "
                "Unable to find ACCESS_KEY_ID and SECRET_ACCESS_KEY in boto3 session."
            )
        auth_str = f"ACCESS_KEY_ID '{credentials.access_key}'\nSECRET_ACCESS_KEY '{credentials.secret_key}'\n"
        if credentials.token is not None:
            auth_str += f"SESSION_TOKEN '{credentials.token}'\n"
    return auth_str


def _copy(
    cursor: redshift_connector.Cursor,
    path: str,
    table: str,
    serialize_to_json: bool,
    iam_role: Optional[str] = None,
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    schema: Optional[str] = None,
) -> None:
    if schema is None:
        table_name: str = f'"{table}"'
    else:
        table_name = f'"{schema}"."{table}"'
    auth_str: str = _make_s3_auth_string(
        iam_role=iam_role,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        boto3_session=boto3_session,
    )
    ser_json_str: str = " SERIALIZETOJSON" if serialize_to_json else ""
    sql: str = f"COPY {table_name} FROM '{path}' {auth_str}FORMAT AS PARQUET{ser_json_str}"
    _logger.debug("copy query: %s", sql)
    cursor.execute(sql)


def _upsert(
    cursor: redshift_connector.Cursor,
    table: str,
    temp_table: str,
    schema: str,
    primary_keys: Optional[List[str]] = None,
) -> None:
    if not primary_keys:
        primary_keys = _get_primary_keys(cursor=cursor, schema=schema, table=table)
    _logger.debug("primary_keys: %s", primary_keys)
    if not primary_keys:
        raise exceptions.InvalidRedshiftPrimaryKeys()
    equals_clause: str = f"{table}.%s = {temp_table}.%s"
    join_clause: str = " AND ".join([equals_clause % (pk, pk) for pk in primary_keys])
    sql: str = f'DELETE FROM "{schema}"."{table}" USING {temp_table} WHERE {join_clause}'
    _logger.debug(sql)
    cursor.execute(sql)
    sql = f'INSERT INTO "{schema}"."{table}" SELECT * FROM {temp_table}'
    _logger.debug(sql)
    cursor.execute(sql)
    _drop_table(cursor=cursor, schema=schema, table=temp_table)


def _validate_parameters(
    redshift_types: Dict[str, str],
    diststyle: str,
    distkey: Optional[str],
    sortstyle: str,
    sortkey: Optional[List[str]],
) -> None:
    if diststyle not in _RS_DISTSTYLES:
        raise exceptions.InvalidRedshiftDiststyle(f"diststyle must be in {_RS_DISTSTYLES}")
    cols = list(redshift_types.keys())
    _logger.debug("Redshift columns: %s", cols)
    if (diststyle == "KEY") and (not distkey):
        raise exceptions.InvalidRedshiftDistkey("You must pass a distkey if you intend to use KEY diststyle")
    if distkey and distkey not in cols:
        raise exceptions.InvalidRedshiftDistkey(f"distkey ({distkey}) must be in the columns list: {cols}")
    if sortstyle and sortstyle not in _RS_SORTSTYLES:
        raise exceptions.InvalidRedshiftSortstyle(f"sortstyle must be in {_RS_SORTSTYLES}")
    if sortkey:
        if not isinstance(sortkey, list):
            raise exceptions.InvalidRedshiftSortkey(
                f"sortkey must be a List of items in the columns list: {cols}. Currently value: {sortkey}"
            )
        for key in sortkey:
            if key not in cols:
                raise exceptions.InvalidRedshiftSortkey(
                    f"sortkey must be a List of items in the columns list: {cols}. Currently value: {key}"
                )


def _redshift_types_from_path(
    path: Optional[Union[str, List[str]]],
    varchar_lengths_default: int,
    varchar_lengths: Optional[Dict[str, int]],
    parquet_infer_sampling: float,
    path_suffix: Optional[str],
    path_ignore_suffix: Optional[str],
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Dict[str, str]:
    """Extract Redshift data types from Parquet files on S3."""
    _varchar_lengths: Dict[str, int] = {} if varchar_lengths is None else varchar_lengths
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    _logger.debug("Scanning parquet schemas on s3...")
    athena_types, _ = s3.read_parquet_metadata(
        path=path,
        sampling=parquet_infer_sampling,
        path_suffix=path_suffix,
        path_ignore_suffix=path_ignore_suffix,
        dataset=False,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    _logger.debug("athena_types: %s", athena_types)
    redshift_types: Dict[str, str] = {}
    for col_name, col_type in athena_types.items():
        length: int = _varchar_lengths[col_name] if col_name in _varchar_lengths else varchar_lengths_default
        redshift_types[col_name] = _data_types.athena2redshift(dtype=col_type, varchar_length=length)
    return redshift_types


def _create_table(
    df: Optional[pd.DataFrame],
    path: Optional[Union[str, List[str]]],
    cursor: redshift_connector.Cursor,
    table: str,
    schema: str,
    mode: str,
    index: bool,
    dtype: Optional[Dict[str, str]],
    diststyle: str,
    sortstyle: str,
    distkey: Optional[str],
    sortkey: Optional[List[str]],
    primary_keys: Optional[List[str]],
    varchar_lengths_default: int,
    varchar_lengths: Optional[Dict[str, int]],
    parquet_infer_sampling: float = 1.0,
    path_suffix: Optional[str] = None,
    path_ignore_suffix: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Tuple[str, Optional[str]]:
    if mode == "overwrite":
        _drop_table(cursor=cursor, schema=schema, table=table)
    elif _does_table_exist(cursor=cursor, schema=schema, table=table) is True:
        if mode == "upsert":
            guid: str = uuid.uuid4().hex
            temp_table: str = f"temp_redshift_{guid}"
            sql: str = f'CREATE TEMPORARY TABLE {temp_table} (LIKE "{schema}"."{table}")'
            _logger.debug(sql)
            cursor.execute(sql)
            return temp_table, None
        return table, schema
    diststyle = diststyle.upper() if diststyle else "AUTO"
    sortstyle = sortstyle.upper() if sortstyle else "COMPOUND"
    if df is not None:
        redshift_types: Dict[str, str] = _data_types.database_types_from_pandas(
            df=df,
            index=index,
            dtype=dtype,
            varchar_lengths_default=varchar_lengths_default,
            varchar_lengths=varchar_lengths,
            converter_func=_data_types.pyarrow2redshift,
        )
    elif path is not None:
        redshift_types = _redshift_types_from_path(
            path=path,
            varchar_lengths_default=varchar_lengths_default,
            varchar_lengths=varchar_lengths,
            parquet_infer_sampling=parquet_infer_sampling,
            path_suffix=path_suffix,
            path_ignore_suffix=path_ignore_suffix,
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    else:
        raise ValueError("df and path are None. You MUST pass at least one.")
    _validate_parameters(
        redshift_types=redshift_types,
        diststyle=diststyle,
        distkey=distkey,
        sortstyle=sortstyle,
        sortkey=sortkey,
    )
    cols_str: str = "".join([f"{k} {v},\n" for k, v in redshift_types.items()])[:-2]
    primary_keys_str: str = f",\nPRIMARY KEY ({', '.join(primary_keys)})" if primary_keys else ""
    distkey_str: str = f"\nDISTKEY({distkey})" if distkey and diststyle == "KEY" else ""
    sortkey_str: str = f"\nSORTKEY({', '.join(sortkey)})" if sortkey else ""
    sql = (
        f'CREATE TABLE IF NOT EXISTS "{schema}"."{table}" (\n'
        f"{cols_str}"
        f"{primary_keys_str}"
        f")\nDISTSTYLE {diststyle}"
        f"{distkey_str}"
        f"{sortkey_str}"
    )
    _logger.debug("Create table query: %s", sql)
    cursor.execute(sql)
    return table, schema


def _read_parquet_iterator(
    path: str,
    keep_files: bool,
    use_threads: bool,
    categories: Optional[List[str]],
    chunked: Union[bool, int],
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Iterator[pd.DataFrame]:
    dfs: Iterator[pd.DataFrame] = s3.read_parquet(
        path=path,
        categories=categories,
        chunked=chunked,
        dataset=False,
        use_threads=use_threads,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    yield from dfs
    if keep_files is False:
        s3.delete_objects(
            path=path, use_threads=use_threads, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs
        )
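
# Illustrative sketch (not part of the original module): the helpers above only
# assemble SQL strings, so their behaviour can be previewed without a cluster.
# The role ARN, bucket and table names below are made-up placeholders.
def _demo_auth_string() -> None:
    auth = _make_s3_auth_string(iam_role="arn:aws:iam::123456789012:role/redshift-copy-unload")
    # Expected shape: "IAM_ROLE 'arn:aws:iam::123456789012:role/redshift-copy-unload'\n"
    print(f"COPY \"public\".\"my_table\" FROM 's3://my-bucket/stage/' {auth}FORMAT AS PARQUET")
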
@apply_configs
def connect(
    connection: Optional[str] = None,
    secret_id: Optional[str] = None,
    catalog_id: Optional[str] = None,
    dbname: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    ssl: bool = True,
    timeout: Optional[int] = None,
    max_prepared_statements: int = 1,
    tcp_keepalive: bool = True,
) -> redshift_connector.Connection:
    """Return a redshift_connector connection from a Glue Catalog Connection or Secrets Manager.

    Note
    ----
    You MUST pass a `connection` OR `secret_id`.
    https://github.com/aws/amazon-redshift-python-driver

    Parameters
    ----------
    connection : str, optional
        Glue Catalog Connection name.
    secret_id : str, optional
        Specifies the secret containing the version that you want to retrieve.
        You can specify either the Amazon Resource Name (ARN) or the friendly name of the secret.
    catalog_id : str, optional
        The ID of the Data Catalog. If none is provided, the AWS account ID is used by default.
    dbname : str, optional
        Optional database name to overwrite the stored one.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    ssl : bool
        This governs SSL encryption for TCP/IP sockets.
        This parameter is forwarded to redshift_connector.
    timeout : int, optional
        The time in seconds before the connection to the server times out.
        The default is None, which means no timeout.
        This parameter is forwarded to redshift_connector.
    max_prepared_statements : int
        This parameter is forwarded to redshift_connector.
    tcp_keepalive : bool
        If True then use TCP keepalive. The default is True.
        This parameter is forwarded to redshift_connector.

    Returns
    -------
    redshift_connector.Connection
        redshift_connector connection.

    Examples
    --------
    Fetching Redshift connection from Glue Catalog

    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> with con.cursor() as cursor:
    >>>     cursor.execute("SELECT 1")
    >>>     print(cursor.fetchall())
    >>> con.close()

    Fetching Redshift connection from Secrets Manager

    >>> import awswrangler as wr
    >>> con = wr.redshift.connect(secret_id="MY_SECRET")
    >>> with con.cursor() as cursor:
    >>>     cursor.execute("SELECT 1")
    >>>     print(cursor.fetchall())
    >>> con.close()

    """
    attrs = _db_utils.get_connection_attributes(
        connection=connection, secret_id=secret_id, catalog_id=catalog_id, dbname=dbname, boto3_session=boto3_session
    )
    if attrs.kind != "redshift":
        raise exceptions.InvalidDatabaseType(
            f"Invalid connection type ({attrs.kind}). It must be a redshift connection."
        )
    return redshift_connector.connect(
        user=attrs.user,
        database=attrs.database,
        password=attrs.password,
        port=int(attrs.port),
        host=attrs.host,
        ssl=ssl,
        timeout=timeout,
        max_prepared_statements=max_prepared_statements,
        tcp_keepalive=tcp_keepalive,
    )


@apply_configs
def connect_temp(
    cluster_identifier: str,
    user: str,
    database: Optional[str] = None,
    duration: int = 900,
    auto_create: bool = True,
    db_groups: Optional[List[str]] = None,
    boto3_session: Optional[boto3.Session] = None,
    ssl: bool = True,
    timeout: Optional[int] = None,
    max_prepared_statements: int = 1,
    tcp_keepalive: bool = True,
) -> redshift_connector.Connection:
    """Return a redshift_connector temporary connection (no password required).

    https://github.com/aws/amazon-redshift-python-driver

    Parameters
    ----------
    cluster_identifier : str
        The unique identifier of a cluster. This parameter is case sensitive.
    user : str
        The name of a database user.
    database : str, optional
        Database name. If None, the default database is used.
    duration : int, optional
        The number of seconds until the returned temporary password expires.
        Constraint: minimum 900, maximum 3600. Default: 900.
    auto_create : bool
        Create a database user with the name specified for the user if one does not exist.
    db_groups : List[str], optional
        A list of the names of existing database groups that the user will join for the current
        session, in addition to any group memberships for an existing user.
        If not specified, a new user is added only to PUBLIC.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    ssl : bool
        This governs SSL encryption for TCP/IP sockets.
        This parameter is forwarded to redshift_connector.
    timeout : int, optional
        The time in seconds before the connection to the server times out.
        The default is None, which means no timeout.
        This parameter is forwarded to redshift_connector.
    max_prepared_statements : int
        This parameter is forwarded to redshift_connector.
    tcp_keepalive : bool
        If True then use TCP keepalive. The default is True.
        This parameter is forwarded to redshift_connector.

    Returns
    -------
    redshift_connector.Connection
        redshift_connector connection.

    Examples
    --------
    >>> import awswrangler as wr
    >>> con = wr.redshift.connect_temp(cluster_identifier="my-cluster", user="test")
    >>> with con.cursor() as cursor:
    >>>     cursor.execute("SELECT 1")
    >>>     print(cursor.fetchall())
    >>> con.close()

    """
    client_redshift = _utils.client(service_name="redshift", session=boto3_session)
    args: Dict[str, Any] = {
        "DbUser": user,
        "ClusterIdentifier": cluster_identifier,
        "DurationSeconds": duration,
        "AutoCreate": auto_create,
    }
    if db_groups is not None:
        args["DbGroups"] = db_groups
    res = client_redshift.get_cluster_credentials(**args)
    cluster = client_redshift.describe_clusters(ClusterIdentifier=cluster_identifier)["Clusters"][0]
    return redshift_connector.connect(
        user=res["DbUser"],
        database=database if database else cluster["DBName"],
        password=res["DbPassword"],
        port=cluster["Endpoint"]["Port"],
        host=cluster["Endpoint"]["Address"],
        ssl=ssl,
        timeout=timeout,
        max_prepared_statements=max_prepared_statements,
        tcp_keepalive=tcp_keepalive,
    )
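
# Illustrative sketch (not part of the original module): temporary, IAM-based
# credentials via connect_temp(). Cluster name, user and group are placeholders.
def _demo_connect_temp() -> None:
    con = connect_temp(
        cluster_identifier="my-cluster",
        user="analyst",
        auto_create=True,
        db_groups=["readonly"],
    )
    try:
        with con.cursor() as cursor:
            cursor.execute("SELECT current_user")
            print(cursor.fetchall())
    finally:
        con.close()
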
@apply_configs
def read_sql_query(
    sql: str,
    con: redshift_connector.Connection,
    index_col: Optional[Union[str, List[str]]] = None,
    params: Optional[Union[List[Any], Tuple[Any, ...], Dict[Any, Any]]] = None,
    chunksize: Optional[int] = None,
    dtype: Optional[Dict[str, pa.DataType]] = None,
    safe: bool = True,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Return a DataFrame corresponding to the result set of the query string.

    Note
    ----
    For large extractions (1K+ rows) consider the function **wr.redshift.unload()**.

    Parameters
    ----------
    sql : str
        SQL query.
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    index_col : Union[str, List[str]], optional
        Column(s) to set as index (MultiIndex).
    params : Union[List, Tuple, Dict], optional
        List of parameters to pass to the execute method. The syntax used to pass parameters is
        database driver dependent. Check your database driver documentation for which of the five
        syntax styles, described in PEP 249's paramstyle, is supported.
    chunksize : int, optional
        If specified, return an iterator where chunksize is the number of rows to include in each chunk.
    dtype : Dict[str, pyarrow.DataType], optional
        Specifying the datatype for columns.
        The keys should be the column names and the values should be the PyArrow types.
    safe : bool
        Check for overflows or other unsafe data type conversions.

    Returns
    -------
    Union[pandas.DataFrame, Iterator[pandas.DataFrame]]
        Result as Pandas DataFrame(s).

    Examples
    --------
    Reading from Redshift using a Glue Catalog Connection

    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> df = wr.redshift.read_sql_query(
    ...     sql="SELECT * FROM public.my_table",
    ...     con=con
    ... )
    >>> con.close()

    """
    _validate_connection(con=con)
    return _db_utils.read_sql_query(
        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
    )


@apply_configs
def read_sql_table(
    table: str,
    con: redshift_connector.Connection,
    schema: Optional[str] = None,
    index_col: Optional[Union[str, List[str]]] = None,
    params: Optional[Union[List[Any], Tuple[Any, ...], Dict[Any, Any]]] = None,
    chunksize: Optional[int] = None,
    dtype: Optional[Dict[str, pa.DataType]] = None,
    safe: bool = True,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Return a DataFrame corresponding to the given table.

    Note
    ----
    For large extractions (1K+ rows) consider the function **wr.redshift.unload()**.

    Parameters
    ----------
    table : str
        Table name.
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    schema : str, optional
        Name of the SQL schema in the database to query (if the database flavor supports this).
        Uses the default schema if None (default).
    index_col : Union[str, List[str]], optional
        Column(s) to set as index (MultiIndex).
    params : Union[List, Tuple, Dict], optional
        List of parameters to pass to the execute method. The syntax used to pass parameters is
        database driver dependent. Check your database driver documentation for which of the five
        syntax styles, described in PEP 249's paramstyle, is supported.
    chunksize : int, optional
        If specified, return an iterator where chunksize is the number of rows to include in each chunk.
    dtype : Dict[str, pyarrow.DataType], optional
        Specifying the datatype for columns.
        The keys should be the column names and the values should be the PyArrow types.
    safe : bool
        Check for overflows or other unsafe data type conversions.

    Returns
    -------
    Union[pandas.DataFrame, Iterator[pandas.DataFrame]]
        Result as Pandas DataFrame(s).

    Examples
    --------
    Reading from Redshift using a Glue Catalog Connection

    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> df = wr.redshift.read_sql_table(
    ...     table="my_table",
    ...     schema="public",
    ...     con=con
    ... )
    >>> con.close()

    """
    sql: str = f'SELECT * FROM "{table}"' if schema is None else f'SELECT * FROM "{schema}"."{table}"'
    return read_sql_query(
        sql=sql, con=con, index_col=index_col, params=params, chunksize=chunksize, dtype=dtype, safe=safe
    )
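
# Illustrative sketch (not part of the original module): memory-friendly reads by
# passing chunksize, so the result is an iterator of DataFrames. The connection
# and table names are placeholders.
def _demo_chunked_read() -> None:
    con = connect("MY_GLUE_CONNECTION")
    try:
        for chunk in read_sql_table(table="my_table", schema="public", con=con, chunksize=10_000):
            print(len(chunk.index))
    finally:
        con.close()
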
@apply_configs
def to_sql(
    df: pd.DataFrame,
    con: redshift_connector.Connection,
    table: str,
    schema: str,
    mode: str = "append",
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_column_names: bool = False,
    chunksize: int = 200,
) -> None:
    """Write records stored in a DataFrame into Redshift.

    Note
    ----
    For large DataFrames (1K+ rows) consider the function **wr.redshift.copy()**.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    table : str
        Table name.
    schema : str
        Schema name.
    mode : str
        Append, overwrite or upsert.
    index : bool
        True to store the DataFrame index as a column in the table, otherwise False to ignore it.
    dtype : Dict[str, str], optional
        Dictionary of column names and Redshift types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'VARCHAR(10)', 'col2 name': 'FLOAT'})
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by column. (e.g. {"col1": 10, "col5": 200}).
    use_column_names : bool
        If set to True, will use the column names of the DataFrame for generating the INSERT SQL query.
        E.g. if the DataFrame has two columns `col1` and `col3` and `use_column_names` is True,
        data will only be inserted into the database columns `col1` and `col3`.
    chunksize : int
        Number of rows which are inserted with each SQL query. Defaults to inserting 200 rows per query.

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing to Redshift using a Glue Catalog Connection

    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> wr.redshift.to_sql(
    ...     df=df,
    ...     table="my_table",
    ...     schema="public",
    ...     con=con
    ... )
    >>> con.close()

    """
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    _validate_connection(con=con)
    autocommit_temp: bool = con.autocommit
    con.autocommit = False
    try:
        with con.cursor() as cursor:
            created_table, created_schema = _create_table(
                df=df,
                path=None,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=index,
                dtype=dtype,
                diststyle=diststyle,
                sortstyle=sortstyle,
                distkey=distkey,
                sortkey=sortkey,
                primary_keys=primary_keys,
                varchar_lengths_default=varchar_lengths_default,
                varchar_lengths=varchar_lengths,
            )
            if index:
                df.reset_index(level=df.index.names, inplace=True)
            column_placeholders: str = ", ".join(["%s"] * len(df.columns))
            schema_str = f'"{created_schema}".' if created_schema else ""
            insertion_columns = ""
            if use_column_names:
                insertion_columns = f"({', '.join(df.columns)})"
            placeholder_parameter_pair_generator = _db_utils.generate_placeholder_parameter_pairs(
                df=df, column_placeholders=column_placeholders, chunksize=chunksize
            )
            for placeholders, parameters in placeholder_parameter_pair_generator:
                sql: str = f'INSERT INTO {schema_str}"{created_table}" {insertion_columns} VALUES {placeholders}'
                _logger.debug("sql: %s", sql)
                cursor.executemany(sql, (parameters,))
            if table != created_table:  # upsert through a temporary table
                _upsert(cursor=cursor, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
            con.commit()
    except Exception as ex:
        con.rollback()
        _logger.error(ex)
        raise
    finally:
        con.autocommit = autocommit_temp
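
# Illustrative sketch (not part of the original module): upsert mode relies on the
# temporary-table + DELETE/INSERT flow implemented by _create_table()/_upsert().
# The connection, table, schema and key names are placeholders.
def _demo_to_sql_upsert(df: pd.DataFrame) -> None:
    con = connect("MY_GLUE_CONNECTION")
    try:
        to_sql(
            df=df,
            con=con,
            table="my_table",
            schema="public",
            mode="upsert",
            primary_keys=["id"],
            use_column_names=True,
        )
    finally:
        con.close()
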
@apply_configs
def unload_to_files(
    sql: str,
    path: str,
    con: redshift_connector.Connection,
    iam_role: Optional[str] = None,
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None,
    region: Optional[str] = None,
    max_file_size: Optional[float] = None,
    kms_key_id: Optional[str] = None,
    manifest: bool = False,
    use_threads: bool = True,
    partition_cols: Optional[List[str]] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Unload Parquet files on S3 from a Redshift query result (through the UNLOAD command).

    https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    path : Union[str, List[str]]
        S3 path to write stage files (e.g. s3://bucket_name/any_name/).
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    iam_role : str, optional
        AWS IAM role with the related permissions.
    aws_access_key_id : str, optional
        The access key for your AWS account.
    aws_secret_access_key : str, optional
        The secret key for your AWS account.
    aws_session_token : str, optional
        The session key for your AWS account. This is only needed when you are using temporary credentials.
    region : str, optional
        Specifies the AWS Region where the target Amazon S3 bucket is located.
        REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the same AWS Region
        as the Amazon Redshift cluster. By default, UNLOAD assumes that the target Amazon S3
        bucket is located in the same AWS Region as the Amazon Redshift cluster.
    max_file_size : float, optional
        Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3.
        Specify a decimal value between 5.0 MB and 6200.0 MB.
        If None, the default maximum file size is 6200.0 MB.
    kms_key_id : str, optional
        Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be used to
        encrypt data files on Amazon S3.
    manifest : bool
        Unload a manifest file on S3.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    partition_cols : List[str], optional
        Specifies the partition keys for the unload operation.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None

    Examples
    --------
    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> wr.redshift.unload_to_files(
    ...     sql="SELECT * FROM public.mytable",
    ...     path="s3://bucket/extracted_parquet_files/",
    ...     con=con,
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )
    >>> con.close()

    """
    path = path if path.endswith("/") else f"{path}/"
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session)
    with con.cursor() as cursor:
        partition_str: str = f"\nPARTITION BY ({', '.join(partition_cols)})" if partition_cols else ""
        manifest_str: str = "\nmanifest" if manifest is True else ""
        region_str: str = f"\nREGION AS '{region}'" if region is not None else ""
        max_file_size_str: str = f"\nMAXFILESIZE AS {max_file_size} MB" if max_file_size is not None else ""
        kms_key_id_str: str = f"\nKMS_KEY_ID '{kms_key_id}'" if kms_key_id is not None else ""
        auth_str: str = _make_s3_auth_string(
            iam_role=iam_role,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            aws_session_token=aws_session_token,
            boto3_session=session,
        )
        unload_sql = (
            f"UNLOAD ('{sql}')\n"
            f"TO '{path}'\n"
            f"{auth_str}"
            "ALLOWOVERWRITE\n"
            "PARALLEL ON\n"
            "FORMAT PARQUET\n"
            "ENCRYPTED"
            f"{kms_key_id_str}"
            f"{partition_str}"
            f"{region_str}"
            f"{max_file_size_str}"
            f"{manifest_str};"
        )
        _logger.debug("sql: %s", unload_sql)
        cursor.execute(unload_sql)
@apply_configs
def unload(
    sql: str,
    path: str,
    con: redshift_connector.Connection,
    iam_role: Optional[str] = None,
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None,
    region: Optional[str] = None,
    max_file_size: Optional[float] = None,
    kms_key_id: Optional[str] = None,
    categories: Optional[List[str]] = None,
    keep_files: bool = False,
    chunked: Union[bool, int] = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Load a Pandas DataFrame from an Amazon Redshift query result using Parquet files on S3 as stage.

    This is a **HIGH** latency and **HIGH** throughput alternative to
    `wr.redshift.read_sql_query()`/`wr.redshift.read_sql_table()` to extract large
    Amazon Redshift data into a Pandas DataFrame through the **UNLOAD command**.

    This strategy has more overhead and requires more IAM privileges
    than the regular `wr.redshift.read_sql_query()`/`wr.redshift.read_sql_table()` functions,
    so it is only recommended to fetch 1K+ rows at once.

    https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html

    Note
    ----
    ``Batching`` (`chunked` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies in Wrangler:

    - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.
    - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows equal to the received INTEGER.

    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more
    precise in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    path : Union[str, List[str]]
        S3 path to write stage files (e.g. s3://bucket_name/any_name/).
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    iam_role : str, optional
        AWS IAM role with the related permissions.
    aws_access_key_id : str, optional
        The access key for your AWS account.
    aws_secret_access_key : str, optional
        The secret key for your AWS account.
    aws_session_token : str, optional
        The session key for your AWS account. This is only needed when you are using temporary credentials.
    region : str, optional
        Specifies the AWS Region where the target Amazon S3 bucket is located.
        REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the same AWS Region
        as the Amazon Redshift cluster. By default, UNLOAD assumes that the target Amazon S3
        bucket is located in the same AWS Region as the Amazon Redshift cluster.
    max_file_size : float, optional
        Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3.
        Specify a decimal value between 5.0 MB and 6200.0 MB.
        If None, the default maximum file size is 6200.0 MB.
    kms_key_id : str, optional
        Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be used to
        encrypt data files on Amazon S3.
    categories : List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    keep_files : bool
        Should keep stage files?
    chunked : Union[int, bool]
        If passed will split the data in an Iterable of DataFrames (memory friendly).
        If `True` Wrangler will iterate on the data by files in the most efficient way
        without guarantee of chunksize.
        If an `INTEGER` is passed Wrangler will iterate on the data by number of rows
        equal to the received INTEGER.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs:
        Forwarded to botocore requests; only "SSECustomerAlgorithm" and "SSECustomerKey"
        arguments will be considered.

    Returns
    -------
    Union[pandas.DataFrame, Iterator[pandas.DataFrame]]
        Result as Pandas DataFrame(s).

    Examples
    --------
    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> df = wr.redshift.unload(
    ...     sql="SELECT * FROM public.mytable",
    ...     path="s3://bucket/extracted_parquet_files/",
    ...     con=con,
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )
    >>> con.close()

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    unload_to_files(
        sql=sql,
        path=path,
        con=con,
        iam_role=iam_role,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token,
        region=region,
        max_file_size=max_file_size,
        kms_key_id=kms_key_id,
        manifest=False,
        use_threads=use_threads,
        boto3_session=session,
    )
    if chunked is False:
        df: pd.DataFrame = s3.read_parquet(
            path=path,
            categories=categories,
            chunked=False,
            dataset=False,
            use_threads=use_threads,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
        if keep_files is False:
            s3.delete_objects(
                path=path, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs
            )
        return df
    return _read_parquet_iterator(
        path=path,
        categories=categories,
        chunked=chunked,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
        keep_files=keep_files,
    )
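
# Illustrative sketch (not part of the original module): a chunked unload keeps
# memory bounded by iterating over the staged Parquet files. The bucket, role
# and query below are placeholders.
def _demo_chunked_unload() -> None:
    con = connect("MY_GLUE_CONNECTION")
    try:
        chunks = unload(
            sql="SELECT * FROM public.my_big_table",
            path="s3://my-bucket/stage/",
            con=con,
            iam_role="arn:aws:iam::123456789012:role/redshift-unload",
            chunked=True,
            keep_files=False,
        )
        for chunk in chunks:
            print(len(chunk.index))
    finally:
        con.close()
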
@apply_configs
def copy_from_files(
    path: str,
    con: redshift_connector.Connection,
    table: str,
    schema: str,
    iam_role: Optional[str] = None,
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None,
    parquet_infer_sampling: float = 1.0,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    serialize_to_json: bool = False,
    path_suffix: Optional[str] = None,
    path_ignore_suffix: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> None:
    """Load Parquet files from S3 to a table on Amazon Redshift (through the COPY command).

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    Note
    ----
    If the table does not exist yet, it will be automatically created for you
    using the Parquet metadata to infer the columns data types.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    path : str
        S3 prefix (e.g. s3://bucket/prefix/).
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    table : str
        Table name.
    schema : str
        Schema name.
    iam_role : str, optional
        AWS IAM role with the related permissions.
    aws_access_key_id : str, optional
        The access key for your AWS account.
    aws_secret_access_key : str, optional
        The secret key for your AWS account.
    aws_session_token : str, optional
        The session key for your AWS account. This is only needed when you are using temporary credentials.
    parquet_infer_sampling : float
        Random sample ratio of files that will have the metadata inspected.
        Must be `0.0 < sampling <= 1.0`.
        The higher, the more accurate. The lower, the faster.
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by column. (e.g. {"col1": 10, "col5": 200}).
    serialize_to_json : bool
        Should Wrangler add the SERIALIZETOJSON parameter to the COPY command?
        SERIALIZETOJSON is necessary to load nested data.
        https://docs.aws.amazon.com/redshift/latest/dg/ingest-super.html#copy_json
    path_suffix : Union[str, List[str], None]
        Suffix or list of suffixes to be scanned on S3 for the schema extraction
        (e.g. [".gz.parquet", ".snappy.parquet"]).
        Only has effect during the table creation.
        If None, will try to read all files. (default)
    path_ignore_suffix : Union[str, List[str], None]
        Suffix or list of suffixes for S3 keys to be ignored during the schema extraction
        (e.g. [".csv", "_SUCCESS"]).
        Only has effect during the table creation.
        If None, will try to read all files. (default)
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs:
        Forwarded to botocore requests.
        Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext",
        "Tagging", "RequestPayer", "ExpectedBucketOwner".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> wr.redshift.copy_from_files(
    ...     path="s3://bucket/my_parquet_files/",
    ...     con=con,
    ...     table="my_table",
    ...     schema="public",
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )
    >>> con.close()

    """
    autocommit_temp: bool = con.autocommit
    con.autocommit = False
    try:
        with con.cursor() as cursor:
            created_table, created_schema = _create_table(
                df=None,
                path=path,
                parquet_infer_sampling=parquet_infer_sampling,
                path_suffix=path_suffix,
                path_ignore_suffix=path_ignore_suffix,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=False,
                dtype=None,
                diststyle=diststyle,
                sortstyle=sortstyle,
                distkey=distkey,
                sortkey=sortkey,
                primary_keys=primary_keys,
                varchar_lengths_default=varchar_lengths_default,
                varchar_lengths=varchar_lengths,
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
            _copy(
                cursor=cursor,
                path=path,
                table=created_table,
                schema=created_schema,
                iam_role=iam_role,
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                aws_session_token=aws_session_token,
                boto3_session=boto3_session,
                serialize_to_json=serialize_to_json,
            )
            if table != created_table:  # upsert through a temporary table
                _upsert(cursor=cursor, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
            con.commit()
    except Exception as ex:
        con.rollback()
        _logger.error(ex)
        raise
    finally:
        con.autocommit = autocommit_temp
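
# Illustrative sketch (not part of the original module): SERIALIZETOJSON is needed
# when the staged Parquet files carry nested data destined for SUPER columns.
# The bucket, role, table and schema names below are placeholders.
def _demo_copy_nested_from_files() -> None:
    con = connect("MY_GLUE_CONNECTION")
    try:
        copy_from_files(
            path="s3://my-bucket/nested_parquet/",
            con=con,
            table="my_nested_table",
            schema="public",
            iam_role="arn:aws:iam::123456789012:role/redshift-copy",
            serialize_to_json=True,
        )
    finally:
        con.close()
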
@apply_configs
def copy(
    df: pd.DataFrame,
    path: str,
    con: redshift_connector.Connection,
    table: str,
    schema: str,
    iam_role: Optional[str] = None,
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None,
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    serialize_to_json: bool = False,
    keep_files: bool = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
    max_rows_by_file: Optional[int] = 10_000_000,
) -> None:
    """Load a Pandas DataFrame as a table on Amazon Redshift using Parquet files on S3 as stage.

    This is a **HIGH** latency and **HIGH** throughput alternative to `wr.redshift.to_sql()` to load large
    DataFrames into Amazon Redshift through the **SQL COPY command**.

    This strategy has more overhead and requires more IAM privileges
    than the regular `wr.redshift.to_sql()` function, so it is only recommended
    to insert 1K+ rows at once.

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    Note
    ----
    If the table does not exist yet, it will be automatically created for you
    using the Parquet metadata to infer the columns data types.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame.
    path : str
        S3 path to write stage files (e.g. s3://bucket_name/any_name/).
        Note: This path must be empty.
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    table : str
        Table name.
    schema : str
        Schema name.
    iam_role : str, optional
        AWS IAM role with the related permissions.
    aws_access_key_id : str, optional
        The access key for your AWS account.
    aws_secret_access_key : str, optional
        The secret key for your AWS account.
    aws_session_token : str, optional
        The session key for your AWS account. This is only needed when you are using temporary credentials.
    index : bool
        True to store the DataFrame index in the files, otherwise False to ignore it.
    dtype : Dict[str, str], optional
        Dictionary of column names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        Only takes effect if dataset=True. (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by column. (e.g. {"col1": 10, "col5": 200}).
    serialize_to_json : bool
        Should Wrangler add the SERIALIZETOJSON parameter to the COPY command?
        SERIALIZETOJSON is necessary to load nested data.
        https://docs.aws.amazon.com/redshift/latest/dg/ingest-super.html#copy_json
    keep_files : bool
        Should keep stage files?
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs:
        Forwarded to botocore requests.
        Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext",
        "Tagging", "RequestPayer", "ExpectedBucketOwner".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    max_rows_by_file : int, optional
        Max number of rows in each staged file. (e.g. 33554432, 268435456)

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> wr.redshift.copy(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path="s3://bucket/my_parquet_files/",
    ...     con=con,
    ...     table="my_table",
    ...     schema="public",
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )
    >>> con.close()

    """
    path = path[:-1] if path.endswith("*") else path
    path = path if path.endswith("/") else f"{path}/"
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if s3.list_objects(path=path, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs):
        raise exceptions.InvalidArgument(
            f"The received S3 path ({path}) is not empty. "
            "Please, provide a different path or use wr.s3.delete_objects() to clean up the current one."
        )
    try:
        s3.to_parquet(
            df=df,
            path=path,
            index=index,
            dataset=True,
            mode="append",
            dtype=dtype,
            use_threads=use_threads,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
            max_rows_by_file=max_rows_by_file,
        )
        copy_from_files(
            path=path,
            con=con,
            table=table,
            schema=schema,
            iam_role=iam_role,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            aws_session_token=aws_session_token,
            mode=mode,
            diststyle=diststyle,
            distkey=distkey,
            sortstyle=sortstyle,
            sortkey=sortkey,
            primary_keys=primary_keys,
            varchar_lengths_default=varchar_lengths_default,
            varchar_lengths=varchar_lengths,
            serialize_to_json=serialize_to_json,
            use_threads=use_threads,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    finally:
        if keep_files is False:
            s3.delete_objects(
                path=path, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs
            )
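
# Illustrative sketch (not part of the original module): a typical high-throughput
# round trip, staging a DataFrame with copy() and pulling it back with unload().
# The bucket, role, table and schema names below are placeholders.
def _demo_copy_unload_roundtrip() -> None:
    con = connect("MY_GLUE_CONNECTION")
    try:
        copy(
            df=pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]}),
            path="s3://my-bucket/stage-in/",
            con=con,
            table="my_table",
            schema="public",
            iam_role="arn:aws:iam::123456789012:role/redshift-copy-unload",
            mode="overwrite",
        )
        df = unload(
            sql="SELECT * FROM public.my_table",
            path="s3://my-bucket/stage-out/",
            con=con,
            iam_role="arn:aws:iam::123456789012:role/redshift-copy-unload",
        )
        print(df)
    finally:
        con.close()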