""" Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ """ This is an example dag orchestrating Glue Job. """ import os from datetime import datetime, timedelta from airflow import DAG from airflow.utils.dates import days_ago from airflow.operators.python_operator import PythonOperator from airflow.operators.bash import BashOperator from airflow.providers.amazon.aws.operators.s3_copy_object import S3CopyObjectOperator from airflow.providers.amazon.aws.operators.glue_crawler import AwsGlueCrawlerOperator import os from airflow.models import Variable from airflow.utils.task_group import TaskGroup from airflow.providers.amazon.aws.operators.glue import AwsGlueJobOperator # [START howto_operator_emr_eks_env_variables] import json RAW_ZONE_PREFIX = "rawzone" DATA_BUCKET = Variable.get("DATA_BUCKET") GLUE_SERVICE_ROLE_ARN = Variable.get("GLUE_SERVICE_ROLE_ARN") GLUE_SERVICE_ROLE_NAME = Variable.get("GLUE_SERVICE_ROLE_NAME") default_args = { 'owner': 'Airflow', 'depends_on_past': False, 'catchup': False, 'start_date': datetime(2022, 1, 1), 'retries': 3, 'retry_delay': timedelta(minutes=1) } config = { "Name": "noaa-weather-station-data", "Role":GLUE_SERVICE_ROLE_ARN, "DatabaseName": "default", "Targets": {"S3Targets":[{"Path":f"s3://{DATA_BUCKET}/{RAW_ZONE_PREFIX}/"}]}, "Configuration": "{\"Version\": 1.0,\"Grouping\": {\"TableGroupingPolicy\": \"CombineCompatibleSchemas\" } }" } with DAG( dag_id="noaa_weather_station_data", schedule_interval=None, default_args=default_args, ) as dag: YEARS=["2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021","2022"] # YEARS=["2015","2016"] get_weather_station_lookup_data = S3CopyObjectOperator( task_id="weather_station_lookup_data", source_bucket_key ='ghcnd-stations.txt', dest_bucket_key =f'{RAW_ZONE_PREFIX}/station_lookup.txt', source_bucket_name ='noaa-ghcn-pds', dest_bucket_name = '{{ var.value.DATA_BUCKET }}' ) with TaskGroup("weather_data", tooltip="Tasks for getting climatology data") as weather_data: for year in YEARS: ## This task will also increase the memory utilization because of the file size. 
get_weather_station_data = S3CopyObjectOperator( task_id=year, # source_bucket_key=f'csv.gz/{year}.csv.gz', Using csv so I can simulate auto scaling source_bucket_key=f'csv/by_year/{year}.csv', dest_bucket_key=f'{RAW_ZONE_PREFIX}/year={year}/data.csv', source_bucket_name='noaa-ghcn-pds', dest_bucket_name= '{{ var.value.DATA_BUCKET }}' ) glue_crawler = AwsGlueCrawlerOperator( task_id="create_weather_reading_table", config=config, ) glue_task = AwsGlueJobOperator( task_id="build_dataUS_parquet", job_name='noaa_weatherdata_transform', script_location=f"s3://{DATA_BUCKET}/scripts/noaa_weatherdata_transform.py", iam_role_name= GLUE_SERVICE_ROLE_NAME, concurrent_run_limit=2, retry_limit=1, s3_bucket=f"s3://{DATA_BUCKET}/logs/gluejob/", script_args={ '--DATA_BUCKET': DATA_BUCKET, '--YEARS': json.dumps(YEARS) }, create_job_kwargs= { "GlueVersion":"3.0" } ) get_weather_station_lookup_data >> glue_crawler weather_data >> glue_crawler glue_crawler >> glue_task
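
# For reference, a minimal sketch of how the Glue job could consume the
# script_args passed above. This is hypothetical: the real transform script
# lives at s3://<DATA_BUCKET>/scripts/noaa_weatherdata_transform.py and is not
# shown here.
#
#     import sys
#     import json
#     from awsglue.utils import getResolvedOptions
#
#     args = getResolvedOptions(sys.argv, ['DATA_BUCKET', 'YEARS'])
#     data_bucket = args['DATA_BUCKET']   # plain string
#     years = json.loads(args['YEARS'])   # list, serialized with json.dumps above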
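
# The DAG expects three Airflow Variables. One way to create them (the bucket
# and role values below are placeholders) and then trigger the DAG, since
# schedule_interval=None means it only runs on demand:
#
#     airflow variables set DATA_BUCKET my-data-bucket
#     airflow variables set GLUE_SERVICE_ROLE_ARN arn:aws:iam::111122223333:role/MyGlueServiceRole
#     airflow variables set GLUE_SERVICE_ROLE_NAME MyGlueServiceRole
#     airflow dags trigger noaa_weather_station_data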