{ "AWSTemplateFormatVersion" : "2010-09-09", "Description" : "This CloudFormation template redshift_spectrum_analytics lab launches the redshift instances in the default VPC and assign the required roles for testing spectrum feature. It will also create Glue catalog database and use crawlers for infering schema from NY taxi data located in S3 bucket. You will be billed for the AWS resources used if you create a stack from this template", "Parameters" : { "ClusterType": { "Description": "The type of the cluster", "Type": "String", "Default": "single-node", "AllowedValues": [ "single-node", "multi-node" ], "ConstraintDescription" : "must be single-node or multi-node." }, "NumberOfNodes": { "Description": "The nuber of compute nodes in the redshift cluster. When cluster type is specified as: 1) single-node, the NumberOfNodes parameter should be specified as 1, 2) multi-node, the NumberOfNodes parameter should be greater than 1", "Type": "Number", "Default": "1" }, "NodeType": { "Description": "The node type to be provisioned for the redshift cluster", "Type": "String", "Default": "dc1.large", "AllowedValues" : [ "dc1.large", "ds2.xlarge"], "ConstraintDescription" : "must be a valid RedShift node type." }, "MasterUsername": { "Description": "The user name associated with the master user account for the redshift cluster that is being created", "Type": "String", "Default": "dbadmin", "AllowedPattern": "([a-z])([a-z]|[0-9])*", "NoEcho": "false", "ConstraintDescription" : "must start with a-z and contain only a-z or 0-9." }, "MasterUserPassword": { "Description": "The password associated with the master user account for the redshift cluster that is being created. ", "Type": "String", "NoEcho": "true", "MinLength": "1", "MaxLength": "41", "AllowedPattern" : "[a-zA-Z0-9]*", "ConstraintDescription" : "must contain only alphanumeric characters." }, "DatabaseName": { "Description": "The name of the database to be created when the redshift cluster is created", "Type": "String", "Default": "taxidb", "AllowedPattern": "([a-z]|[0-9])+", "ConstraintDescription" : "must contain a-z or 0-9 only." }, "GlueCatalogDBName": { "Description": "The DB name for Glue Catalog. You can create a external schema using this database in Redshift spectrum and query the tables that are created by the crawlers", "Type": "String", "MinLength": "4", "MaxLength": "255", "Default": "taxi-spectrum-db", "AllowedPattern" : "[a-zA-Z0-9_-]*" , "ConstraintDescription" : "must contain only alphanumeric characters." }, "CSVCrawler": { "Description": "Crawler Name for CSV File. This crawler will crawls the CSV file located in s3://us-west-2.serverless-analytics/NYC-transportation/taxi/ and infer the schema", "Type": "String", "MinLength": "4", "MaxLength": "255", "Default": "csvCrawler", "AllowedPattern" : "[a-zA-Z0-9_-]*" , "ConstraintDescription" : "must contain only alphanumeric characters." }, "ParquetCrawler": { "Description": "Crawler Name for Parquer files. This will crawls s3://us-west-2.serverless-analytics/canonical/NY-Pub files and infer the schema", "Type": "String", "MinLength": "4", "MaxLength": "255", "Default": "parquetCrawler", "AllowedPattern" : "[a-zA-Z0-9_-]*" , "ConstraintDescription" : "must contain only alphanumeric characters." }, "ClientIP" : { "Description" : "The IP address range that can be used to connect to the RedShift instance from your local machine either directly or via pgWeb.It must be a valid IP CIDR range of the form x.x.x.x/x.Pls get your address using checkip.amazonaws.com or whatsmyip.org", "Type": "String", "MinLength": "9", "MaxLength": "18", "Default": "0.0.0.0/0", "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", "ConstraintDescription": "It must be a valid IP CIDR range of the form x.x.x.x/x. Suggest to enable access to your IP address only. Pls get your address using checkip.amazonaws.com or whatsmyip.org." } }, "Metadata" : { "AWS::CloudFormation::Interface" : { "ParameterGroups" : [ { "Label" : { "default" : "Redshift Cluster Configuration" }, "Parameters" : [ "ClusterType", "NumberOfNodes","NodeType" ] }, { "Label" : { "default":"Redshift Database Configuration" }, "Parameters" : ["MasterUsername","MasterUserPassword", "DatabaseName" ] }, { "Label" : { "default" : "Enter IP address for the Redshift Security group configuration" }, "Parameters" : [ "ClientIP" ] }, { "Label" : { "default" : "Glue Crawler Configurations" }, "Parameters" : [ "GlueCatalogDBName", "CSVCrawler","ParquetCrawler" ] } ] } }, "Conditions": { "IsMultiNodeCluster": { "Fn::Equals": [ { "Ref": "ClusterType" }, "multi-node" ] } }, "Mappings" : { "AMIid" : { "us-east-1" : { "ver" : "ami-6057e21a" }, "us-east-2" : { "ver" : "ami-5ec1673e" }, "us-west-2" : { "ver" : "ami-32d8124a" }, "eu-west-1" : { "ver" : "ami-760aaa0f" } } }, "Resources" : { "RedshiftSpectrumRole" : { "Type" : "AWS::IAM::Role", "Properties" : { "ManagedPolicyArns" : ["arn:aws:iam::aws:policy/AmazonAthenaFullAccess","arn:aws:iam::aws:policy/AmazonS3FullAccess"], "AssumeRolePolicyDocument" : { "Version" : "2012-10-17", "Statement" : [ { "Effect" : "Allow", "Principal" : { "Service" : "redshift.amazonaws.com" }, "Action" : "sts:AssumeRole" } ] } } }, "RedshiftCluster" : { "Type" : "AWS::Redshift::Cluster", "Properties" : { "ClusterType" : { "Ref": "ClusterType" }, "NumberOfNodes": { "Fn::If": [ "IsMultiNodeCluster", { "Ref": "NumberOfNodes" }, { "Ref": "AWS::NoValue" } ] }, "NodeType": { "Ref": "NodeType" }, "DBName" : { "Ref" : "DatabaseName" }, "MasterUsername" : { "Ref" : "MasterUsername" }, "MasterUserPassword" : { "Ref" : "MasterUserPassword" }, "PubliclyAccessible" : "true", "VpcSecurityGroupIds" : [ { "Fn::GetAtt": [ "RedshiftSecurityGroup" , "GroupId" ] } ], "Port" : "5439", "IamRoles": [ { "Fn::GetAtt" : ["RedshiftSpectrumRole", "Arn"] } ] } }, "RedshiftSecurityGroup": { "Type": "AWS::EC2::SecurityGroup", "Properties": { "GroupDescription": "This is the security group for Redshift", "SecurityGroupIngress" : [ { "IpProtocol" : "tcp", "FromPort" : "5439", "ToPort" : "5439", "CidrIp" : { "Ref" : "ClientIP"} }, { "IpProtocol" : "tcp", "FromPort" : "5439", "ToPort" : "5439", "CidrIp" : "172.31.0.0/16" } ] } }, "PGWebInstance": { "Type": "AWS::EC2::Instance", "Properties": { "ImageId": {"Fn::FindInMap" : [ "AMIid", { "Ref" : "AWS::Region" }, "ver" ]}, "InstanceType": "m3.medium", "SecurityGroups" : [ { "Ref": "PGWebSecurityGroup" } ], "Tags": [{ "Key": "Name", "Value": "PGWeb" }], "UserData": { "Fn::Base64": { "Fn::Join": [ "", [ "#!/bin/bash -xe\n", "wget https://github.com/sosedoff/pgweb/releases/download/v0.9.6/pgweb_linux_386.zip\n", "unzip pgweb_linux_386.zip\n", "/pgweb_linux_386 --bind 0.0.0.0 --listen 80", " --port ", { "Fn::GetAtt" : [ "RedshiftCluster", "Endpoint.Port" ] }, " --host ", { "Fn::GetAtt" : [ "RedshiftCluster", "Endpoint.Address" ] }, " --user ",{ "Ref" : "MasterUsername" }, " --pass ", { "Ref" : "MasterUserPassword" }, " --db taxidb &" ]]}} } }, "PGWebSecurityGroup": { "Type": "AWS::EC2::SecurityGroup", "Properties": { "GroupDescription": "This is the security group to allow web access", "SecurityGroupIngress" : [ { "IpProtocol" : "tcp", "FromPort" : "80", "ToPort" : "80", "CidrIp" : { "Ref" : "ClientIP"} } ] } }, "GlueParseS3Role": { "Type": "AWS::IAM::Role", "Properties": { "ManagedPolicyArns" : ["arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"], "AssumeRolePolicyDocument": { "Version": "2012-10-17", "Statement": [ { "Effect": "Allow", "Principal": { "Service": [ "glue.amazonaws.com" ] }, "Action": [ "sts:AssumeRole" ] } ] }, "Path": "/service-role/", "Policies": [ { "PolicyName": "Glue-access-taxi-data", "PolicyDocument": { "Version": "2012-10-17", "Statement": [ { "Effect": "Allow", "Action": [ "s3:GetObject", "s3:PutObject" ], "Resource": [ "arn:aws:s3:::us-west-2.serverless-analytics/NYC-transportation/taxi/*", "arn:aws:s3:::us-west-2.serverless-analytics/canonical/NY-Pub/*" ] } ] } }] } }, "GlueDatabase": { "Type": "AWS::Glue::Database", "Properties": { "CatalogId": { "Ref": "AWS::AccountId" }, "DatabaseInput": { "Name": { "Ref" : "GlueCatalogDBName" }, "Description": "Database used for Redshift Spectrum" } } }, "MyCrawler2": { "Type": "AWS::Glue::Crawler", "Properties": { "Name": { "Ref" : "ParquetCrawler" }, "Role": { "Fn::GetAtt": [ "GlueParseS3Role", "Arn"] }, "DatabaseName": {"Ref": "GlueDatabase"}, "Targets": { "S3Targets": [ { "Path": "s3://us-west-2.serverless-analytics/canonical/NY-Pub" } ] }, "SchemaChangePolicy": { "UpdateBehavior": "UPDATE_IN_DATABASE", "DeleteBehavior": "LOG" } } }, "MyCrawler1": { "Type": "AWS::Glue::Crawler", "Properties": { "Name": { "Ref" : "CSVCrawler" }, "Role": { "Fn::GetAtt": [ "GlueParseS3Role", "Arn"] }, "DatabaseName": { "Ref": "GlueDatabase" }, "Targets": { "S3Targets": [ { "Path": "s3://us-west-2.serverless-analytics/NYC-transportation/taxi/" } ] }, "SchemaChangePolicy": { "UpdateBehavior": "UPDATE_IN_DATABASE", "DeleteBehavior": "LOG" } } } }, "Outputs": { "redshiftHost" : { "Value": { "Fn::GetAtt" : [ "RedshiftCluster", "Endpoint.Address" ]} }, "redshiftPort" : { "Value": { "Fn::GetAtt" : [ "RedshiftCluster", "Endpoint.Port" ]} }, "redshiftUser" : { "Value": { "Ref" : "MasterUsername" } }, "redshiftDatabase" : { "Value": { "Ref" : "DatabaseName" } }, "redshiftEndpoint" : { "Value": { "Fn::Join": [ ":", [ { "Fn::GetAtt": [ "RedshiftCluster", "Endpoint.Address" ] }, { "Fn::GetAtt": [ "RedshiftCluster", "Endpoint.Port" ] } ] ] }}, "pgWeb" : { "Value": { "Fn::Join" : [ "", [ "http://", { "Fn::GetAtt" : [ "PGWebInstance", "PublicDnsName" ]} ]] } }, "redshiftIAMRole" : { "Value": { "Fn::GetAtt" : ["RedshiftSpectrumRole", "Arn"] } }, "GlueCatalogDBName" : { "Value": {"Ref" : "GlueCatalogDBName" } }, "CSVCrawlerName" : { "Value": { "Ref" : "CSVCrawler" } }, "ParquetCrawlerName" : { "Value": { "Ref" : "ParquetCrawler" } } } }