"""Lambda handler that copies an S3 document and extracts its article tags.

For each invocation the handler reads the object named in the event, writes
an unmodified copy under the ``cde_output/`` key prefix, parses the original
bytes as HTML, and returns the ``content`` values of every
``<meta property="article:tag" ...>`` element as an ``ARTICLE_TAGS`` metadata
update.

NOTE(review): the event keys (``s3Bucket``, ``s3ObjectKey``) and the response
shape (``version``, ``metadataUpdates``, ``stringListValue``) look like the
Amazon Kendra Custom Document Enrichment Lambda contract -- confirm.
"""

import json
import logging
from html.parser import HTMLParser

import boto3

logger = logging.getLogger()
logger.setLevel(logging.INFO)

s3 = boto3.client('s3')


class MetaHTMLParser(HTMLParser):
    """HTML parser that collects ``article:tag`` meta values.

    After feeding a document, ``article_tags`` holds the ``content`` value of
    every ``<meta>`` start tag that carries ``property="article:tag"``, in
    document order.
    """

    def reset(self):
        """Reset parser state, including the collected tags.

        ``HTMLParser.__init__`` calls ``reset()``, so this also initialises
        ``article_tags`` on construction.
        """
        super().reset()
        self.article_tags = []

    def handle_starttag(self, tag, attrs):
        """Record the ``content`` of a ``<meta property="article:tag">`` tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs for the tag.
        """
        if tag != "meta" or ("property", "article:tag") not in attrs:
            return

        # A valueless attribute (``<meta ... content>``) is reported with a
        # value of None; skip it so only real strings reach the response.
        found = [
            value
            for name, value in attrs
            if name == "content" and value is not None
        ]
        if found:
            self.article_tags.extend(found)
            logger.info("Found article tags: %s", self.article_tags)


def _extract_article_tags(html):
    """Return the ``article:tag`` meta values found in *html*.

    A fresh parser is used per call so that unterminated markup buffered
    while parsing one document can never leak into the next one.
    """
    parser = MetaHTMLParser()
    parser.feed(html)
    # Flush anything still buffered, as if end-of-file had been reached.
    parser.close()
    return parser.article_tags


def lambda_handler(event, context):
    """Copy the referenced S3 object and report its article tags.

    Args:
        event: Mapping read for the ``s3Bucket`` and ``s3ObjectKey`` keys.
        context: Lambda context object (unused).

    Returns:
        A dict with ``version`` ``"v0"``, the ``s3ObjectKey`` of the copy
        written under ``cde_output/``, and a ``metadataUpdates`` list holding
        a single ``ARTICLE_TAGS`` string-list entry.
    """
    logger.info("Received event: %s", json.dumps(event))

    bucket = event.get("s3Bucket")
    source_key = event.get("s3ObjectKey")

    source_object = s3.get_object(Bucket=bucket, Key=source_key)
    before_cde = source_object['Body'].read()

    # The document body is passed through unchanged for now.
    after_cde = before_cde
    new_key = 'cde_output/' + source_key
    s3.put_object(Bucket=bucket, Key=new_key, Body=after_cde)

    # Undecodable bytes are replaced rather than raised on, so a document
    # that is not valid UTF-8 yields whatever tags can still be parsed
    # instead of failing after the copy has already been written.
    tags = _extract_article_tags(before_cde.decode("utf-8", errors="replace"))

    metadata_updates = [
        {
            "name": "ARTICLE_TAGS",
            "value": {
                "stringListValue": tags,
            },
        },
    ]
    return {
        "version": "v0",
        "s3ObjectKey": new_key,
        "metadataUpdates": metadata_updates,
    }