"""AWS Lambda entry point that forwards recent RSS feed items to a webhook."""

import json
import os
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from datetime import timedelta

import pytz
import requests

# Compiled once at import time instead of on every clean_html() call.
_TAG_RE = re.compile(r'<.*?>')
# NOTE(review): the original second pattern was a single blank character,
# which (if it is a plain space) deletes every space in the description.
# It looks like non-breaking-space residue was the target, so both spellings
# are matched explicitly here -- confirm against the real feeds.
_NBSP_RE = re.compile('&nbsp;|\u00a0')

# pubDate layout expected from the feeds, e.g. "Mon, 01 Jan 2024 10:00:00 +0000".
_PUB_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'

# Without a timeout, requests waits indefinitely on an unresponsive server.
_REQUEST_TIMEOUT_SECONDS = 10


def clean_html(raw_html):
    """Return *raw_html* with markup tags removed.

    Anything matching ``<...>`` (non-greedy, single line) is dropped, and
    ``&nbsp;`` entities / U+00A0 characters are turned into ordinary spaces.
    ``None`` or an empty string yields ``''``.
    """
    if not raw_html:
        return ''
    without_tags = _TAG_RE.sub('', raw_html)
    return _NBSP_RE.sub(' ', without_tags)


def _select_feed_urls(blog_spec, feed_pages):
    """Map a comma-separated list of feed numbers to de-duplicated feed URLs."""
    urls = []
    for token in blog_spec.split(","):
        # Strip before validating so "1, 2" selects both feeds.
        token = token.strip()
        # isdecimal() only accepts characters that int() can actually parse.
        if not token.isdecimal():
            continue
        url = feed_pages.get(int(token))
        # Unknown numbers and blank placeholder URLs are skipped.
        if url:
            urls.append(url)
    # dict.fromkeys removes duplicates while keeping a stable order.
    return list(dict.fromkeys(urls))


def _parse_pub_date(pub_date_text):
    """Parse a feed pubDate into an aware datetime, or return None if invalid."""
    try:
        return datetime.strptime(pub_date_text.strip(), _PUB_DATE_FORMAT)
    except ValueError:
        return None


def _build_payload(title, pub_date_text, description, link):
    """Serialize one feed item as the ``{"Content": ...}`` JSON webhook body."""
    content = "\n\n".join(["BLOG", title, pub_date_text, description, link])
    # json.dumps escapes quotes, backslashes and control characters that the
    # previous hand-built string passed through verbatim (producing invalid
    # JSON). Compact separators keep the output identical for clean input.
    return json.dumps(
        {"Content": content}, ensure_ascii=False, separators=(",", ":"))


def lambda_handler(event, context):
    """Post every feed item published within the last day to the webhook.

    Reads ``WEBHOOK_URL`` (required) and ``BLOGS`` (optional comma-separated
    feed numbers; unset, empty or ``"0"`` selects the built-in default list)
    from the environment. ``event`` and ``context`` are not used.

    Returns:
        The string ``"Done"``.

    Raises:
        KeyError: if ``WEBHOOK_URL`` is not set.
        requests.RequestException: if a feed fetch or webhook post fails.
        xml.etree.ElementTree.ParseError: if a feed is not well-formed XML.
    """
    # The below is done in such a way because it was easier for me to test.
    # NOTE(review): every URL is blank in this revision; blank entries are
    # skipped, so nothing is fetched until they are filled in.
    FEED_PAGES = {
        1: "", 2: "", 3: "", 4: "", 5: "", 6: "", 7: "", 8: "", 9: "",
        10: "", 11: "", 12: "", 13: "", 14: "", 15: "", 16: "", 17: "",
        18: "", 19: "", 20: "", 21: "", 22: "", 23: "", 24: "", 25: "",
        26: "", 27: "",
    }
    POST_HEADERS = {"Content-Type": "application/json"}
    GET_HEADERS = {"Accept": "application/xml",
                   "Content-Type": "application/xml"}
    ADDRESS = os.environ['WEBHOOK_URL']

    # .get() so an unset BLOGS falls back to the default instead of raising.
    LIST_OF_BLOGS = os.environ.get('BLOGS', '')
    if LIST_OF_BLOGS == "0" or not LIST_OF_BLOGS:
        LIST_OF_BLOGS = "1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,27,28"

    blog_urls = _select_feed_urls(LIST_OF_BLOGS, FEED_PAGES)

    # Timezone-aware cutoff: pubDate is parsed with %z and is therefore aware,
    # and aware/naive datetimes cannot be compared.
    yesterday_datetime = datetime.now(pytz.utc) - timedelta(days=1)

    for url in blog_urls:
        feed = requests.get(url, headers=GET_HEADERS,
                            timeout=_REQUEST_TIMEOUT_SECONDS)
        # Parse the raw bytes so the XML declaration decides the encoding
        # rather than requests' guess from the HTTP headers.
        root = ET.fromstring(feed.content)
        for entry in root.iter('item'):
            pub_date_text = entry.findtext('pubDate', default='')
            published_datetime = _parse_pub_date(pub_date_text)
            if published_datetime is None:
                print("Skipping item with unparseable pubDate: "
                      + repr(pub_date_text))
                continue
            if published_datetime < yesterday_datetime:
                continue

            payload = _build_payload(
                entry.findtext('title', default=''),
                pub_date_text,
                clean_html(entry.findtext('description', default='')),
                entry.findtext('link', default=''),
            )
            response = requests.post(ADDRESS, data=payload.encode('utf-8'),
                                     headers=POST_HEADERS,
                                     timeout=_REQUEST_TIMEOUT_SECONDS)
            print(response.status_code)
            print()
            # Pause between posts, as the original did.
            time.sleep(1)
    return "Done"