from __future__ import print_function
from decimal import Decimal
import csv
import urllib, hashlib
from bs4 import BeautifulSoup
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import zipfile
import boto3, botocore
import StringIO
import pywren
import os

S3BUCKET = 'pywren-workshop'


def news_analyzer(links):
    # Score each article's sentiment and most frequent words, then write one record per link to DynamoDB.
    try:
        download_nltk_data()
        dynamo_tbl = boto3.resource('dynamodb', 'us-west-2').Table('pywren-workshop-gdelt-table')
        sid = SentimentIntensityAnalyzer()  # create the analyzer once per batch of links
        for link in links:
            text = scrape_content(link)
            sentiment = sid.polarity_scores(text)
            record = {}
            record['link'] = link
            record['sentiment'] = str(sentiment['compound'])
            words = []
            for word, frequency in get_frequent_words(text):
                words.append(word + ':' + str(frequency))
            record['words'] = words
            response = dynamo_tbl.put_item(Item=record)
        return 'Ok'
    except Exception as e:
        return e


def download_nltk_data():
    # Fetch the NLTK corpora archive from S3 and unpack it into the PyWren conda runtime.
    s3 = boto3.resource('s3')
    # Skip the download when the data was already unpacked by a previous invocation
    # (warm Lambda container re-use); assumes the archive extracts to an nltk_data/ directory.
    if os.path.exists('/tmp/condaruntime/nltk_data'):
        return
    try:
        s3.Bucket(S3BUCKET).download_file('nltk_data.zip', '/tmp/nltk_data.zip')
        zip_ref = zipfile.ZipFile('/tmp/nltk_data.zip', 'r')
        zip_ref.extractall('/tmp/condaruntime/')
        zip_ref.close()
    except botocore.exceptions.ClientError as e:
        return e


def scrape_content(link):
    # Download the article and return its visible text.
    try:
        html = urllib.urlopen(link).read()
        soup = BeautifulSoup(html, "html.parser")

        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()  # rip it out

        # get text
        text = soup.get_text()
        return text
    except IOError:
        return ''


def get_frequent_words(text):
    # Return the 50 most common non-trivial words in the text.
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    words = nltk.word_tokenize(text)
    # Remove single- and double-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 2]
    # Remove numbers
    words = [word for word in words if not word.isnumeric()]
    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]
    # Remove stopwords
    default_stopwords = set(nltk.corpus.stopwords.words('english'))
    words = [word for word in words if word not in default_stopwords]
    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    return fdist.most_common(50)


def get_urls_from_gdelt_data(file):
    # Read one GDELT events file from the public S3 bucket and return up to 1000 unique article URLs.
    s3 = boto3.client('s3', 'us-east-1')
    try:
        s3_object = s3.get_object(Bucket='gdelt-open-data', Key='events/' + file)
        f = StringIO.StringIO(s3_object['Body'].read().decode('utf-8', 'replace').encode('ascii', 'replace'))
        fieldnames = ['GLOBALEVENTID', 'SQLDATE', 'MonthYear', 'Year', 'FractionDate',
                      'Actor1Code', 'Actor1Name', 'Actor1CountryCode', 'Actor1KnownGroupCode',
                      'Actor1EthnicCode', 'Actor1Religion1Code', 'Actor1Religion2Code',
                      'Actor1Type1Code', 'Actor1Type2Code', 'Actor1Type3Code',
                      'Actor2Code', 'Actor2Name', 'Actor2CountryCode', 'Actor2KnownGroupCode',
                      'Actor2EthnicCode', 'Actor2Religion1Code', 'Actor2Religion2Code',
                      'Actor2Type1Code', 'Actor2Type2Code', 'Actor2Type3Code',
                      'IsRootEvent', 'EventCode', 'EventBaseCode', 'EventRootCode', 'QuadClass',
                      'GoldsteinScale', 'NumMentions', 'NumSources', 'NumArticles', 'AvgTone',
                      'Actor1Geo_Type', 'Actor1Geo_FullName', 'Actor1Geo_CountryCode',
                      'Actor1Geo_ADM1Code', 'Actor1Geo_Lat', 'Actor1Geo_Long', 'Actor1Geo_FeatureID',
                      'Actor2Geo_Type', 'Actor2Geo_FullName', 'Actor2Geo_CountryCode',
                      'Actor2Geo_ADM1Code', 'Actor2Geo_Lat', 'Actor2Geo_Long', 'Actor2Geo_FeatureID',
                      'ActionGeo_Type', 'ActionGeo_FullName', 'ActionGeo_CountryCode',
                      'ActionGeo_ADM1Code', 'ActionGeo_Lat', 'ActionGeo_Long', 'ActionGeo_FeatureID',
                      'DATEADDED', 'SOURCEURL']
        items = csv.DictReader(f, fieldnames, delimiter='\t')
        links = []
        for i, item in enumerate(items):
            links.append(item['SOURCEURL'])
        # remove duplicates in the list of URLs
        links_without_duplicates = []
        for link in links:
            if link not in links_without_duplicates:
                links_without_duplicates.append(link)
        # limit ourselves to 1000 articles
        return links_without_duplicates[:1000]
    except botocore.exceptions.ClientError as e:
        return e
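

# ---------------------------------------------------------------------------
# Hypothetical driver sketch (not part of the original handler code): one way
# the functions above could be wired together with PyWren. The GDELT events
# file name and the batch size are illustrative assumptions, not values taken
# from the lab.
# ---------------------------------------------------------------------------
def chunk(seq, size):
    # split the link list into fixed-size batches so each Lambda invocation handles a few articles
    return [seq[i:i + size] for i in range(0, len(seq), size)]


if __name__ == '__main__':
    links = get_urls_from_gdelt_data('20180129.export.csv')  # example events file key (assumed)
    pwex = pywren.default_executor()
    futures = pwex.map(news_analyzer, chunk(links, 20))      # 20 links per invocation (assumed)
    print(pywren.get_all_results(futures))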