import pandas as pd
import PyPDF2
import re
import requests
import json
import boto3
from IPython.display import display, HTML, IFrame
from bs4 import BeautifulSoup
pd.set_option('max_colwidth', 80) # Set max column width for displaying Pandas Dataframes
QNA_OUTPUT_STYLE = 'HTML'
def extract_pages(pdf_file, max_pages=100):
pages = []
with open(pdf_file, 'rb') as f:
for i, page in enumerate(PyPDF2.PdfReader(f).pages):
if i == max_pages:
break
pages.append(page.extract_text())
return pages
def download_url_text(url):
r = requests.get(url)
if r.status_code == 200:
return r.content.decode('utf-8')
else:
print(f'Failed to download {url}. Status code = {r.status_code}')
return None
def extract_paragraphs_from_html(text):
html = BeautifulSoup(text, 'html.parser')
return [ p.text for p in html.body.select('p') ]
newline, bold, unbold = '\n', '\033[1m', '\033[0m'
lightred, lightgreen, lightyellow, lightblue = '\033[91m', '\033[92m', '\033[93m', '\033[94m'
lightmagenta, lightcyan, reset = '\033[95m', '\033[96m', '\33[39m'
endpoint_name = ""
def query_endpoint_with_json_payload(encoded_json):
client = boto3.client('runtime.sagemaker')
response = client.invoke_endpoint(EndpointName=endpoint_name, ContentType='application/json', Body=encoded_json)
return response
def parse_response_multiple_texts(query_response):
model_predictions = json.loads(query_response['Body'].read())
generated_text = model_predictions['generated_texts']
return generated_text
def generate_text_from_prompt(prompt, max_length=300, max_time=50, temperature=0.5,
top_k=None, top_p=None, do_sample=True, seed=None):
payload = {
"text_inputs": prompt,
"max_length": max_length,
"max_time": max_time,
"temperature": temperature,
"do_sample": do_sample
}
if top_k is not None:
payload['top_k'] = top_k
if top_p is not None:
payload['top_p'] = top_p
if seed is not None:
payload['seed'] = seed
query_response = query_endpoint_with_json_payload(json.dumps(payload).encode('utf-8'))
return parse_response_multiple_texts(query_response)[0]
def summarize(text, seed=None):
return generate_text_from_prompt(
f"""Summarize the following text in 100 words:\n\n{text}\n\nSummary:""",
temperature=0.2, # Low temperature for summarization
seed=seed
)
def ask(context, question, seed=None):
return generate_text_from_prompt(
f"""CONTEXT:\n{context}\n{question}""",
temperature=0.01, # Lowest temperature for accuracy
max_length=150, # Keep answers from being too verbose
seed=seed
)
def extract_question(text, seed=None):
return generate_text_from_prompt(
f"""EXTRACT QUESTIONS\nContext:\n{text}\nQuestion:""",
temperature=1.0, # Maximum temperature for creativity
seed=seed
)
def create_qna_pairs(text, n, output_style='HTML', seed=None):
questions = []
answers = []
for i in range(n):
qn = extract_question(text, seed) if i == 0 else extract_question(text)
questions.append(qn)
answers.append(ask(text, qn))
if output_style == 'HTML':
output = \
f"""{i+1}. Question: {questions[i]}
Answer: {answers[i]}"""
display(HTML(output))
elif output_style == 'text':
print(f"""{i+1}. {lightblue}{bold}Question{unbold}{reset}: {questions[i]} {lightcyan}{bold}Answer{unbold}{reset}: {answers[i]}""")
if output_style == 'table':
return pd.DataFrame({
'Question': questions,
'Answer': answers
}).drop_duplicates()