In [None]:
# Install transformers, the library for Natural Language Understanding (NLU)
pip install transformers

In [25]:
import os
import requests
import json
from transformers import AutoTokenizer
from transformers import DistilBertForQuestionAnswering
from transformers import DistilBertTokenizer
from transformers import pipeline
from transformers import DistilBertTokenizerFast
from transformers import DistilBertConfig
from transformers import Trainer
from transformers import TrainingArguments

In [8]:
# Name of the pretrained checkpoint; the model and its tokenizer are both
# loaded from this same identifier.
model_name = 'distilbert-base-uncased-distilled-squad'
model = DistilBertForQuestionAnswering.from_pretrained(model_name)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [9]:
# Sanity-check the loaded model with a question-answering pipeline:
# ask a single question about a fixed passage and display the result.
context = "The Intergovernmental Panel on Climate Change (IPCC) is a scientific intergovernmental body under the auspices of the United Nations, set up at the request of member governments. It was first established in 1988 by two United Nations organizations, the World Meteorological Organization (WMO) and the United Nations Environment Programme (UNEP), and later endorsed by the United Nations General Assembly through Resolution 43/53. Membership of the IPCC is open to all members of the WMO and UNEP. The IPCC produces reports that support the United Nations Framework Convention on Climate Change (UNFCCC), which is the main international treaty on climate change. The ultimate objective of the UNFCCC is to \"stabilize greenhouse gas concentrations in the atmosphere at a level that would prevent dangerous anthropogenic [i.e., human-induced] interference with the climate system\". IPCC reports cover \"the scientific, technical and socio-economic information relevant to understanding the scientific basis of risk of human-induced climate change, its potential impacts and options for adaptation and mitigation.\""
question = "What organization is the IPCC a part of?"

# Build the pipeline around the model and tokenizer loaded earlier
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

# The call is the last expression of the cell, so its return value is displayed
nlp({
 'question': question,
 'context': context
})

{'score': 0.56325364112854,
 'start': 118,
 'end': 132,
 'answer': 'United Nations'}

In [3]:
# Download the SQuAD v2.0 train and dev JSON files into ./squad/
os.makedirs('squad', exist_ok=True)

url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

for file in ['train-v2.0.json', 'dev-v2.0.json']:
    # stream=True avoids buffering the whole file in memory; the timeout keeps a
    # stalled connection from hanging the notebook forever
    with requests.get(f'{url}{file}', stream=True, timeout=60) as res:
        # fail loudly on an HTTP error instead of saving an error page as the dataset
        res.raise_for_status()
        with open(f'./squad/{file}', 'wb') as f:
            # write in 1 MiB chunks (the files are tens of megabytes)
            for chunk in res.iter_content(chunk_size=1 << 20):
                f.write(chunk)

In [4]:
def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    The file is expected to hold ``data -> paragraphs -> qas`` nesting. For
    every question, the answers are taken from ``'plausible_answers'`` when
    that key is present on the question entry, otherwise from ``'answers'``.
    One (context, question, answer) triple is emitted per answer, so a
    question with an empty answer list contributes nothing.

    Args:
        path: location of the JSON file to read.

    Returns:
        A tuple ``(contexts, questions, answers)`` of equally long lists;
        each element of ``answers`` is the answer dict exactly as loaded.
    """
    # load the JSON file into a dictionary
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in payload['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for entry in paragraph['qas']:
                prompt = entry['question']
                # pick which key of the entry holds the answer list
                field = 'plausible_answers' if 'plausible_answers' in entry else 'answers'
                found = entry[field]
                # repeat the context and question once per answer
                contexts.extend([passage_text] * len(found))
                questions.extend([prompt] * len(found))
                answers.extend(found)
    return contexts, questions, answers

In [5]:
# Parse the training and validation files into parallel context/question/answer lists.
# The paths point at ./squad/, the directory the download cell writes to, so the
# notebook runs top to bottom on a fresh kernel.
train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [26]:
# get the character position at which the answer ends in the passage
def add_end_idx(answers, contexts):
    """Annotate each answer dict in place with an ``'answer_end'`` character index.

    For every (answer, context) pair the end index is ``answer_start`` plus the
    length of the answer text. If the context slice at that position does not
    equal the answer text, the span is shifted left by one and then two
    characters; the first shift that matches rewrites both ``'answer_start'``
    and ``'answer_end'``.

    ``'answer_end'`` is always written. When neither the annotated position nor
    a shifted one matches, the unshifted end index is kept so later steps can
    still read the key.

    Args:
        answers: list of dicts with ``'text'`` and ``'answer_start'`` keys;
            mutated in place.
        contexts: list of context strings, aligned index-for-index with
            ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        # gold_text is the answer we expect to find in the context
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # default to the annotated span; only overridden if a shifted span matches
        answer['answer_end'] = end_idx
        if context[start_idx:end_idx] == gold_text:
            continue

        # the annotation is sometimes off by one or two characters
        for n in (1, 2):
            shifted_start = start_idx - n
            if shifted_start < 0:
                # a negative start would wrap around to the end of the string
                break
            if context[shifted_start:end_idx - n] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - n
                # keep the smallest correction that works
                break

In [7]:
# Run add_end_idx on both splits; it annotates the answer dicts in place
# with the character position at which each answer ends.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [11]:
# Tokenize the context/question pairs of both splits. Contexts are passed as the
# first sequence and questions as the second, with truncation and padding enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [12]:
# convert our character start/end positions to token start/end positions
def add_token_positions(encodings, answers):
    """Attach token-level answer spans to ``encodings``.

    For the i-th answer, ``encodings.char_to_token`` is asked for the token at
    ``answer_start`` and at ``answer_end - 1``. Whenever the lookup yields
    ``None`` (treated here as the answer having been truncated away), the
    notebook-level ``tokenizer.model_max_length`` is stored instead.

    Args:
        encodings: tokenizer output offering ``char_to_token`` and ``update``;
            gains ``'start_positions'`` and ``'end_positions'`` entries.
        answers: list of dicts with ``'answer_start'`` and ``'answer_end'``.

    Returns:
        None; ``encodings`` is updated in place.
    """
    starts, ends = [], []
    for row, answer in enumerate(answers):
        first_tok = encodings.char_to_token(row, answer['answer_start'])
        # answer_end is exclusive, so look up the last character of the span
        last_tok = encodings.char_to_token(row, answer['answer_end'] - 1)

        # None means no token was found at that character position
        if first_tok is None:
            first_tok = tokenizer.model_max_length
        if last_tok is None:
            last_tok = tokenizer.model_max_length

        starts.append(first_tok)
        ends.append(last_tok)

    encodings.update({'start_positions': starts, 'end_positions': ends})

In [13]:
# Add 'start_positions' / 'end_positions' to the encodings of both splits
# (add_token_positions updates the encodings objects in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [14]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings.

    Each item is a dict holding, for every key of the encodings, the idx-th
    entry of that key converted to a ``torch.tensor``.
    """

    def __init__(self, encodings):
        # keep a reference only; nothing is copied or converted up front
        self.encodings = encodings

    def __len__(self):
        # one sample per row of input_ids
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

# Wrap the encodings of each split in a SquadDataset; both are handed to the Trainer below
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [16]:
# The steps above prepared the datasets in the form the Trainer expects.
# What remains is to define the TrainingArguments and instantiate a Trainer
# around the model loaded earlier, which is the model that gets fine-tuned.
training_args = TrainingArguments(
 output_dir='./results', # output directory
 num_train_epochs=1, # total number of training epochs
 per_device_train_batch_size=16, # batch size per device during training
 per_device_eval_batch_size=64, # batch size for evaluation
 warmup_steps=10, # number of warmup steps for learning rate scheduler
 weight_decay=0.01, # strength of weight decay
 logging_dir='./logs', # directory for storing logs
 logging_steps=10 # logging interval in steps (per the TrainingArguments docs)
)

trainer = Trainer(
 model=model, # the instantiated transformers model to be trained
 args=training_args, # training arguments, defined above
 train_dataset=train_dataset, # training dataset
 eval_dataset=val_dataset # evaluation dataset
)

In [None]:
# Fine-tune the model on train_dataset with the arguments configured above
trainer.train()

In [18]:
# Save the trained model's state_dict to a local file, model.pth
torch.save(model.state_dict(), 'model.pth')

In [19]:
# Pack model.pth into a gzip-compressed tar archive named model.tar.gz
!tar -cvzf model.tar.gz model.pth

model.pth


In [20]:
# Upload model.tar.gz to the S3 prefix below and print the value returned by the upload call.
# NOTE(review): the bucket name is hard-coded; confirm it exists in the target account.
from sagemaker.s3 import S3Uploader
model_artifact = S3Uploader.upload('model.tar.gz','s3://gkrish-sagemaker/model')
print(model_artifact)

s3://gkrish-sagemaker/model/model.tar.gz


In [22]:
# Create a SageMaker PyTorchModel from the archive stored in S3.
# entry_point / source_dir refer to scripts/inference.py, which is not shown in this notebook.
# NOTE(review): model_data repeats the S3 URI as a literal instead of reusing
# model_artifact from the upload cell; the two must be kept in sync by hand.
import sagemaker
from sagemaker.pytorch.model import PyTorchModel
bertmodel = PyTorchModel(entry_point='inference.py', 
 source_dir='scripts',
 model_data='s3://gkrish-sagemaker/model/model.tar.gz', 
 role=sagemaker.get_execution_role(), 
 framework_version='1.5', 
 py_version='py3')

In [23]:
# Deploy the model to an endpoint on a single ml.m5.xlarge instance
# and keep the object returned by deploy() as `predictor`.
predictor = bertmodel.deploy(initial_instance_count=1, 
 instance_type='ml.m5.xlarge')

-------------!