In [3]:
!python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
!pip install neuronx-cc==2.* tensorflow-neuronx ipywidgets transformers


Writing to /root/.config/pip/pip.conf
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting ipywidgets
 Downloading ipywidgets-8.0.6-py3-none-any.whl (138 kB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.3/138.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting transformers
 Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
Collecting widgetsnbextension~=4.0.7
 Downloading widgetsnbextension-4.0.7-py3-none-any.whl (2.1 MB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting jupyterlab-widgets~=3.0.7
 Downloading jupyterlab_widgets-3.0.7-py3-none-any.whl (198 kB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.2/198.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m

In [4]:
import torch
import torch_neuronx
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import transformers


def encode(tokenizer, *inputs, max_length=128, batch_size=1):
 tokens = tokenizer.encode_plus(
 *inputs,
 max_length=max_length,
 padding='max_length',
 truncation=True,
 return_tensors="pt"
 )
 return (
 torch.repeat_interleave(tokens['input_ids'], batch_size, 0),
 torch.repeat_interleave(tokens['attention_mask'], batch_size, 0),
 torch.repeat_interleave(tokens['token_type_ids'], batch_size, 0),
 )


# Create the tokenizer and model
name = "bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name, torchscript=True)

# Set up some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

paraphrase = encode(tokenizer, sequence_0, sequence_2)
not_paraphrase = encode(tokenizer, sequence_0, sequence_1)

# Run the original PyTorch BERT model on CPU
cpu_paraphrase_logits = model(*paraphrase)[0]
cpu_not_paraphrase_logits = model(*not_paraphrase)[0]

# Compile the model for Neuron
model_neuron = torch_neuronx.trace(model, paraphrase)

# Save the TorchScript for inference deployment
filename = 'model.pt'
torch.jit.save(model_neuron, filename)

Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 8.11kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 433/433 [00:00<00:00, 152kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 8.91MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 13.1MB/s]
Downloading pytorch_model.bin: 100%|██████████| 433M/433M [00:01<00:00, 237MB/s] 


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
# Load the TorchScript compiled model
model_neuron = torch.jit.load(filename)

# Verify the TorchScript works on both example inputs
neuron_paraphrase_logits = model_neuron(*paraphrase)[0]
neuron_not_paraphrase_logits = model_neuron(*not_paraphrase)[0]

# Compare the results
print('CPU paraphrase logits: ', cpu_paraphrase_logits.detach().numpy())
print('Neuron paraphrase logits: ', neuron_paraphrase_logits.detach().numpy())
print('CPU not-paraphrase logits: ', cpu_not_paraphrase_logits.detach().numpy())
print('Neuron not-paraphrase logits: ', neuron_not_paraphrase_logits.detach().numpy())

CPU paraphrase logits: [[-0.34945598 1.9003887 ]]
Neuron paraphrase logits: [[-0.34909704 1.8992746 ]]
CPU not-paraphrase logits: [[ 0.5386365 -2.2197142]]
Neuron not-paraphrase logits: [[ 0.537705 -2.2180324]]


In [12]:
import time
import concurrent.futures
import numpy as np


def benchmark(filename, example, n_models=2, n_threads=2, batches_per_thread=10000):
 """
 Record performance statistics for a serialized model and its input example.

 Arguments:
 filename: The serialized torchscript model to load for benchmarking.
 example: An example model input.
 n_models: The number of models to load.
 n_threads: The number of simultaneous threads to execute inferences on.
 batches_per_thread: The number of example batches to run per thread.

 Returns:
 A dictionary of performance statistics.
 """

 # Load models
 models = [torch.jit.load(filename) for _ in range(n_models)]

 # Warmup
 for _ in range(8):
 for model in models:
 model(*example)

 latencies = []

 # Thread task
 def task(model):
 for _ in range(batches_per_thread):
 start = time.time()
 model(*example)
 finish = time.time()
 latencies.append((finish - start) * 1000)

 # Submit tasks
 begin = time.time()
 with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as pool:
 for i in range(n_threads):
 pool.submit(task, models[i % len(models)])
 end = time.time()

 # Compute metrics
 boundaries = [50, 95, 99]
 percentiles = {}

 for boundary in boundaries:
 name = f'latency_p{boundary}'
 percentiles[name] = np.percentile(latencies, boundary)
 duration = end - begin
 batch_size = 0
 for tensor in example:
 if batch_size == 0:
 batch_size = tensor.shape[0]
 inferences = len(latencies) * batch_size
 throughput = inferences / duration

 # Metrics
 metrics = {
 'filename': str(filename),
 'batch_size': batch_size,
 'batches': len(latencies),
 'inferences': inferences,
 'threads': n_threads,
 'models': n_models,
 'duration': duration,
 'throughput': throughput,
 **percentiles,
 }

 display(metrics)


def display(metrics):
 """
 Display the metrics produced by `benchmark` function.

 Args:
 metrics: A dictionary of performance statistics.
 """
 pad = max(map(len, metrics)) + 1
 for key, value in metrics.items():

 parts = key.split('_')
 parts = list(map(str.title, parts))
 title = ' '.join(parts) + ":"

 if isinstance(value, float):
 value = f'{value:0.3f}'

 print(f'{title :<{pad}} {value}')


# Benchmark BERT on Neuron
benchmark(filename, paraphrase)

Filename: model.pt
Batch Size: 1
Batches: 20000
Inferences: 20000
Threads: 2
Models: 2
Duration: 9.944
Throughput: 2011.203
Latency P50: 0.994
Latency P95: 1.017
Latency P99: 1.045
