import argparse
import collections
import os
from typing import Tuple

import numpy as np
import torch
import intel_extension_for_pytorch as ipex
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BertTokenizer,
    EvalPrediction,
    default_data_collator,
)

__MODEL_DICT__ = dict()
__MODEL_FP32_DICT__ = dict()


def parse_args():
    parser = argparse.ArgumentParser(
        description="Finetune a transformers model on a Question Answering task"
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        default="squad",
        help="The name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--max_seq_length",
        type=int,
        default=384,
        help=(
            "The maximum total input sequence length after tokenization. Sequences"
            " longer than this will be truncated, sequences shorter will be padded if"
            " `--pad_to_max_length` is passed."
        ),
    )
    parser.add_argument(
        "--max_answer_length",
        type=int,
        default=30,
        help=(
            "The maximum length of an answer that can be generated. This is needed"
            " because the start and end predictions are not conditioned on one another."
        ),
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="csarron/bert-base-uncased-squad-v1",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=8,
        help="Batch size for the evaluation.",
    )
    parser.add_argument(
        "--doc_stride",
        type=int,
        default=128,
        help=(
            "When splitting up a long document into chunks, how much stride to take"
            " between chunks."
        ),
    )
    parser.add_argument(
        "--n_best_size",
        type=int,
        default=20,
        help=(
            "The total number of n-best predictions to generate when looking for an"
            " answer."
        ),
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default="./",
        help="Where to store the quantized model.",
    )
    args = parser.parse_args()
    return args


def prepare_dataset(args):
    print("Preparing dataset...")
    raw_datasets = load_dataset(args.dataset_name, None)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True)
    column_names = raw_datasets["train"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]
    pad_on_right = tokenizer.padding_side == "right"
    max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
    tokenizer.save_pretrained(save_directory=args.model_path)

    def prepare_validation_features(examples):
        # Strip leading whitespace from questions, then tokenize question/context
        # pairs into overlapping features of at most max_seq_length tokens.
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        tokenized_examples["example_id"] = []
        for i in range(len(tokenized_examples["input_ids"])):
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])
            # Keep offsets only for context tokens so postprocessing can map
            # predictions back to character spans in the original context.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]
        return tokenized_examples

    eval_examples = raw_datasets["validation"]
    eval_dataset = eval_examples.map(
        prepare_validation_features,
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on validation dataset",
    )
    data_collator = default_data_collator
    eval_dataset_for_model = eval_dataset.remove_columns(
        ["example_id", "offset_mapping"]
    )
    eval_dataloader = DataLoader(
        eval_dataset_for_model, collate_fn=data_collator, batch_size=args.batch_size
    )
    return answer_column_name, eval_examples, eval_dataset, eval_dataloader


def TraceAndSave(args, model, eval_dataloader):
    model.eval()
    # Construct jit inputs based on example tensors
    jit_inputs = []
    example_batch = next(iter(eval_dataloader))
    for key in example_batch:
        example_tensor = torch.ones_like(example_batch[key])
        jit_inputs.append(example_tensor)
    jit_inputs = tuple(jit_inputs)
    with torch.no_grad():
        model = torch.jit.trace(model, jit_inputs, check_trace=False, strict=False)
        model = torch.jit.freeze(model)
    model.save(os.path.join(args.model_path, "model_fp32.pt"))


def IPEX_quantize(args, model, eval_dataloader):
    model.eval()
    conf = ipex.quantization.QuantConf(qscheme=torch.per_tensor_affine)
    # Here we use dataset samples for calibration
    print("Doing calibration...")
    for step, batch in enumerate(eval_dataloader):
        print("Calibration step-", step)
        with torch.no_grad():
            # conf is updated with the observed statistics while calibrating on the dataset
            with ipex.quantization.calibrate(conf):
                model(**batch)
        if step == 5:
            break
    # [Optional] You can save this calibration file for later use
    # conf.save('int8_conf.json')

    # Construct jit inputs based on example tensors
    jit_inputs = []
    example_batch = next(iter(eval_dataloader))
    for key in example_batch:
        example_tensor = torch.ones_like(example_batch[key])
        jit_inputs.append(example_tensor)
    jit_inputs = tuple(jit_inputs)

    # Convert the calibrated model into an INT8 TorchScript model
    print("Doing model converting...")
    with torch.no_grad():
        model = ipex.quantization.convert(model, conf, jit_inputs)
    # Two warm-up iterations to trigger TorchScript fusions
    with torch.no_grad():
        model(**example_batch)
        model(**example_batch)
    # Save quantized model
    model.save(os.path.join(args.model_path, "model_int8.pt"))
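

# [Optional] A minimal sketch of the same INT8 flow using the newer
# `ipex.quantization.prepare`/`convert` calibration API. This is an assumption:
# it expects IPEX >= 1.12, and the helper name below is ours. The rest of this
# script keeps the legacy QuantConf/calibrate path above and never calls it.
def IPEX_quantize_new_api(args, model, eval_dataloader, calibration_steps=6):
    from intel_extension_for_pytorch.quantization import convert, prepare

    model.eval()
    # Dummy example inputs, mirroring the jit_inputs construction used above
    example_batch = next(iter(eval_dataloader))
    jit_inputs = tuple(torch.ones_like(example_batch[key]) for key in example_batch)
    qconfig = ipex.quantization.default_static_qconfig
    prepared_model = prepare(model, qconfig, example_inputs=jit_inputs, inplace=False)
    with torch.no_grad():
        # Run a few batches through the prepared model to collect calibration statistics
        for step, batch in enumerate(eval_dataloader):
            prepared_model(**batch)
            if step + 1 == calibration_steps:
                break
        converted = convert(prepared_model)
        traced = torch.jit.trace(converted, jit_inputs, check_trace=False, strict=False)
        traced = torch.jit.freeze(traced)
    # Saved under a separate (hypothetical) file name to avoid clashing with model_int8.pt
    traced.save(os.path.join(args.model_path, "model_int8_new_api.pt"))
    return traced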


def model_fn(args, eval_dataloader):
    config = AutoConfig.from_pretrained(
        args.model_name_or_path,
        torchscript=True,
        return_dict=False,
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        args.model_name_or_path,
        config=config,
    )
    # Trace and save the FP32 model
    TraceAndSave(args, model, eval_dataloader)
    # Generate the INT8 quantized model and save it
    IPEX_quantize(args, model, eval_dataloader)
    # model.save_pretrained(os.path.join(args.model_path, "model.pt"))
    model_path = os.path.join(args.model_path, "model_int8.pt")
    model = torch.jit.load(model_path)
    return model


def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    n_best_size: int = 20,
    max_answer_length: int = 30,
):
    all_start_logits, all_end_logits = predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
    all_predictions = collections.OrderedDict()
    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]
        min_null_prediction = None
        prelim_predictions = []
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]
            token_is_max_context = features[feature_index].get(
                "token_is_max_context", None
            )
            # Score of the null answer (CLS token as both start and end)
            feature_null_score = start_logits[0] + end_logits[0]
            if (
                min_null_prediction is None
                or min_null_prediction["score"] > feature_null_score
            ):
                min_null_prediction = {
                    "offsets": (0, 0),
                    "score": feature_null_score,
                    "start_logit": start_logits[0],
                    "end_logit": end_logits[0],
                }
            start_indexes = np.argsort(start_logits)[
                -1 : -n_best_size - 1 : -1
            ].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or len(offset_mapping[start_index]) < 2
                        or offset_mapping[end_index] is None
                        or len(offset_mapping[end_index]) < 2
                    ):
                        continue
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue
                    if (
                        token_is_max_context is not None
                        and not token_is_max_context.get(str(start_index), False)
                    ):
                        continue
                    prelim_predictions.append(
                        {
                            "offsets": (
                                offset_mapping[start_index][0],
                                offset_mapping[end_index][1],
                            ),
                            "score": start_logits[start_index]
                            + end_logits[end_index],
                            "start_logit": start_logits[start_index],
                            "end_logit": end_logits[end_index],
                        }
                    )
        predictions = sorted(
            prelim_predictions, key=lambda x: x["score"], reverse=True
        )[:n_best_size]
        context = example["context"]
        for pred in predictions:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0] : offsets[1]]
        if len(predictions) == 0 or (
            len(predictions) == 1 and predictions[0]["text"] == ""
        ):
            predictions.insert(
                0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}
            )
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()
        for prob, pred in zip(probs, predictions):
            pred["probability"] = prob
        all_predictions[example["id"]] = predictions[0]["text"]
    return all_predictions


def model_fn_ep(model_dir):
    global __MODEL_DICT__
    if __MODEL_DICT__:
        print("Model INT8 already loaded")
    else:
        print("Loading Model INT8")
        model_path = os.path.join(model_dir, "model_int8.pt")
        model = torch.jit.load(model_path)
        model = model.to("cpu")
        tokenizer = BertTokenizer.from_pretrained(
            "csarron/bert-base-uncased-squad-v1", use_fast=True
        )
        model_dict = {"model": model, "tokenizer": tokenizer}
        __MODEL_DICT__ = model_dict
    return __MODEL_DICT__


def model_fn_ep_fp32(model_dir):
    global __MODEL_FP32_DICT__
    if __MODEL_FP32_DICT__:
        print("Model FP32 already loaded")
    else:
        print("Loading Model FP32")
        model_path = os.path.join(model_dir, "model_fp32.pt")
        model_fp32 = torch.jit.load(model_path)
        model_fp32 = model_fp32.to("cpu")
        tokenizer = BertTokenizer.from_pretrained(
            "csarron/bert-base-uncased-squad-v1", use_fast=True
        )
        model_fp32_dict = {"model": model_fp32, "tokenizer": tokenizer}
        __MODEL_FP32_DICT__ = model_fp32_dict
    return __MODEL_FP32_DICT__


def predict_fn_ep(model_dict, input_data, context):
    """Apply the model to an incoming question/context request."""
    tokenizer = model_dict["tokenizer"]
    model = model_dict["model"]
    encoded_input = tokenizer.encode_plus(
        input_data,
        context,
        return_tensors="pt",
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    output = model(**encoded_input)
    # Pick the most likely start/end token positions for the answer span
    answer_start = torch.argmax(output[0])
    answer_end = torch.argmax(output[1]) + 1
    if answer_end > answer_start:
        answer_text = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(
                encoded_input["input_ids"][0][answer_start:answer_end]
            )
        )
    else:
        answer_text = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(
                encoded_input["input_ids"][0][answer_start:]
            )
        )
    return answer_text


def predict_fn(
    args, model, answer_column_name, eval_examples, eval_dataset, eval_dataloader
):
    def post_processing_function(examples, features, predictions, stage="eval"):
        predictions = postprocess_qa_predictions(
            examples=examples,
            features=features,
            predictions=predictions,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
        )
        formatted_predictions = [
            {"id": k, "prediction_text": v} for k, v in predictions.items()
        ]
        references = [
            {"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples
        ]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad")

    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
        # Concatenate per-batch logits into one (num_features, max_len) array,
        # padding shorter rows with -100.
        step = 0
        logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64)
        for _, output_logit in enumerate(start_or_end_logits):
            batch_size = output_logit.shape[0]
            cols = output_logit.shape[1]
            if step + batch_size < len(dataset):
                logits_concat[step : step + batch_size, :cols] = output_logit
            else:
                logits_concat[step:, :cols] = output_logit[: len(dataset) - step]
            step += batch_size
        return logits_concat

    print("***** Running Evaluation *****")
    print(f"  Num examples = {len(eval_dataset)}")
    print(f"  Batch size = {args.batch_size}")
    all_start_logits = []
    all_end_logits = []
    for _, batch in enumerate(tqdm(eval_dataloader)):
        with torch.no_grad():
            outputs = model(**batch)
            start_logits = outputs[0]
            end_logits = outputs[1]
            all_start_logits.append(start_logits.cpu().numpy())
            all_end_logits.append(end_logits.cpu().numpy())
    max_len = max([x.shape[1] for x in all_start_logits])
    start_logits_concat = create_and_fill_np_array(
        all_start_logits, eval_dataset, max_len
    )
    end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len)
    outputs_numpy = (start_logits_concat, end_logits_concat)
    prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
    eval_metric = metric.compute(
        predictions=prediction.predictions, references=prediction.label_ids
    )
    print(f"Evaluation metrics: {eval_metric}")


def main():
    args = parse_args()
    print(
        "***** Running fine-tuned BERT-base inference for the Question Answering task"
        " with IPEX quantization *****"
    )
    # Prepare the dataset
    answer_column_name, eval_examples, eval_dataset, eval_dataloader = prepare_dataset(
        args
    )
    # Prepare the IPEX-quantized model
    model = model_fn(args, eval_dataloader)
    # Run model inference and evaluation
    predict_fn(
        args, model, answer_column_name, eval_examples, eval_dataset, eval_dataloader
    )
    print("***** Test End Point *****")
    model_dict = model_fn_ep(args.model_path)
    context = (
        "The Panthers finished the regular season with a 15-1 record, and quarterback"
        " Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the"
        " Arizona Cardinals 49-15 in the NFC Championship Game and advanced to their"
        " second Super Bowl appearance since the franchise was founded in 1995."
        " The Broncos finished the regular season with a 12-4 record, and denied the"
        " New England Patriots a chance to defend their title from Super Bowl XLIX by"
        " defeating them 20-18 in the AFC Championship Game. They joined the Patriots,"
        " Dallas Cowboys, and Pittsburgh Steelers as one of four teams that have made"
        " eight appearances in the Super Bowl."
    )
    question = "Who denied Patriots?"
    # question = "How many appearances have the Denver Broncos made in the Super Bowl?"
    answer_text = predict_fn_ep(model_dict, question, context)
    print("Question:", question, "Answer:", answer_text)


if __name__ == "__main__":
    main()
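

# Example invocation (the script filename below is illustrative; with the defaults the
# script pulls the SQuAD v1.1 validation split and the csarron/bert-base-uncased-squad-v1
# checkpoint from the Hugging Face Hub, and writes model_fp32.pt / model_int8.pt under
# --model_path):
#   python run_qa_ipex_int8.py --batch_size 8 --model_path ./qa_int8_model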