import numpy as np
import sys
import os
import json
from pathlib import Path
import logging

import torch
import triton_python_backend_utils as pb_utils

# This code will be loaded on each Triton worker separately.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class TritonPythonModel:
    # Every Python model must have "TritonPythonModel" as the class name!

    def initialize(self, args):
        """Called once when the model is being loaded.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. Keys include:
            * model_config: A JSON string containing the model configuration
            * model_instance_kind: A string containing model instance kind
            * model_instance_device_id: A string containing model instance device ID
            * model_repository: Model repository path
            * model_version: Model version
            * model_name: Model name
        """
        # Resolve the numpy dtype declared for the "output" tensor in config.pbtxt
        # so execute() can cast the generated ids before returning them.
        self.output_dtype = pb_utils.triton_string_to_numpy(
            pb_utils.get_output_config_by_name(
                json.loads(args['model_config']), "output"
            )['data_type']
        )

        # Imported lazily so the Triton stub can load this module even when
        # transformers is only available in the worker environment.
        from transformers import BartTokenizer, BartModel
        self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
        self.model = BartModel.from_pretrained('facebook/bart-large').cuda()
        # Alternative model kept for reference:
        # from transformers import T5ForConditionalGeneration, T5Tokenizer
        # self.model = T5ForConditionalGeneration.from_pretrained("t5-small").cuda()
        logger.info("TritonPythonModel initialized")

    def execute(self, requests):
        """Run generation for a batch of inference requests.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest, each carrying "input_ids"
            and "attention_mask" integer tensors.

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse; must be the same length
            as `requests`.
        """
        responses = []
        for request in requests:
            # pb_utils.Tensor must be converted via .as_numpy() first;
            # torch.as_tensor() cannot consume a pb_utils.Tensor directly.
            input_ids = pb_utils.get_input_tensor_by_name(
                request, "input_ids"
            ).as_numpy()
            # Log shape, not the tensor itself — "%d" on a tensor object
            # raised TypeError in the original code.
            logger.info(">>> input_ids shape: %s", input_ids.shape)
            input_ids = torch.as_tensor(input_ids).long().cuda()

            attention_mask = pb_utils.get_input_tensor_by_name(
                request, "attention_mask"
            ).as_numpy()
            attention_mask = torch.as_tensor(attention_mask).long().cuda()

            translation = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=1,
            )

            # Move to host and convert to numpy: pb_utils.Tensor requires a
            # numpy array, and torch tensors have no .astype() method.
            np_translation = translation.detach().cpu().numpy()
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[
                    pb_utils.Tensor(
                        "output", np_translation.astype(self.output_dtype)
                    )
                ]
            )
            responses.append(inference_response)
        return responses

    def finalize(self):
        """Called once when the model is being unloaded.

        Optional; allows the model to perform any necessary clean-ups
        before exit.
        """
        # Use the module logger for consistency with the rest of the file.
        logger.info('Cleaning up...')