######################################################################
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. #
# SPDX-License-Identifier: MIT-0                                     #
######################################################################

from typing import Optional
from fastapi import FastAPI, responses
from configparser import ConfigParser
import torch
import os
import logging
import importlib
import platform
import asyncio

logger = logging.getLogger()

# Read static configuration from config.properties
logger.warning("\nParsing configuration ...")
path_prefix = os.path.dirname(__file__)
with open(path_prefix + '/../config.properties') as f:
    # ConfigParser requires a section header; prepend a synthetic [global] section
    config_lines = '[global]\n' + f.read()
config = ConfigParser()
config.read_string(config_lines)
model_name = config['global']['huggingface_model_name']
tokenizer_class_name = config['global']['huggingface_tokenizer_class']
model_class_name = config['global']['huggingface_model_class']
sequence_length = config['global']['sequence_length']
processor = config['global']['processor']
pipeline_cores = config['global']['pipeline_cores']
batch_size = config['global']['batch_size']

default_question = "What does the little engine say"
default_context = """In the children's story about the little engine, a small locomotive is pulling a large load up a mountain.
Since the load is heavy and the engine is small, it is not sure whether it will be able to do the job. This is a story
about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can'
as it is pulling the heavy load all the way to the top of the mountain. On the way down it says: 'I thought I could.'"""

# Read runtime configuration from environment
postprocess = True
if os.getenv("POSTPROCESS", 'True').lower() in ['false', '0']:
    postprocess = False
quiet = False
if os.getenv("QUIET", "False").lower() in ['true', '1']:
    quiet = True
num_models = 1
try:
    num_models = int(os.getenv("NUM_MODELS", '1'))
except ValueError:
    logger.warning(f"Failed to parse environment variable NUM_MODELS={os.getenv('NUM_MODELS')}")
    logger.warning("Please ensure NUM_MODELS is set to a numeric value. Assuming a value of 1")
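# For reference, a config.properties consistent with the keys read above might
# look like the following. The values are illustrative assumptions only, not
# taken from any particular deployment:
#
#   huggingface_model_name=bert-base-cased-squad2
#   huggingface_tokenizer_class=AutoTokenizer
#   huggingface_model_class=AutoModelForQuestionAnswering
#   sequence_length=128
#   processor=cpu
#   pipeline_cores=1
#   batch_size=1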
Assuming value of 1") # Detect runtime device type inf1, inf2, gpu, cpu, or arm device_type="" try: import torch_neuron device_type="inf1" except ImportError: logger.warning("Inf1 chip not detected") pass try: import torch_neuronx device_type = 'inf2' except ImportError: print('[WARN] Inf2 device not found') pass if device_type in ['inf1', 'inf2']: pass elif torch.cuda.is_available(): device_type="gpu" device = torch.device("cuda") logger.warning(torch.cuda.get_device_name(0)) else: machine=platform.uname().machine device_type="cpu" if machine == 'aarch64': device_type="arm" device = torch.device("cpu") if processor != device_type: logger.warning(f"Configured target processor {processor} differs from actual processor {device_type}") logger.warning(f"Running models on processor: {device_type}") # FastAPI server app = FastAPI() # Server healthcheck @app.get("/") async def read_root(): return {"Status": "Healthy"} # Model inference API endpoint @app.get("/predictions/{model_id}") async def infer(model_id, seq_0: Optional[str] = default_question, seq_1: Optional[str] = default_context): question=seq_0 context=seq_1 status=200 if model_id in models.keys(): if not quiet: logger.warning(f"\nQuestion: {question}\n") tokenizer=tokenizers[model_id] encoded_input = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=128, padding='max_length', truncation=True) if processor=='gpu': encoded_input.to(device) model=models[model_id] model_input = (encoded_input['input_ids'], encoded_input['attention_mask']) output=model(*model_input) # This is specific to Inferentia answer_text = str(output[0]) if postprocess: answer_start = torch.argmax(output[0]) answer_end = torch.argmax(output[1])+1 if (answer_end > answer_start): answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:answer_end])) else: answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:])) if not quiet: logger.warning("\nAnswer: ") logger.warning(answer_text) else: status=404 answer_text = f"Model {model_id} does not exist. Try a model name up to model{num_models-1}" if not quiet: logger.warning(answer_text) return responses.JSONResponse(status_code=status, content={"detail": answer_text}) # Load models in memory and onto accelerator as needed model_suffix = "_bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores+"_"+processor model_path=os.path.join(path_prefix,'models',model_name + model_suffix + ".pt") logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") tokenizers={} models={} transformers = importlib.import_module("transformers") tokenizer_class = getattr(transformers, tokenizer_class_name) for i in range(num_models): model_id = 'model' + str(i) logger.warning(f" {model_id} ...") tokenizers[model_id]=tokenizer_class.from_pretrained(model_name) models[model_id] = torch.jit.load(model_path) if device_type=='gpu': model=models[model_id] model.to(device) elif device_type in ['inf1', 'inf2']: infer(model_id, default_question, default_context) logger.warning(" ... warmup completed")