Path: blob/main/sagemaker/18_inferentia_inference/code/inference.py
import os
from transformers import AutoConfig, AutoTokenizer
import torch
import torch.neuron

# To use one neuron core per worker
os.environ["NEURON_RT_NUM_CORES"] = "1"

# saved weights name
AWS_NEURON_TRACED_WEIGHTS_NAME = "neuron_model.pt"


def model_fn(model_dir):
    # load tokenizer and neuron model from model_dir
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = torch.jit.load(os.path.join(model_dir, AWS_NEURON_TRACED_WEIGHTS_NAME))
    model_config = AutoConfig.from_pretrained(model_dir)

    return model, tokenizer, model_config


def predict_fn(data, model_tokenizer_model_config):
    # unpack model, tokenizer and model config
    model, tokenizer, model_config = model_tokenizer_model_config

    # create embeddings for inputs
    inputs = data.pop("inputs", data)
    embeddings = tokenizer(
        inputs,
        return_tensors="pt",
        max_length=model_config.traced_sequence_length,
        padding="max_length",
        truncation=True,
    )
    # convert to tuple for neuron model
    neuron_inputs = tuple(embeddings.values())

    # run prediction
    with torch.no_grad():
        predictions = model(*neuron_inputs)[0]
        scores = torch.nn.Softmax(dim=1)(predictions)

    # return a list of dictionaries, which is JSON serializable
    return [{"label": model_config.id2label[item.argmax().item()], "score": item.max().item()} for item in scores]
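

# A minimal local smoke test, not part of the SageMaker handler contract:
# on an endpoint, the hosting toolkit calls model_fn and predict_fn for you.
# The "./model" directory and the example payload below are illustrative
# assumptions; they presume the traced model, tokenizer, and config have
# been exported to a local directory as in the accompanying notebook.
if __name__ == "__main__":
    model_tokenizer_model_config = model_fn("./model")
    result = predict_fn({"inputs": "I love using AWS Inferentia!"}, model_tokenizer_model_config)
    print(result)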