# Model-level serving configuration.
# One key=value entry per line; lines starting with '#' are comments or
# disabled settings kept for reference.

engine=DeepSpeed

# Extra options passed to model.py or the built-in handler.
job_queue_size=100
batch_size=1
max_batch_delay=1
max_idle_time=60
#gpu.minWorkers=4
#gpu.maxWorkers=4

# Custom environment variables.
#env=SERVING_NUMBER_OF_NETTY_THREADS=2

# Allow DeepSpeed workers to be loaded in parallel.
option.parallel_loading=true

# Tensor parallel degree (number of partitions).
option.tensor_parallel_degree=2

# Per-model timeouts.
option.model_loading_timeout=600
#option.predict_timeout=240

# Threshold of python process crashes after which the model is marked as failed.
# NOTE(review): the earlier comment said "10 times" but the configured value
# is 0 -- confirm which one is intended before changing either.
retry_threshold=0