# Copyright 2025 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This module acts as a lightweight gateway to OpenAI-compatible APIs.
# You can send chat prompts, create embeddings, or get model responses without
# worrying about authentication or endpoint differences. It simplifies access so
# you can test, analyze, or integrate AI features directly into your projects or
# notebooks with minimal setup.
from typing import TYPE_CHECKING, Dict, List, Optional

if TYPE_CHECKING:
    # Imported only for type checking; the project object is supplied by the caller.
    import mlrun


class VLLMModule:
    """
    VLLMModule

    This module provides a lightweight wrapper for deploying a vLLM
    (OpenAI-compatible) large language model server as an MLRun application runtime.

    The VLLMModule is responsible for:

    - Creating an MLRun application runtime based on a vLLM container image
    - Configuring GPU resources, memory limits, and Kubernetes node selection
    - Launching the model using `vllm serve` with configurable runtime flags
    - Supporting multi-GPU inference via tensor parallelism
    - Automatically configuring shared memory (/dev/shm) when using multiple GPUs
    - Exposing an OpenAI-compatible API (e.g. /v1/chat/completions) for inference
    - Providing a simple Python interface for deployment and invocation from Jupyter notebooks

    The module is designed to be used in Jupyter notebooks and MLRun pipelines,
    allowing users to deploy and test large language models on Kubernetes
    with minimal configuration.
    """

    def __init__(
        self,
        project: "mlrun.projects.MlrunProject",
        *,
        node_selector: Optional[Dict[str, str]] = None,
        name: str = "vllm",
        image: str = "vllm/vllm-openai:latest",
        model: str = "Qwen/Qwen2.5-Omni-3B",
        gpus: int = 1,
        mem: str = "10G",
        port: int = 8000,
        dtype: str = "auto",
        uvicorn_log_level: str = "info",
        max_tokens: int = 500,
    ):
        if gpus < 1:
            raise ValueError("gpus must be >= 1")
        if node_selector is None:
            node_selector = {"alpha.eksctl.io/nodegroup-name": "added-gpu"}
        if not isinstance(max_tokens, int):
            raise TypeError("max_tokens must be an integer")
        if max_tokens < 1:
            raise ValueError("max_tokens must be >= 1")
        self.project = project
        self.name = name
        self.image = image
        self.model = model
        self.gpus = gpus
        self.mem = mem
        self.node_selector = node_selector
        self.port = port
        self.dtype = dtype
        self.uvicorn_log_level = uvicorn_log_level
        self.max_tokens = max_tokens
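        # Create the MLRun application runtime from the vLLM container image
        # and apply resource limits, node selection, and the internal port.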
        self.vllm_app = self.project.set_function(
            name=self.name,
            kind="application",
            image=self.image,
        )
        self.vllm_app.with_limits(gpus=self.gpus, mem=self.mem)
        if self.node_selector:
            self.vllm_app.with_node_selection(node_selector=self.node_selector)
        self.vllm_app.set_internal_application_port(self.port)
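        # Build the `vllm serve` command line from the constructor settings.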
        args: List[str] = [
            "serve",
            self.model,
            "--dtype",
            self.dtype,
            "--port",
            str(self.port),
        ]
        if self.uvicorn_log_level:
            args += ["--uvicorn-log-level", self.uvicorn_log_level]
        if self.gpus > 1:
            args += ["--tensor-parallel-size", str(self.gpus)]
            # Multi-GPU workers communicate through shared memory, so mount an
            # in-memory emptyDir volume at /dev/shm.
            self.vllm_app.spec.volumes = [{"name": "dshm", "emptyDir": {"medium": "Memory"}}]
            self.vllm_app.spec.volume_mounts = [{"name": "dshm", "mountPath": "/dev/shm"}]
        self.vllm_app.spec.command = "vllm"
        self.vllm_app.spec.args = args
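        # Pin the deployment to a single replica: the server holds the model
        # weights on GPU and is not scaled horizontally here.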
        self.vllm_app.spec.min_replicas = 1
        self.vllm_app.spec.max_replicas = 1

    def get_runtime(self):
        """Return the configured MLRun application runtime."""
        return self.vllm_app

    def add_args(self, extra_args: List[str]):
        """Append extra command-line flags to the `vllm serve` invocation."""
        if not isinstance(extra_args, list) or not all(isinstance(x, str) for x in extra_args):
            raise ValueError("extra_args must be a list of strings")
        self.vllm_app.spec.args += extra_args
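

# ---------------------------------------------------------------------------
# Usage sketch (runs only when this file is executed directly, not on import).
# It shows the typical notebook flow described in the class docstring: load or
# create an MLRun project, configure the module, optionally add extra vLLM
# flags, and deploy the application. The project name "llm-demo" and the
# `deploy()` call are illustrative assumptions; the exact deploy entry point
# may differ between MLRun versions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import mlrun

    project = mlrun.get_or_create_project("llm-demo", context="./")
    vllm = VLLMModule(project, gpus=1, mem="16G")

    # Optional: forward additional flags to `vllm serve`
    # (--max-model-len caps the context length).
    vllm.add_args(["--max-model-len", "8192"])

    app = vllm.get_runtime()
    app.deploy()  # assumed deploy entry point; check your MLRun version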