diff --git a/app/api/api.py b/app/api/api.py
index da91693f..44384150 100644
--- a/app/api/api.py
+++ b/app/api/api.py
@@ -4,7 +4,7 @@
 import os.path
 import app.api.globals as cms_globals
 
-from typing import Dict, Any, Optional, Union, Type
+from typing import Dict, Any, Optional
 from concurrent.futures import ThreadPoolExecutor
 from anyio.lowlevel import RunVar
 from anyio import CapacityLimiter
@@ -18,9 +18,14 @@
 from app.api.auth.db import make_sure_db_and_tables
 from app.api.auth.users import Props
 from app.api.dependencies import ModelServiceDep
-from app.api.utils import add_exception_handlers, add_rate_limiter, init_vllm_engine
+from app.api.utils import (
+    add_exception_handlers,
+    add_rate_limiter,
+    init_vllm_engine,
+    init_sglang_engine,
+    ForwardedPrefixMiddleware,
+)
 from app.config import Settings
-from app.domain import Tags, TagsStreamable, TagsGenerative
 from app.management.tracker_client import TrackerClient
 from app.utils import get_settings, unpack_model_data_package, get_model_data_package_base_name
 from app.exception import ConfigurationException
@@ -29,7 +34,6 @@
 logging.getLogger("asyncio").setLevel(logging.ERROR)
 logger = logging.getLogger("cms")
 
-
 def get_model_server(config: Settings, msd_overwritten: Optional[ModelServiceDep] = None) -> FastAPI:
     """
     Initialises a FastAPI app instance configured for the CMS model service.
@@ -111,7 +115,10 @@ def get_stream_server(config: Settings, msd_overwritten: Optional[ModelServiceDe
     return app
 
 
-def get_generative_server(config: Settings, msd_overwritten: Optional[ModelServiceDep] = None) -> FastAPI:
+def get_generative_server(
+    config: Settings,
+    msd_overwritten: Optional[ModelServiceDep] = None,
+) -> FastAPI:
     """
     Initialises a FastAPI instance configured for a generative server.
 
@@ -134,6 +141,9 @@ def get_generative_server(config: Settings, msd_overwritten: Optional[ModelServi
     if config.ENABLE_TRAINING_APIS == "true":
         app = _load_supervised_training_router(app)
         logger.debug("Supervised training router loaded")
+        if config.DISABLE_UNSUPERVISED_TRAINING != "true":
+            app = _load_unsupervised_training_router(app)
+            logger.debug("Unsupervised training router loaded")
         app = _load_training_operations(app)
 
     if config.AUTH_USER_ENABLED == "true":
@@ -147,7 +157,13 @@ def get_generative_server(config: Settings, msd_overwritten: Optional[ModelServi
 
     return app
 
-def get_vllm_server(config: Settings, model_package_path: str, model_name: str, log_level: str = "info") -> FastAPI:
+def get_vllm_server(
+    config: Settings,
+    model_package_path: str,
+    model_name: str,
+    log_level: str = "info",
+    server_args: Optional[str] = None,
+) -> FastAPI:
     """
     Initialises a FastAPI instance configured for a vLLM server.
 
@@ -155,17 +171,57 @@ def get_vllm_server(config: Settings, model_package_path: str, model_name: str,
         config (Settings): The CMS configuration.
         model_package_path (str): The path to the model package file.
         model_name (str): The name of the model.
-        log_level (str): The log level for the VLLM engine. Default to "info".
+        log_level (str): The log level for the vLLM engine. Default to "info".
+        server_args (Optional[str]): The arguments to pass to the vLLM engine.
+
+    Returns:
+        FastAPI: A FastAPI app instance.
+    """
+
+    app = _get_app(None, streamable=False)
+    model_dir_path = os.path.join(
+        os.path.dirname(model_package_path), get_model_data_package_base_name(model_package_path)
+    )
+    if unpack_model_data_package(model_package_path, model_dir_path):
+        async def _startup() -> None:
+            await init_vllm_engine(app, config, model_dir_path, model_name, log_level, server_args)
+
+        app.add_event_handler("startup", _startup)
+    else:
+        raise ConfigurationException(f"Model package archive format is not supported: {model_package_path}")
+
+    return app
 
+
+def get_sglang_server(
+    config: Settings,
+    model_package_path: str,
+    model_name: str,
+    log_level: str = "info",
+    server_args: Optional[str] = None,
+) -> FastAPI:
+    """
+    Initialises a FastAPI instance configured for an SGLang server.
+
+    Args:
+        config (Settings): The CMS configuration.
+        model_package_path (str): The path to the model package file.
+        model_name (str): The name of the model.
+        log_level (str): The log level for the SGLang engine. Default to "info".
+        server_args (Optional[str]): The arguments to pass to the SGLang engine.
     Returns:
         FastAPI: A FastAPI app instance.
     """
 
     app = _get_app(None, streamable=False)
-    model_dir_path = os.path.join(os.path.dirname(model_package_path), get_model_data_package_base_name(model_package_path))
+    model_dir_path = os.path.join(
+        os.path.dirname(model_package_path), get_model_data_package_base_name(model_package_path)
+    )
     if unpack_model_data_package(model_package_path, model_dir_path):
-        loop = asyncio.get_event_loop()
-        app = loop.run_until_complete(init_vllm_engine(app, model_dir_path, model_name, log_level))
+        async def _startup() -> None:
+            await init_sglang_engine(app, config, model_dir_path, model_name, log_level, server_args)
+
+        app.add_event_handler("startup", _startup)
     else:
         raise ConfigurationException(f"Model package archive format is not supported: {model_package_path}")
 
@@ -204,32 +260,21 @@ def _get_app(
     generative: bool = False,
 ) -> FastAPI:
     config = get_settings()
-    tags: Union[Type[Tags], Type[TagsStreamable], Type[TagsGenerative]]
-    if generative:
-        tags = TagsGenerative
-    elif streamable:
-        tags = TagsStreamable
-    else:
-        tags = Tags
-    tags_metadata = [{
-        "name": tag.name,
-        "description": tag.value
-    } for tag in tags]
+
     app = FastAPI(
         title="CogStack ModelServe",
         summary="A model serving and governance system for CogStack NLP solutions",
         docs_url=None,
         redoc_url=None,
         debug=(config.DEBUG == "true"),
-        openapi_tags=tags_metadata,
     )
+
+    app.add_middleware(ForwardedPrefixMiddleware)   # type: ignore
     add_exception_handlers(app)
 
-    instrumentator = None
-    if not generative:
-        instrumentator = Instrumentator(
-            excluded_handlers=["/docs", "/redoc", "/metrics", "/openapi.json", "/favicon.ico", "none"]
-        ).instrument(app)
+    instrumentator = Instrumentator(
+        excluded_handlers=["/docs", "/redoc", "/metrics", "/openapi.json", "/favicon.ico", "none"]
+    ).instrument(app)
 
     if msd_overwritten is not None:
         cms_globals.model_service_dep = msd_overwritten
@@ -279,8 +324,9 @@ async def redoc_doc(req: Request) -> HTMLResponse:
         )
 
     @app.get("/", include_in_schema=False)
-    async def root_redirect() -> RedirectResponse:
-        return RedirectResponse(url="/docs")
+    async def root_redirect(req: Request) -> RedirectResponse:
+        root_path = req.scope.get("root_path", "").rstrip("/")
+        return RedirectResponse(url=f"{req.url.scheme}://{req.url.netloc}{root_path}/docs")
 
     @app.on_event("shutdown")
     async def on_shutdown() -> None:
diff --git a/app/api/dependencies.py b/app/api/dependencies.py
index 61f3152e..fb379ff8 100644
--- a/app/api/dependencies.py
+++ b/app/api/dependencies.py
@@ -5,7 +5,6 @@
 
 from fastapi import HTTPException, Query
 from starlette.status import HTTP_400_BAD_REQUEST
-
 from typing import Optional
 from app.config import Settings
 from app.domain import ModelType
@@ -14,11 +13,10 @@
 from app.model_services.base import AbstractModelService
 from app.management.model_manager import ModelManager
 
-TRACKING_ID_REGEX = re.compile(r"^[a-zA-Z0-9][\w\-]{0,255}$")
 
+TRACKING_ID_REGEX = re.compile(r"^[a-zA-Z0-9][\w\-]{0,255}$")
 logger = logging.getLogger("cms")
 
-
 class ModelServiceDep(object):
     """Dependency class for injecting the CMS model service based on the given model type."""
 
diff --git a/app/api/routers/generative.py b/app/api/routers/generative.py
index 027492ec..d4c6e8cf 100644
--- a/app/api/routers/generative.py
+++ b/app/api/routers/generative.py
@@ -4,12 +4,12 @@
 import uuid
 import app.api.globals as cms_globals
 
-from typing import Union, Iterable, AsyncGenerator, List
+from typing import Union, Iterable, AsyncGenerator, List, Optional, Dict, Any, cast
 from typing_extensions import Annotated
 from functools import partial
 from fastapi import APIRouter, Depends, Request, Body, Query
 from fastapi.encoders import jsonable_encoder
-from fastapi.responses import PlainTextResponse, StreamingResponse, JSONResponse
+from fastapi.responses import PlainTextResponse, StreamingResponse, JSONResponse, Response
 from starlette.status import (
     HTTP_200_OK,
     HTTP_400_BAD_REQUEST,
@@ -19,28 +19,62 @@
 from app.domain import (
     Tags,
     TagsGenerative,
+    GenerationResult,
     OpenAIChatCompletionsRequest,
     OpenAIChatCompletionsResponse,
     OpenAICompletionsRequest,
     OpenAICompletionsResponse,
     OpenAIEmbeddingsRequest,
     OpenAIEmbeddingsResponse,
+    OpenAIFunctionTool,
+    OpenAIResponseFormat,
     PromptMessage,
     PromptRole,
+    OllamaChatRequest,
+    OllamaGenerateRequest,
+    OllamaShowRequest,
+    OllamaEmbedRequest,
 )
 from app.model_services.base import AbstractModelService
-from app.utils import get_settings, get_prompt_from_messages
+from app.utils import (
+    get_settings,
+    get_prompt_from_messages,
+    get_default_chat_template,
+    resolve_safe_max_model_length,
+    utilise_local_chat_template,
+    dump_pydantic_object_to_dict,
+    extract_tool_calls,
+)
 from app.api.utils import get_rate_limiter
 from app.api.dependencies import validate_tracking_id
-from app.management.prometheus_metrics import cms_prompt_tokens, cms_completion_tokens, cms_total_tokens
+from app.management.prometheus_metrics import (
+    cms_prompt_tokens,
+    cms_completion_tokens,
+    cms_total_tokens,
+    cms_ttft_milliseconds,
+    cms_tpot_milliseconds,
+)
+from app.exception import GenerationException, ClientException
+from lmformatenforcer import JsonSchemaParser
+
 
 PATH_GENERATE = "/generate"
 PATH_GENERATE_ASYNC = "/stream/generate"
-PATH_GENERATE_SSE = "/events/generate"
-PATH_CHAT_COMPLETIONS = "/v1/chat/completions"
-PATH_COMPLETIONS = "/v1/completions"
-PATH_EMBEDDINGS = "/v1/embeddings"
-PATH_MODELS = "/v1/models"
+
+# OpenAI-compatible endpoints
+PATH_CHAT_COMPLETIONS = "/openai/v1/chat/completions"
+PATH_COMPLETIONS = "/openai/v1/completions"
+PATH_EMBEDDINGS = "/openai/v1/embeddings"
+PATH_MODELS = "/openai/v1/models"
+
+# Ollama-compatible endpoints
+PATH_OLLAMA_ROOT = "/ollama/"
+PATH_OLLAMA_TAGS = "/ollama/api/tags"
+PATH_OLLAMA_CHAT = "/ollama/api/chat"
+PATH_OLLAMA_GENERATE = "/ollama/api/generate"
+PATH_OLLAMA_SHOW = "/ollama/api/show"
+PATH_OLLAMA_VERSION = "/ollama/api/version"
+PATH_OLLAMA_EMBED = "/ollama/api/embed"
 
 router = APIRouter()
 config = get_settings()
@@ -57,6 +91,7 @@
     dependencies=[Depends(cms_globals.props.current_active_user)],
     description="Generate text",
 )
+@limiter.limit(config.GENERATION_RATE_LIMIT)
 def generate_text(
     request: Request,
     prompt: Annotated[str, Body(description="The prompt to be sent to the model", media_type="text/plain")],
@@ -64,7 +99,9 @@ def generate_text(
     temperature: Annotated[float, Query(description="The temperature of the generated text", ge=0.0)] = 0.7,
     top_p: Annotated[float, Query(description="The Top-P value for nucleus sampling", ge=0.0, le=1.0)] = 0.9,
     stop_sequences: Annotated[List[str], Query(description="The list of sequences used to stop the generation")] = [],
+    include_usage: Annotated[bool, Query(description="Whether to include token usage in the response")] = False,
     ensure_full_sentences: Annotated[bool, Query(description="Whether to generate full sentences only")] = False,
+    chat_template: Annotated[Optional[str], Query(description="Override chat template for prompt formatting")] = None,
     tracking_id: Union[str, None] = Depends(validate_tracking_id),
     model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
 ) -> PlainTextResponse:
@@ -78,7 +115,10 @@ def generate_text(
         temperature (float): The temperature of the generated text.
         top_p (float): The Top-P value for nucleus sampling.
         stop_sequences (List[str]): The list of sequences used to stop the generation.
+        include_usage (bool): Whether to include token usage in the response.
         ensure_full_sentences (bool): Whether to generate full sentences only.
+        chat_template (Optional[str]): Override chat template name for prompt formatting.
+        tracking_id (Union[str, None]): An optional tracking ID of the requested task.
         model_service (AbstractModelService): The model service dependency.
 
     Returns:
@@ -86,18 +126,33 @@ def generate_text(
     """
 
     tracking_id = tracking_id or str(uuid.uuid4())
+
     if prompt:
-        return PlainTextResponse(
-            model_service.generate(
+        generation_result: GenerationResult = model_service.generate(
+            _build_prompt_text(
+                model_service,
                 prompt,
-                max_tokens=max_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                stop_sequences=stop_sequences,
-                report_tokens=partial(_send_usage_metrics, handler=PATH_GENERATE),
-                ensure_full_sentences=ensure_full_sentences,
+                override_template=chat_template if chat_template else None,
             ),
-            headers={"x-cms-tracking-id": tracking_id},
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stop_sequences=stop_sequences,
+            report_tokens=partial(_send_usage_metrics, handler=PATH_GENERATE),
+            ensure_full_sentences=ensure_full_sentences,
+        )
+
+        return PlainTextResponse(
+            generation_result.text,
+            headers={
+                "x-cms-tracking-id": tracking_id,
+                "x-cms-gen-prompt-token-num": str(generation_result.prompt_token_num),
+                "x-cms-gen-completion-token-num": str(generation_result.completion_token_num),
+                "x-cms-gen-total-token-num": (
+                    str(generation_result.prompt_token_num + generation_result.completion_token_num)
+                ),
+                "x-cms-gen-tpot-ms": str(generation_result.tpot_ms),
+            } if include_usage else {"x-cms-tracking-id": tracking_id},
             status_code=HTTP_200_OK,
         )
     else:
@@ -115,6 +170,7 @@ def generate_text(
     dependencies=[Depends(cms_globals.props.current_active_user)],
     description="Generate a stream of texts",
 )
+@limiter.limit(config.GENERATION_RATE_LIMIT)
 async def generate_text_stream(
     request: Request,
     prompt: Annotated[str, Body(description="The prompt to be sent to the model", media_type="text/plain")],
@@ -122,7 +178,9 @@ async def generate_text_stream(
     temperature: Annotated[float, Query(description="The temperature of the generated text", ge=0.0)] = 0.7,
     top_p: Annotated[float, Query(description="The Top-P value for nucleus sampling", ge=0.0, le=1.0)] = 0.9,
     stop_sequences: Annotated[List[str], Query(description="The list of sequences used to stop the generation")] = [],
+    include_usage: Annotated[bool, Query(description="Whether to include token usage in the response")] = False,
     ensure_full_sentences: Annotated[bool, Query(description="Whether to generate full sentences only")] = False,
+    chat_template: Annotated[Optional[str], Query(description="Override chat template for prompt formatting")] = None,
     tracking_id: Union[str, None] = Depends(validate_tracking_id),
     model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
 ) -> StreamingResponse:
@@ -136,7 +194,9 @@ async def generate_text_stream(
         temperature (float): The temperature of the generated text.
         top_p (float): The Top-P value for nucleus sampling.
         stop_sequences (List[str]): The list of sequences used to stop the generation.
+        include_usage (bool): Whether to include token usage in the response.
         ensure_full_sentences (bool): Whether to generate full sentences only.
+        chat_template (Optional[str]): Override chat template for prompt formatting.
         tracking_id (Union[str, None]): An optional tracking ID of the requested task.
         model_service (AbstractModelService): The model service dependency.
 
@@ -145,17 +205,39 @@ async def generate_text_stream(
     """
 
     tracking_id = tracking_id or str(uuid.uuid4())
+    prompt = _build_prompt_text(
+        model_service,
+        prompt,
+        override_template=chat_template if chat_template else None,
+    )
+
+    async def _stream(prompt: str) -> AsyncGenerator:
+        async for generated in model_service.generate_async(
+            prompt=prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stop_sequences=stop_sequences,
+            report_tokens=partial(_send_usage_metrics, handler=PATH_GENERATE_ASYNC),
+            ensure_full_sentences=ensure_full_sentences,
+        ):
+            if isinstance(generated, GenerationResult):
+                if include_usage:
+                    yield (
+                       "\n\n<cms_token_usage>"
+                       f"Prompt tokens: {str(generated.prompt_token_num)}; "
+                       f"Completion tokens: {str(generated.completion_token_num)}; "
+                       f"Total tokens: {str(generated.prompt_token_num + generated.completion_token_num)}; "
+                       f"TTFT in milliseconds: {str(generated.ttft_ms)}; "
+                       f"TPOT in milliseconds: {str(generated.tpot_ms)}"
+                       "</cms_token_usage>"
+                    )
+                continue
+            yield generated
+
     if prompt:
         return StreamingResponse(
-            model_service.generate_async(
-                prompt,
-                max_tokens=max_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                stop_sequences=stop_sequences,
-                report_tokens=partial(_send_usage_metrics, handler=PATH_GENERATE_ASYNC),
-                ensure_full_sentences=ensure_full_sentences,
-            ),
+            _stream(prompt),
             media_type="text/event-stream",
             headers={"x-cms-tracking-id": tracking_id},
             status_code=HTTP_200_OK,
@@ -176,12 +258,14 @@ async def generate_text_stream(
     dependencies=[Depends(cms_globals.props.current_active_user)],
     description="Generate chat response based on messages, similar to OpenAI's /v1/chat/completions",
 )
+@limiter.limit(config.GENERATION_RATE_LIMIT)
 def generate_chat_completions(
     request: Request,
     request_data: Annotated[OpenAIChatCompletionsRequest, Body(
         description="OpenAI-like completion request", media_type="application/json"
     )],
     ensure_full_sentences: Annotated[bool, Query(description="Whether to generate full sentences only")] = False,
+    chat_template: Annotated[Optional[str], Query(description="Override chat template for prompt formatting")] = None,
     tracking_id: Union[str, None] = Depends(validate_tracking_id),
     model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
 ) -> Union[StreamingResponse, JSONResponse]:
@@ -192,6 +276,7 @@ def generate_chat_completions(
         request (Request): The request object.
         request_data (OpenAIChatRequest): The request data containing model, messages, stream, temperature, top_p, and stop_sequences.
         ensure_full_sentences (bool): Whether to generate full sentences only.
+        chat_template (Optional[str]): Override chat template for prompt formatting.
         tracking_id (Union[str, None]): An optional tracking ID of the requested task.
         model_service (AbstractModelService): The model service dependency.
 
@@ -201,11 +286,16 @@ def generate_chat_completions(
     """
 
     messages = request_data.messages
+    tools = [dump_pydantic_object_to_dict(tool) for tool in request_data.tools] if request_data.tools else None
+    tools_for_prompt = cast(Optional[List[Union[OpenAIFunctionTool, Dict[Any, Any]]]], tools)
     model = model_service.model_name if request_data.model != model_service.model_name else request_data.model
     stream = request_data.stream
+    include_usage = request_data.stream_options.include_usage if request_data.stream_options else False
     max_tokens = request_data.max_tokens
     temperature = request_data.temperature
     top_p = request_data.top_p
+    json_schema_parser = _get_parser_for_response_format(request_data.response_format)
+
     if isinstance(request_data.stop, str):
         stop_sequences = [request_data.stop]
     elif isinstance(request_data.stop, list):
@@ -213,6 +303,7 @@ def generate_chat_completions(
     else:
         stop_sequences = []
     tracking_id = tracking_id or str(uuid.uuid4())
+    _ensures_chat_template(model_service, chat_template)
 
     if not messages:
         error_response = {
@@ -236,53 +327,130 @@ async def _stream(
         top_p: float,
         stop_sequences: List[str],
         ensure_full_sentences: bool,
+        prefix_prompt: Optional[str],
     ) -> AsyncGenerator:
-        data = {
+        data: Dict[str, Any] = {
             "id": tracking_id,
             "object": "chat.completion.chunk",
             "choices": [{"delta": {"role": PromptRole.ASSISTANT.value}}],
         }
         yield f"data: {json.dumps(data)}\n\n"
-        async for chunk in model_service.generate_async(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stop_sequences=stop_sequences,
-            report_tokens=partial(_send_usage_metrics, handler=PATH_CHAT_COMPLETIONS),
-            ensure_full_sentences=ensure_full_sentences,
-        ):
-            data = {
-                "choices": [
-                    {
-                        "delta": {"content": chunk}
-                    }
-                ],
-                "object": "chat.completion.chunk",
+        stream_buffer = ""
+        tool_call_emitted = False
+        try:
+            async for generated in model_service.generate_async(
+                prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                stop_sequences=stop_sequences,
+                report_tokens=partial(_send_usage_metrics, handler=PATH_CHAT_COMPLETIONS),
+                ensure_full_sentences=ensure_full_sentences,
+                json_schema_parser=json_schema_parser,
+                prefix_prompt=prefix_prompt,
+            ):
+                if isinstance(generated, GenerationResult):
+                    if include_usage:
+                        data = {
+                            "usage": {
+                                "prompt_tokens": generated.prompt_token_num,
+                                "completion_tokens": generated.completion_token_num,
+                                "total_tokens": generated.prompt_token_num + generated.completion_token_num,
+                            }
+                        }
+                        yield f"data: {json.dumps(data)}\n\n"
+                    continue
+                if tool_call_emitted:
+                    continue
+                if tools_for_prompt:
+                    stream_buffer += generated
+                    tool_calls = extract_tool_calls(stream_buffer)
+                    if tool_calls:
+                        data = {
+                            "choices": [
+                                {
+                                    "delta": {
+                                        "tool_calls": [
+                                            {
+                                                "id": tool_call["id"],
+                                                "type": "function",
+                                                "function": tool_call["function"],
+                                            }
+                                            for tool_call in tool_calls
+                                        ]
+                                    },
+                                    "finish_reason": "tool_calls",
+                                }
+                            ],
+                            "object": "chat.completion.chunk",
+                        }
+                        yield f"data: {json.dumps(data)}\n\n"
+                        tool_call_emitted = True
+                        continue
+                data = {
+                    "choices": [
+                        {
+                            "delta": {"content": generated}
+                        }
+                    ],
+                    "object": "chat.completion.chunk",
+                }
+                yield f"data: {json.dumps(data)}\n\n"
+        except GenerationException as e:
+            logger.error("Streaming chat generation failed for tracking_id=%s", tracking_id, exc_info=e)
+            error_data = {
+                "error": {
+                    "message": str(e),
+                    "type": "generation_error",
+                }
             }
-            yield f"data: {json.dumps(data)}\n\n"
+            yield f"data: {json.dumps(error_data)}\n\n"
         yield "data: [DONE]\n\n"
 
     assert hasattr(model_service, "tokenizer"), "Model service doesn't have a tokenizer"
-    prompt = get_prompt_from_messages(model_service.tokenizer, messages)
+    prompt = get_prompt_from_messages(
+        tokenizer=model_service.tokenizer,
+        messages=messages,
+        tools=tools_for_prompt,
+        max_input_tokens=(resolve_safe_max_model_length(model_service.model.config) - max_tokens),  # type: ignore
+    )
+    prefix_prompt = None
+    if messages and messages[0].role == PromptRole.SYSTEM:
+        prefix_prompt = get_prompt_from_messages(
+            tokenizer=model_service.tokenizer,
+            messages=[messages[0]],
+            tools=tools_for_prompt,
+            add_generation_prompt=False,
+        )
     if stream:
         return StreamingResponse(
-            _stream(prompt, max_tokens, temperature, top_p, stop_sequences or [], ensure_full_sentences),
+            _stream(
+                prompt=prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                stop_sequences=stop_sequences or [],
+                ensure_full_sentences=ensure_full_sentences,
+                prefix_prompt=prefix_prompt,
+            ),
             media_type="text/event-stream",
             headers={"x-cms-tracking-id": tracking_id},
         )
     else:
-        usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
-        def _report_tokens(prompt_token_num: int, completion_token_num: int) -> None:
-            usage["prompt_tokens"] = prompt_token_num
-            usage["completion_tokens"] = completion_token_num
-            usage["total_tokens"] = prompt_token_num + completion_token_num
+        def _report_tokens(
+            prompt_token_num: int,
+            completion_token_num: int,
+            ttft_milliseconds: int = -1,
+            tpot_milliseconds: int = -1,
+        ) -> None:
             _send_usage_metrics(
                 handler=PATH_CHAT_COMPLETIONS,
                 prompt_token_num=prompt_token_num,
                 completion_token_num=completion_token_num,
+                ttft_milliseconds=ttft_milliseconds,
+                tpot_milliseconds=tpot_milliseconds,
             )
-        generated_text = model_service.generate(
+        generation_result = model_service.generate(
             prompt,
             max_tokens=max_tokens,
             temperature=temperature,
@@ -290,23 +458,45 @@ def _report_tokens(prompt_token_num: int, completion_token_num: int) -> None:
             stop_sequences=stop_sequences or [],
             report_tokens=_report_tokens,
             ensure_full_sentences=ensure_full_sentences,
+            json_schema_parser=json_schema_parser,
+            prefix_prompt=prefix_prompt,
         )
-        completion = OpenAIChatCompletionsResponse(
-            id=tracking_id,
-            object="chat.completion",
-            created=int(time.time()),
-            model=model,
-            choices=[
+        tool_calls = extract_tool_calls(generation_result.text) if tools_for_prompt else []
+        if tool_calls:
+            choices = [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": PromptRole.ASSISTANT.value,
+                        "content": None,
+                        "tool_calls": tool_calls,
+                    },
+                    "finish_reason": "tool_calls",
+                }
+            ]
+        else:
+            choices = [
                 {
                     "index": 0,
                     "message": PromptMessage(
                         role=PromptRole.ASSISTANT,
-                        content=generated_text,
+                        content=generation_result.text,
                     ),
                     "finish_reason": "stop",
                 }
-            ],
-            usage=usage,
+            ]
+
+        completion = OpenAIChatCompletionsResponse(
+            id=tracking_id,
+            object="chat.completion",
+            created=int(time.time()),
+            model=model,
+            choices=choices,
+            usage={
+                "prompt_tokens": generation_result.prompt_token_num,
+                "completion_tokens": generation_result.completion_token_num,
+                "total_tokens": generation_result.prompt_token_num + generation_result.completion_token_num,
+            } if include_usage else None,
         )
         return JSONResponse(content=jsonable_encoder(completion), headers={"x-cms-tracking-id": tracking_id})
 
@@ -318,12 +508,14 @@ def _report_tokens(prompt_token_num: int, completion_token_num: int) -> None:
     dependencies=[Depends(cms_globals.props.current_active_user)],
     description="Generate completion based on prompt, similar to OpenAI's /v1/completions",
 )
+@limiter.limit(config.GENERATION_RATE_LIMIT)
 def generate_text_completions(
     request: Request,
     request_data: Annotated[OpenAICompletionsRequest, Body(
         description="OpenAI-like completion request", media_type="application/json"
     )],
     ensure_full_sentences: Annotated[bool, Query(description="Whether to generate full sentences only")] = False,
+    chat_template: Annotated[Optional[str], Query(description="Override chat template for prompt formatting")] = None,
     tracking_id: Union[str, None] = Depends(validate_tracking_id),
     model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
 ) -> Union[StreamingResponse, JSONResponse]:
@@ -334,6 +526,7 @@ def generate_text_completions(
         request (Request): The request object.
         request_data (OpenAICompletionsRequest): The request data containing model, prompt, stream, temperature, top_p, and stop.
         ensure_full_sentences (bool): Whether to generate full sentences only.
+        chat_template (Optional[str]): Override chat template for prompt formatting.
         tracking_id (Union[str, None]): An optional tracking ID of the requested task.
         model_service (AbstractModelService): The model service dependency.
 
@@ -345,6 +538,7 @@ def generate_text_completions(
     tracking_id = tracking_id or str(uuid.uuid4())
     model = model_service.model_name if request_data.model != model_service.model_name else request_data.model
     stream = request_data.stream
+    include_usage = request_data.stream_options.include_usage if request_data.stream_options else False
     max_tokens = request_data.max_tokens
     temperature = request_data.temperature
     top_p = request_data.top_p
@@ -384,78 +578,112 @@ async def _stream(
         stop_sequences: List[str],
         ensure_full_sentences: bool,
     ) -> AsyncGenerator:
-        data = {
+        data: Dict[str, Any] = {
             "id": tracking_id,
             "object": "text_completion",
             "choices": [{"text": "", "index": 0, "logprobs": None, "finish_reason": None}],
         }
         yield f"data: {json.dumps(data)}\n\n"
-        async for chunk in model_service.generate_async(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stop_sequences=stop_sequences,
-            report_tokens=partial(_send_usage_metrics, handler=PATH_COMPLETIONS),
-            ensure_full_sentences=ensure_full_sentences,
-        ):
-            data = {
-                "object": "text_completion",
-                "choices": [
-                    {
-                        "text": chunk,
-                        "index": 0,
-                        "logprobs": None,
-                        "finish_reason": None,
-                    }
-                ],
+        try:
+            async for generated in model_service.generate_async(
+                prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                stop_sequences=stop_sequences,
+                report_tokens=partial(_send_usage_metrics, handler=PATH_COMPLETIONS),
+                ensure_full_sentences=ensure_full_sentences,
+            ):
+                if isinstance(generated, GenerationResult):
+                    if include_usage:
+                        data = {
+                            "usage": {
+                                "prompt_tokens": generated.prompt_token_num,
+                                "completion_tokens": generated.completion_token_num,
+                                "total_tokens": generated.prompt_token_num + generated.completion_token_num,
+                            }
+                        }
+                        yield f"data: {json.dumps(data)}\n\n"
+                    continue
+                data = {
+                    "object": "text_completion",
+                    "choices": [
+                        {
+                            "text": generated,
+                            "index": 0,
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(data)}\n\n"
+        except GenerationException as e:
+            logger.error("Streaming completion generation failed for tracking_id=%s", tracking_id, exc_info=e)
+            error_data = {
+                "error": {
+                    "message": str(e),
+                    "type": "generation_error",
+                }
             }
-            yield f"data: {json.dumps(data)}\n\n"
+            yield f"data: {json.dumps(error_data)}\n\n"
         yield "data: [DONE]\n\n"
 
+    prompt = _build_prompt_text(
+        model_service,
+        prompt,
+        override_template=chat_template if chat_template else None,
+    )
     if stream:
         return StreamingResponse(
             _stream(prompt, max_tokens, temperature, top_p, stop_sequences, ensure_full_sentences),
             media_type="text/event-stream",
             headers={"x-cms-tracking-id": tracking_id},
         )
+    else:
+        def _report_tokens(
+            prompt_token_num: int,
+            completion_token_num: int,
+            ttft_milliseconds: int = -1,
+            tpot_milliseconds: int = -1,
+        ) -> None:
+            _send_usage_metrics(
+                handler=PATH_COMPLETIONS,
+                prompt_token_num=prompt_token_num,
+                completion_token_num=completion_token_num,
+                ttft_milliseconds=ttft_milliseconds,
+                tpot_milliseconds=tpot_milliseconds,
+            )
 
-    usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
-    def _report_tokens(prompt_token_num: int, completion_token_num: int) -> None:
-        usage["prompt_tokens"] = prompt_token_num
-        usage["completion_tokens"] = completion_token_num
-        usage["total_tokens"] = prompt_token_num + completion_token_num
-        _send_usage_metrics(
-            handler=PATH_COMPLETIONS,
-            prompt_token_num=prompt_token_num,
-            completion_token_num=completion_token_num,
+        generation_result = model_service.generate(
+            prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stop_sequences=stop_sequences,
+            report_tokens=_report_tokens,
+            ensure_full_sentences=ensure_full_sentences,
         )
-    generated_text = model_service.generate(
-        prompt,
-        max_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        stop_sequences=stop_sequences,
-        report_tokens=_report_tokens,
-        ensure_full_sentences=ensure_full_sentences,
-    )
 
-    completion = OpenAICompletionsResponse(
-        id=tracking_id,
-        object="text_completion",
-        created=int(time.time()),
-        model=model,
-        choices=[
-            {
-                "index": 0,
-                "text": generated_text,
-                "logprobs": None,
-                "finish_reason": "stop",
-            }
-        ],
-        usage=usage,
-    )
-    return JSONResponse(content=jsonable_encoder(completion), headers={"x-cms-tracking-id": tracking_id})
+        completion = OpenAICompletionsResponse(
+            id=tracking_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=model,
+            choices=[
+                {
+                    "index": 0,
+                    "text": generation_result.text,
+                    "logprobs": None,
+                    "finish_reason": "stop",
+                }
+            ],
+            usage={
+                "prompt_tokens": generation_result.prompt_token_num,
+                "completion_tokens": generation_result.completion_token_num,
+                "total_tokens": generation_result.prompt_token_num + generation_result.completion_token_num,
+            } if include_usage else None,
+        )
+        return JSONResponse(content=jsonable_encoder(completion), headers={"x-cms-tracking-id": tracking_id})
 
 
 @router.post(
@@ -550,7 +778,8 @@ def embed_texts(
     dependencies=[Depends(cms_globals.props.current_active_user)],
     description="List available models, similar to OpenAI's /v1/models endpoint",
 )
-def list_models(
+async def list_models(
+    request: Request,
     model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
 ) -> JSONResponse:
     """
@@ -582,7 +811,8 @@ def list_models(
     dependencies=[Depends(cms_globals.props.current_active_user)],
     description="Get a specific model, similar to OpenAI's /v1/models/{model_id} endpoint",
 )
-def get_model(
+async def get_model(
+    request: Request,
     model_name: str,
     model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
 ) -> JSONResponse:
@@ -619,14 +849,476 @@ def get_model(
     return JSONResponse(content=response)
 
 
+@router.get(
+    PATH_OLLAMA_ROOT,
+    tags=[Tags.OllamaCompatible],
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="Health check, similar to Ollama's / endpoint (GET)",
+)
+async def ollama_health_get(request: Request) -> JSONResponse:
+    return JSONResponse(content={"status": "ok"}, status_code=HTTP_200_OK)
+
+
+@router.head(
+    PATH_OLLAMA_ROOT,
+    tags=[Tags.OllamaCompatible],
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="Health check, similar to Ollama's / endpoint (HEAD)",
+)
+async def ollama_health_head(request: Request) -> Response:
+    return Response(status_code=HTTP_200_OK)
+
+
+@router.get(
+    PATH_OLLAMA_VERSION,
+    tags=[Tags.OllamaCompatible],
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="Get the API version, similar to Ollama's /api/version endpoint",
+)
+async def ollama_version(
+    request: Request,
+    model_service: AbstractModelService = Depends(cms_globals.model_service_dep),
+) -> JSONResponse:
+    return JSONResponse(content={"version": model_service.api_version}) # type: ignore
+
+
+@router.get(
+    PATH_OLLAMA_TAGS,
+    tags=[Tags.OllamaCompatible],
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="List available models, similar to Ollama's /api/tags endpoint",
+)
+async def ollama_list_tags(
+    request: Request,
+    model_service: AbstractModelService = Depends(cms_globals.model_service_dep),
+) -> JSONResponse:
+    model_name = model_service.model_name.replace(" ", "_")
+    response = {
+        "models": [
+            {
+                "name": model_name,
+                "model": model_name,
+                "digest": model_service.digest, # type: ignore
+                "details": {
+                    "format": "cmsmp",
+                    "family": model_service.model.config.model_type,    # type: ignore
+                    "families": [model_service.model.config.model_type],    # type: ignore
+                },
+            }
+        ]
+    }
+    return JSONResponse(content=response)
+
+
+@router.post(
+    PATH_OLLAMA_SHOW,
+    tags=[Tags.OllamaCompatible],
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="Show model information, similar to Ollama's /api/show endpoint",
+)
+async def ollama_show_model(
+    request: Request,
+    request_data: Annotated[OllamaShowRequest, Body(description="Ollama show request", media_type="application/json")],
+    model_service: AbstractModelService = Depends(cms_globals.model_service_dep),
+) -> JSONResponse:
+    requested_model_name = request_data.model
+    model_name = model_service.model_name.replace(" ", "_")
+    if requested_model_name and requested_model_name != model_name:
+        return JSONResponse(
+            content={"error": f"model '{requested_model_name}' not found"},
+            status_code=HTTP_404_NOT_FOUND,
+        )
+
+    model_card = model_service.info().model_card
+
+    response = {
+        "modelfile": model_service.info().model_card.get(   # type: ignore
+            "_name_or_path", model_service.model_name
+        ),
+        "template": model_service.tokenizer.chat_template,  # type: ignore
+        "details": {
+            "family": model_service.model.config.model_type,    # type: ignore
+        },
+        "model_info": model_card,
+        "capabilities": ["completion", "chat", "create_embeddings"]
+    }
+    return JSONResponse(content=response)
+
+
+@router.post(
+    PATH_OLLAMA_GENERATE,
+    tags=[Tags.OllamaCompatible],
+    response_model=None,
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="Generate a completion, similar to Ollama's /api/generate endpoint",
+)
+@limiter.limit(config.GENERATION_RATE_LIMIT)
+async def ollama_generate(
+    request: Request,
+    request_data: Annotated[
+        OllamaGenerateRequest, Body(description="Ollama generate request", media_type="application/json")
+    ],
+    ensure_full_sentences: Annotated[bool, Query(description="Whether to generate full sentences only")] = False,
+    chat_template: Annotated[Optional[str], Query(description="Override chat template for prompt formatting")] = None,
+    model_service: AbstractModelService = Depends(cms_globals.model_service_dep),
+) -> Union[StreamingResponse, JSONResponse]:
+    prompt = request_data.prompt
+    if not prompt:
+        return JSONResponse(content={"error": "prompt is required"}, status_code=HTTP_400_BAD_REQUEST)
+
+    stream = request_data.stream
+    model_name = model_service.model_name.replace(" ", "_")
+    options = request_data.options.model_dump(exclude_none=True) if request_data.options else {}
+    max_tokens = options.get("num_predict", 512)
+    temperature = options.get("temperature", 0.7)
+    top_p = options.get("top_p", 0.9)
+    stop_sequences = _normalise_stop_sequences(options.get("stop", None))
+    json_schema_parser = _get_parser_for_json_schema(request_data.format)
+
+    def _report_tokens(
+        prompt_token_num: int,
+        completion_token_num: int,
+        ttft_milliseconds: int = -1,
+        tpot_milliseconds: int = -1,
+    ) -> None:
+        _send_usage_metrics(
+            PATH_OLLAMA_GENERATE,
+            prompt_token_num,
+            completion_token_num,
+            ttft_milliseconds=ttft_milliseconds,
+            tpot_milliseconds=tpot_milliseconds,
+        )
+
+    prompt = _build_prompt_text(
+        model_service,
+        prompt,
+        override_template=chat_template if chat_template else None,
+    )
+    if stream:
+        async def _stream() -> AsyncGenerator[str, None]:
+            start = time.perf_counter_ns()
+            generation_result: Optional[GenerationResult] = None
+            try:
+                async for generated in model_service.generate_async(
+                    prompt,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    stop_sequences=stop_sequences,
+                    report_tokens=_report_tokens,
+                    ensure_full_sentences=ensure_full_sentences,
+                    json_schema_parser=json_schema_parser,
+                ):
+                    if isinstance(generated, GenerationResult):
+                        generation_result = generated
+                    else:
+                        yield json.dumps({
+                            "model": model_name,
+                            "created_at": _iso_utc_now(),
+                            "response": generated,
+                            "done": False,
+                        }) + "\n"
+            except GenerationException as e:
+                logger.error("Ollama stream generation failed", exc_info=e)
+                yield json.dumps({
+                    "model": model_name,
+                    "created_at": _iso_utc_now(),
+                    "response": "",
+                    "done": True,
+                    "done_reason": "error",
+                    "error": str(e),
+                    "total_duration": time.perf_counter_ns() - start,
+                }) + "\n"
+                return
+            yield json.dumps({
+                "model": model_name,
+                "created_at": _iso_utc_now(),
+                "response": "",
+                "done": True,
+                "done_reason": "stop",
+                "prompt_eval_count": generation_result.prompt_token_num if generation_result is not None else 0,
+                "eval_count": generation_result.completion_token_num if generation_result is not None else 0,
+                "ttft_in_milliseconds": generation_result.ttft_ms if generation_result is not None else -1,
+                "tpot_in_milliseconds": generation_result.tpot_ms if generation_result is not None else -1,
+                "total_duration": time.perf_counter_ns() - start,
+            }) + "\n"
+
+        return StreamingResponse(_stream(), media_type="application/x-ndjson")
+    else:
+        start = time.perf_counter_ns()
+        generation_result = model_service.generate(
+            prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stop_sequences=stop_sequences,
+            report_tokens=_report_tokens,
+            ensure_full_sentences=ensure_full_sentences,
+            json_schema_parser=json_schema_parser,
+        )
+        return JSONResponse(content={
+            "model": model_name,
+            "created_at": _iso_utc_now(),
+            "response": generation_result.text,
+            "done": True,
+            "done_reason": "stop",
+            "prompt_eval_count": generation_result.prompt_token_num,
+            "eval_count": generation_result.completion_token_num,
+            "ttft_in_milliseconds": generation_result.ttft_ms,
+            "tpot_in_milliseconds": generation_result.tpot_ms,
+            "total_duration": time.perf_counter_ns() - start,
+        })
+
+
+@router.post(
+    PATH_OLLAMA_CHAT,
+    tags=[Tags.OllamaCompatible],
+    response_model=None,
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="Generate a chat completion, similar to Ollama's /api/chat endpoint",
+)
+@limiter.limit(config.GENERATION_RATE_LIMIT)
+async def ollama_chat(
+    request: Request,
+    request_data: Annotated[OllamaChatRequest, Body(description="Ollama chat request", media_type="application/json")],
+    ensure_full_sentences: Annotated[bool, Query(description="Whether to generate full sentences only")] = False,
+    chat_template: Annotated[Optional[str], Query(description="Override chat template for prompt formatting")] = None,
+    model_service: AbstractModelService = Depends(cms_globals.model_service_dep),
+) -> Union[StreamingResponse, JSONResponse]:
+    raw_messages = request_data.messages
+    if not raw_messages:
+        return JSONResponse(content={"error": "messages are required"}, status_code=HTTP_400_BAD_REQUEST)
+
+    stream = request_data.stream
+    model_name = model_service.model_name.replace(" ", "_")
+    options = request_data.options.model_dump(exclude_none=True) if request_data.options else {}
+    max_tokens = options.get("num_predict", 512)
+    temperature = options.get("temperature", 0.7)
+    top_p = options.get("top_p", 0.9)
+    stop_sequences = _normalise_stop_sequences(options.get("stop", None))
+    json_schema_parser = _get_parser_for_json_schema(request_data.format)
+    _ensures_chat_template(model_service, chat_template)
+
+    def _report_tokens(
+        prompt_token_num: int,
+        completion_token_num: int,
+        ttft_milliseconds: int = -1,
+        tpot_milliseconds: int = -1,
+    ) -> None:
+        _send_usage_metrics(
+            PATH_OLLAMA_CHAT,
+            prompt_token_num,
+            completion_token_num,
+            ttft_milliseconds=ttft_milliseconds,
+            tpot_milliseconds=tpot_milliseconds,
+        )
+
+    prompt_messages: List[PromptMessage] = []
+    for message in raw_messages:
+        role_text = message.role
+        try:
+            role = PromptRole(role_text)
+        except ValueError:
+            role = PromptRole.USER
+        prompt_messages.append(PromptMessage(role=role, content=str(message.content)))
+    prompt = get_prompt_from_messages(
+        tokenizer=model_service.tokenizer,  # type: ignore
+        messages=prompt_messages,
+        max_input_tokens=(resolve_safe_max_model_length(model_service.model.config) - max_tokens),  # type: ignore
+    )
+    prefix_prompt = None
+    if prompt_messages and prompt_messages[0].role == PromptRole.SYSTEM:
+        prefix_prompt = get_prompt_from_messages(
+            tokenizer=model_service.tokenizer,  # type: ignore
+            messages=[prompt_messages[0]],
+            add_generation_prompt=False,
+        )
+
+    if stream:
+        start = time.perf_counter_ns()
+
+        async def _stream() -> AsyncGenerator[str, None]:
+            generated_result: Optional[GenerationResult] = None
+            try:
+                async for generated in model_service.generate_async(
+                    prompt,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    stop_sequences=stop_sequences,
+                    report_tokens=_report_tokens,
+                    ensure_full_sentences=ensure_full_sentences,
+                    json_schema_parser=json_schema_parser,
+                    prefix_prompt=prefix_prompt,
+                ):
+                    if isinstance(generated, GenerationResult):
+                        generated_result = generated
+                    else:
+                        yield json.dumps({
+                            "model": model_name,
+                            "created_at": _iso_utc_now(),
+                            "message": {"role": PromptRole.ASSISTANT.value, "content": generated},
+                            "done": False,
+                        }) + "\n"
+            except GenerationException as e:
+                logger.error("Ollama chat stream generation failed", exc_info=e)
+                yield json.dumps({
+                    "model": model_name,
+                    "created_at": _iso_utc_now(),
+                    "message": {"role": PromptRole.ASSISTANT.value, "content": ""},
+                    "done": True,
+                    "done_reason": "error",
+                    "error": str(e),
+                    "total_duration": time.perf_counter_ns() - start,
+                }) + "\n"
+                return
+            yield json.dumps({
+                "model": model_name,
+                "created_at": _iso_utc_now(),
+                "message": {"role": PromptRole.ASSISTANT.value, "content": ""},
+                "done": True,
+                "done_reason": "stop",
+                "prompt_eval_count": generated_result.prompt_token_num if generated_result is not None else 0,
+                "eval_count": generated_result.completion_token_num if generated_result is not None else 0,
+                "ttft_in_milliseconds": generated_result.ttft_ms if generated_result is not None else -1,
+                "tpot_in_milliseconds": generated_result.tpot_ms if generated_result is not None else -1,
+                "total_duration": time.perf_counter_ns() - start,
+            }) + "\n"
+
+        return StreamingResponse(_stream(), media_type="application/x-ndjson")
+    else:
+        start = time.perf_counter_ns()
+        generated_result = model_service.generate(
+            prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stop_sequences=stop_sequences,
+            report_tokens=_report_tokens,
+            ensure_full_sentences=ensure_full_sentences,
+            json_schema_parser=json_schema_parser,
+            prefix_prompt=prefix_prompt,
+        )
+
+        return JSONResponse(content={
+            "model": model_name,
+            "created_at": _iso_utc_now(),
+            "message": {"role": PromptRole.ASSISTANT.value, "content": generated_result.text},
+            "done": True,
+            "done_reason": "stop",
+            "prompt_eval_count": generated_result.prompt_token_num,
+            "eval_count": generated_result.completion_token_num,
+            "ttft_in_milliseconds": generated_result.ttft_ms,
+            "tpot_in_milliseconds": generated_result.tpot_ms,
+            "total_duration": time.perf_counter_ns() - start,
+        })
+
+
+@router.post(
+    PATH_OLLAMA_EMBED,
+    tags=[Tags.OllamaCompatible],
+    response_model=None,
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="Create embeddings, similar to Ollama's /api/embed endpoint",
+)
+def ollama_embed(
+    request: Request,
+    request_data: Annotated[OllamaEmbedRequest, Body(description="Ollama embed request", media_type="application/json")],
+    model_service: AbstractModelService = Depends(cms_globals.model_service_dep),
+) -> JSONResponse:
+    inputs = [request_data.input] if isinstance(request_data.input, str) else request_data.input
+    start = time.perf_counter_ns()
+    embeddings = model_service.create_embeddings(inputs)
+    return JSONResponse(content={
+        "model": model_service.model_name.replace(" ", "_"),
+        "embeddings": embeddings,
+        "total_duration": time.perf_counter_ns() - start,
+    })
+
+
 def _empty_prompt_error() -> Iterable[str]:
     yield "ERROR: No prompt text provided\n"
 
 
-def _send_usage_metrics(handler: str, prompt_token_num: int, completion_token_num: int) -> None:
+def _ensures_chat_template(
+    model_service: AbstractModelService,
+    override_template: Optional[str],
+) -> None:
+    assert hasattr(model_service, "tokenizer"), "Model service doesn't have a tokenizer"
+    if override_template:
+        model_service.tokenizer.chat_template = override_template  # type: ignore
+        return
+    if hasattr(model_service.tokenizer, "chat_template") and model_service.tokenizer.chat_template is None:  # type: ignore
+        model_type = model_service.model.config.model_type  # type: ignore
+        used_local_template = utilise_local_chat_template(model_type, model_service.tokenizer)  # type: ignore
+        if not used_local_template:
+            model_service.tokenizer.chat_template = get_default_chat_template()  # type: ignore
+
+
+def _build_prompt_text(
+    model_service: AbstractModelService,
+    prompt: str,
+    override_template: Optional[str] = None,
+) -> str:
+    _ensures_chat_template(model_service, override_template)
+    return get_prompt_from_messages(
+        tokenizer=model_service.tokenizer,  # type: ignore
+        messages=[PromptMessage(role=PromptRole.USER, content=prompt)],
+        add_generation_prompt=True,
+    )
+
+
+def _get_parser_for_response_format(response_format: Optional[OpenAIResponseFormat]) -> Optional[JsonSchemaParser]:
+    if response_format is None:
+        return None
+    if response_format.type == "json_schema":
+        try:
+            parser = JsonSchemaParser(response_format.json_schema.schema_)
+            setattr(parser, "schema", response_format.json_schema.schema_)
+            return parser
+        except Exception as exc:
+            raise ClientException("Invalid JSON schema in response_format") from exc
+    else:
+        raise ClientException("Unsupported response_format type; only 'json_schema' is supported")
+
+
+def _get_parser_for_json_schema(json_schema: Optional[Dict[str, Any]]) -> Optional[JsonSchemaParser]:
+    if json_schema is None:
+        return None
+    try:
+        return JsonSchemaParser(json_schema)
+    except Exception as exc:
+        raise ClientException("Invalid JSON schema") from exc
+
+
+def _send_usage_metrics(
+    handler: str,
+    prompt_token_num: int,
+    completion_token_num: int,
+    ttft_milliseconds: int = -1,
+    tpot_milliseconds: int = -1,
+) -> None:
     cms_prompt_tokens.labels(handler=handler).observe(prompt_token_num)
     logger.debug("Sent prompt tokens usage: %s", prompt_token_num)
     cms_completion_tokens.labels(handler=handler).observe(completion_token_num)
     logger.debug("Sent completion tokens usage: %s", completion_token_num)
     cms_total_tokens.labels(handler=handler).observe(prompt_token_num + completion_token_num)
     logger.debug("Sent total tokens usage: %s", prompt_token_num + completion_token_num)
+    if ttft_milliseconds != -1:
+        cms_ttft_milliseconds.labels(handler=handler).observe(ttft_milliseconds)
+        logger.debug("Sent time to first token: %s ms", ttft_milliseconds)
+    if tpot_milliseconds != -1:
+        cms_tpot_milliseconds.labels(handler=handler).observe(tpot_milliseconds)
+        logger.debug("Sent time per output token: %s ms", tpot_milliseconds)
+
+
+def _iso_utc_now() -> str:
+    return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime())
+
+
+def _normalise_stop_sequences(raw_stop: Union[None, str, List[str]]) -> List[str]:
+    if isinstance(raw_stop, str):
+        return [raw_stop]
+    if isinstance(raw_stop, list):
+        return [str(x) for x in raw_stop]
+    return []
diff --git a/app/api/routers/model_card.py b/app/api/routers/model_card.py
index ace6bad8..3c510019 100644
--- a/app/api/routers/model_card.py
+++ b/app/api/routers/model_card.py
@@ -4,11 +4,9 @@
 from app.domain import ModelCard, Tags
 from app.model_services.base import AbstractModelService
 from app.utils import get_settings
-from app.api.utils import get_rate_limiter
 
 router = APIRouter()
 config = get_settings()
-limiter = get_rate_limiter(config)
 
 assert cms_globals.props is not None, "Current active user dependency not injected"
 assert cms_globals.model_service_dep is not None, "Model service dependency not injected"
diff --git a/app/api/routers/preview.py b/app/api/routers/preview.py
index 99feb616..3406b18e 100644
--- a/app/api/routers/preview.py
+++ b/app/api/routers/preview.py
@@ -30,7 +30,7 @@
     dependencies=[Depends(cms_globals.props.current_active_user)],
     description="Extract the NER entities in HTML for preview",
 )
-async def get_rendered_entities_from_text(
+def get_rendered_entities_from_text(
     request: Request,
     text: Annotated[str, Body(description="The text to be sent to the model for NER", media_type="text/plain")],
     tracking_id: Union[str, None] = Depends(validate_tracking_id),
diff --git a/app/api/routers/stream.py b/app/api/routers/stream.py
index c44ecf82..2a95e7f3 100644
--- a/app/api/routers/stream.py
+++ b/app/api/routers/stream.py
@@ -8,9 +8,12 @@
 import app.api.globals as cms_globals
 
 from typing import Any, Mapping, Optional, AsyncGenerator
+from typing_extensions import Annotated
 from starlette.types import Receive, Scope, Send
 from starlette.background import BackgroundTask
-from fastapi import APIRouter, Depends, Request, Response, WebSocket, WebSocketException
+from starlette.status import HTTP_202_ACCEPTED
+from fastapi import APIRouter, Depends, Query, Request, Response, WebSocket, WebSocketException, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
 from pydantic import ValidationError, BaseModel
 from app.domain import Tags, TextStreamItem
 from app.model_services.base import AbstractModelService
@@ -20,10 +23,16 @@
 
 PATH_STREAM_PROCESS = "/process"
 PATH_WS = "/ws"
+PATH_SSE_EVENTS = "/sse/events"
+PATH_SSE_PROCESS = "/sse/process"
+SSE_CONNECTION_TIMEOUT_SECONDS = 300
+SSE_CONNECTION_MAX_RETRIES = 10
+
 
 router = APIRouter()
 config = get_settings()
 limiter = get_rate_limiter(config)
+sse_clients: dict[str, asyncio.Queue] = {}
 logger = logging.getLogger("cms")
 
 assert cms_globals.props is not None, "Current active user dependency not injected"
@@ -63,7 +72,7 @@ async def get_entities_stream_from_jsonlines_stream(
     description="WebSocket info endpoint for real-time NER entity extraction. Use ws://host:port/stream/ws to establish an actual WebSocket connection.",
     include_in_schema=True,
 )
-async def get_inline_annotations_from_websocket_info() -> "_WebSocketInfo":
+async def get_inline_entities_from_websocket_info() -> "_WebSocketInfo":
     """
     Information about the WebSocket endpoint for real-time NER entity extraction.
 
@@ -72,9 +81,10 @@ async def get_inline_annotations_from_websocket_info() -> "_WebSocketInfo":
     """
     return _WebSocketInfo()
 
+
 @router.websocket(PATH_WS)
-# @limiter.limit(config.PROCESS_BULK_RATE_LIMIT)  # Not supported yet
-async def get_inline_annotations_from_websocket(
+@limiter.exempt
+async def get_inline_entities_from_websocket(
     websocket: WebSocket,
     user_manager: CmsUserManager = Depends(get_user_manager),
     model_service: AbstractModelService = Depends(cms_globals.model_service_dep),
@@ -99,12 +109,32 @@ async def get_inline_annotations_from_websocket(
     monitor_idle_task = None
     try:
         if get_settings().AUTH_USER_ENABLED == "true":
-            cookie = websocket.cookies.get("fastapiusersauth")
-            if cookie is None:
-                raise WebSocketException(code=WS_1008_POLICY_VIOLATION, reason="Authentication cookie not found")
-            user = await cms_globals.props.auth_backends[1].get_strategy().read_token(cookie, user_manager) # type: ignore
-            if not user or not user.is_active:
-                raise WebSocketException(code=WS_1008_POLICY_VIOLATION, reason="User not found or not active")
+            jwt_backend = cms_globals.props.auth_backends[0]    # type: ignore
+            cookie_backend = cms_globals.props.auth_backends[1] # type: ignore
+            auth_header = websocket.headers.get("Authorization", "")
+            cookie = websocket.cookies.get("fastapiusersauth", "")
+
+            if not auth_header and not cookie:
+                raise WebSocketException(
+                    code=WS_1008_POLICY_VIOLATION,
+                    reason="Authentication credentials not found (Bearer token or cookie required)",
+                )
+
+            user = None
+            try:
+                if auth_header:
+                    bearer_token = auth_header.split(" ", 1)[1].strip()
+                    user = await jwt_backend.get_strategy().read_token(bearer_token, user_manager)  # type: ignore
+                else:
+                    user = await cookie_backend.get_strategy().read_token(cookie, user_manager) # type: ignore
+            except Exception:
+                raise WebSocketException(
+                    code=WS_1008_POLICY_VIOLATION,
+                    reason="Invalid authentication credential",
+                )
+            else:
+                if user is None or not user.is_active:
+                    raise WebSocketException(code=WS_1008_POLICY_VIOLATION, reason="User not found or not active")
 
         await websocket.accept()
 
@@ -146,6 +176,178 @@ async def _monitor_idle() -> None:
             logger.debug(str(e))
 
 
+@router.get(PATH_SSE_EVENTS)
+@limiter.exempt
+async def get_entities_stream_from_sse(
+    request: Request,
+    client_id: Annotated[str, Query(description="Unique client identifier for the SSE connection")],
+    keep_alive: Annotated[Optional[bool], Query(description="Whether to keep the connection alive after periods of inactivity")] = False,
+) -> StreamingResponse:
+    """
+    Server-Sent Events (SSE) endpoint to receive NER entities as stream events for a specific client.
+
+    Args:
+        request (Request): The request object.
+        client_id (str): The unique client identifier for the SSE connection.
+        keep_alive (Optional[bool]): Whether to keep the connection alive after periods of inactivity.
+
+    Returns:
+        StreamingResponse: A streaming response for the SSE connection.
+    """
+    if client_id in sse_clients and sse_clients[client_id] is not None:
+        queue = sse_clients[client_id]
+    else:
+        queue = asyncio.Queue()
+        sse_clients[client_id] = queue
+
+    async def event_generator() -> AsyncGenerator[str, None]:
+        try:
+            yield ": connected\n\n"
+
+            while True:
+                if await request.is_disconnected():
+                    break
+
+                try:
+                    logger.debug(f"Waiting for event for client {client_id}")
+                    event = await asyncio.wait_for(queue.get(), timeout=SSE_CONNECTION_TIMEOUT_SECONDS)
+
+                    if isinstance(event, dict) and event.get("_control") == "close":
+                        logger.debug(f"Closing SSE for client {client_id} as requested")
+                        break
+
+                    yield f"data: {json.dumps(event)}\n\n"
+                except asyncio.TimeoutError:
+                    if keep_alive:
+                        logger.debug(f"Sending keepalive for client {client_id} after timeout")
+                        yield ": keepalive\n\n"
+                        continue
+                    else:
+                        logger.debug(f"Timeout reached for client {client_id}, closing connection")
+                        break
+        except asyncio.CancelledError:
+            logger.debug(f"SSE connection for client {client_id} cancelled")
+        except Exception as e:
+            logger.error(f"SSE error for client {client_id}: {e}")
+            yield f"data: {json.dumps({'error': 'stream_error', 'message': str(e)})}\n\n"
+        finally:
+            sse_clients.pop(client_id, None)
+            logger.debug(f"SSE disconnected for client {client_id}")
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",
+        },
+    )
+
+@router.post(
+    PATH_SSE_PROCESS,
+    tags=[Tags.Annotations.name],
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+)
+@limiter.exempt
+async def send_text_jsonlines_for_processing(
+    request: Request,
+    client_id: Annotated[str, Query(description="Unique client identifier for the SSE connection")],
+    model_service: AbstractModelService = Depends(cms_globals.model_service_dep),
+) -> JSONResponse:
+    """
+    Sends texts in the JSON Lines format for processing; the extracted NER entities are delivered via Server-Sent Events (SSE).
+
+    Args:
+        request (Request): The request object containing the texts in JSON Lines.
+        client_id (str): The unique client identifier for the SSE connection.
+        model_service (AbstractModelService): The model service dependency.
+
+    Returns:
+        JSONResponse: A JSON response indicating the status of the request.
+    """
+    for _ in range(SSE_CONNECTION_MAX_RETRIES):
+        queue = sse_clients.get(client_id)
+        if queue:
+            break
+        await asyncio.sleep(0.1)
+    else:
+        raise HTTPException(status_code=400, detail="Client not connected. Please establish SSE connection first.")
+
+    async def process_text(queue: asyncio.Queue, doc_name: str, text: str) -> None:
+        try:
+            await queue.put({"status": "started", "doc_name": doc_name, "text": text})
+            annotations = await model_service.annotate_async(text)
+            await asyncio.sleep(0.1)
+            for anno in annotations:
+                anno.doc_name = doc_name
+                await queue.put({"type": "annotation", "data": anno.dict(exclude_none=True)})
+            await queue.put({"status": "completed", "doc_name": doc_name})
+        except asyncio.CancelledError:
+            logger.debug(f"Processing for document {doc_name} was cancelled")
+            raise
+        except Exception as e:
+            logger.error(f"Error processing document {doc_name}: {e}")
+            await queue.put({"status": "error", "error": str(e), "doc_name": doc_name})
+
+    tasks = []
+    buffer = ""
+    doc_idx = 0
+
+    try:
+        async for chunk in request.stream():
+            decoded = chunk.decode("utf-8")
+            if not decoded:
+                break
+            buffer += decoded
+
+            while "\n" in buffer:
+                newline_idx = buffer.index("\n")
+                line = buffer[:newline_idx]
+                buffer = buffer[newline_idx + 1:]
+
+                if line.strip():
+                    try:
+                        json_line_obj = json.loads(line)
+                        TextStreamItem(**json_line_obj)
+                        task = asyncio.create_task(
+                            process_text(
+                                queue,
+                                text=json_line_obj["text"],
+                                doc_name=json_line_obj.get("name", f"doc_{doc_idx}"),
+                            )
+                        )
+                        tasks.append(task)
+                    except json.JSONDecodeError as e:
+                        await queue.put({'status': 'error', 'error': f'Invalid JSON Line: {str(e)}', 'content': line})
+                    except ValidationError as e:
+                        await queue.put({'status': 'error', 'error': f'Invalid JSON properties: {str(e)}', 'content': line})
+                    finally:
+                        doc_idx += 1
+
+        if buffer.strip():
+            try:
+                json_line_obj = json.loads(buffer)
+                TextStreamItem(**json_line_obj)
+                task = asyncio.create_task(
+                    process_text(
+                        queue,
+                        text=json_line_obj["text"],
+                        doc_name=json_line_obj.get("name", f"doc_{doc_idx}"),
+                    )
+                )
+                tasks.append(task)
+            except (json.JSONDecodeError, ValidationError) as e:
+                await queue.put({'status': 'error', 'error': str(e), 'content': buffer})
+
+    finally:
+        if tasks:
+            await asyncio.gather(*tasks, return_exceptions=True)
+
+        await queue.put({"status": "all_completed", "total_docs": doc_idx})
+
+    return JSONResponse(content={"status": "accepted", "total_docs": doc_idx}, status_code=HTTP_202_ACCEPTED)
+
 class _LocalStreamingResponse(Response):
 
     def __init__(
diff --git a/app/api/utils.py b/app/api/utils.py
index 7316f9f6..39421209 100644
--- a/app/api/utils.py
+++ b/app/api/utils.py
@@ -5,11 +5,13 @@
 import base64
 import contextlib
 import uuid
+import tempfile
 from functools import lru_cache
-from typing import Optional, AsyncGenerator
+from typing import Optional, AsyncGenerator, Dict, Any
 from typing_extensions import Annotated
 from fastapi import FastAPI, Request, APIRouter, Body, Query
 from starlette.responses import JSONResponse, StreamingResponse
+from starlette.types import Receive, Scope, Send
 from starlette.status import (
     HTTP_500_INTERNAL_SERVER_ERROR,
     HTTP_501_NOT_IMPLEMENTED,
@@ -25,19 +27,42 @@
 from slowapi.util import get_remote_address
 from slowapi.errors import RateLimitExceeded
 from fastapi_users.jwt import decode_jwt
+from app import __version__ as app_version
 from app.config import Settings
-from app.domain import TagsGenerative
+from app.domain import TagsGenerative, ModelCard, ModelType
+from app.processors.prompt_factory import PromptFactory
 from app.exception import (
     StartTrainingException,
     AnnotationException,
     ConfigurationException,
     ClientException,
     ExtraDependencyRequiredException,
+    GenerationException,
 )
+from app.utils import get_settings, has_turing_generation_gpu
 
 logger = logging.getLogger("cms")
 
 
+class ForwardedPrefixMiddleware:
+    def __init__(self, app: FastAPI) -> None:
+        self.app = app
+
+    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
+        if scope["type"] == "http":
+            forwarded_prefix = next(
+                (
+                    v.decode("latin-1").strip()
+                    for k, v in scope.get("headers", [])
+                    if k == b"x-forwarded-prefix"
+                ),
+                None,
+            )
+            if forwarded_prefix:
+                scope["root_path"] = "/" + forwarded_prefix.strip("/")
+        await self.app(scope, receive, send)
+
+
 def add_exception_handlers(app: FastAPI) -> None:
     """
     Adds custom exception handlers to the FastAPI app instance.
@@ -157,6 +182,21 @@ async def client_exception_handler(_: Request, exception: ClientException) -> JS
         logger.exception(exception)
         return JSONResponse(status_code=HTTP_400_BAD_REQUEST, content={"message": str(exception)})
 
+    @app.exception_handler(GenerationException)
+    async def generation_exception_handler(_: Request, exception: GenerationException) -> JSONResponse:
+        """
+        Handles generation exceptions.
+
+        Args:
+            _ (Request): The request object.
+            exception (GenerationException): The generation exception.
+
+        Returns:
+            JSONResponse: A JSON response with a 500 status code and an error message.
+        """
+        logger.exception(exception)
+        return JSONResponse(status_code=HTTP_500_INTERNAL_SERVER_ERROR, content={"message": str(exception)})
+
     @app.exception_handler(Exception)
     async def unhandled_exception_handler(_: Request, exception: Exception) -> JSONResponse:
         """
@@ -295,48 +335,68 @@ def decrypt(b64_encoded: str, private_key_pem: str) -> str:
     )
     return decrypted.decode()
 
-async def init_vllm_engine(app: FastAPI,
-                           model_dir_path: str,
-                           model_name: str,
-                           log_level: str = "info") -> FastAPI:
+
+async def init_vllm_engine(
+    app: FastAPI,
+    config: Settings,
+    model_dir_path: str,
+    model_name: str,
+    log_level: str = "info",
+    server_args: Optional[str] = None,
+) -> FastAPI:
     """
     Initialises the vLLM engine.
 
     Args:
         app (FastAPI): The FastAPI app instance.
+        config (Settings): Configuration settings for the model service.
         model_dir_path (str): The path to the directory containing the model.
         model_name (str): The name of the model.
         log_level (str): The log level for the VLLM engine. Defaults to "info".
+        server_args (Optional[str]): The arguments to pass to the vLLM engine.
     """
 
     try:
-        # Import necessary vLLM components
-        from vllm.utils import FlexibleArgumentParser
+        from vllm.utils.argparse_utils import FlexibleArgumentParser
         from vllm.engine.arg_utils import AsyncEngineArgs
         from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
         from vllm.entrypoints.chat_utils import parse_chat_messages, apply_hf_chat_template
         from vllm.entrypoints.openai.api_server import (
             create_chat_completion,
+            create_completion,
             show_available_models,
+            show_version,
             build_async_engine_client_from_engine_args,
             init_app_state,
         )
         from vllm import SamplingParams, TokensPrompt
     except ImportError:
-        logger.error("Cannot import the vLLM engine. Please install it with `pip install '.[llm]'`.")
-        raise ExtraDependencyRequiredException("Cannot import the vLLM engine. Please install it with `pip install '.[llm]'`.")
+        logger.error("Cannot import the vLLM engine. Please install it with `pip install '.[vllm]'`.")
+        raise ExtraDependencyRequiredException("Cannot import the vLLM engine. Please install it with `pip install '.[vllm]'`.")
 
     parser = FlexibleArgumentParser()
     parser = make_arg_parser(parser)
-    args = parser.parse_args([])
+    args = parser.parse_args(server_args.split() if server_args else [])
     validate_parsed_serve_args(args)
 
     args.model = model_dir_path
     args.dtype = "float16"
     args.served_model_name = [model_name]
-    args.max_model_len = 2048 # The default batched length (2048) needs to be higher than max_model_len.
-    # args.tokenizer = model_dir_path # Uncomment if your tokenizer is in a different path or needs explicit setting.
-    args.log_level = log_level
+    args.max_model_len = 2048
+    args.uvicorn_log_level = log_level
+    if hasattr(args, "chat_template") and config.OVERRIDE_CHAT_TEMPLATE:
+        tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".jinja", delete=False)
+        tmp.write(config.OVERRIDE_CHAT_TEMPLATE)
+        tmp.flush()
+        args.chat_template = tmp.name
+    if hasattr(args, "enable_auto_tool_choice"):
+        args.enable_auto_tool_choice = True
+    if hasattr(args, "tool_call_parser"):
+        args.tool_call_parser = "pythonic"
+    if hasattr(args, "return_tokens_as_token_ids"):
+        args.return_tokens_as_token_ids = True
+    if hasattr(args, "default_chat_template_kwargs"):
+        args.default_chat_template_kwargs = {"enable_thinking": False}
 
     exit_stack = contextlib.AsyncExitStack()
     engine = await exit_stack.enter_async_context(
@@ -345,38 +405,61 @@ async def init_vllm_engine(app: FastAPI,
             disable_frontend_multiprocessing=True,
         )
     )
+    app.state._vllm_exit_stack = exit_stack
+    app.state._vllm_engine = engine
 
     tokenizer = await engine.get_tokenizer()
-    vllm_config = await engine.get_vllm_config()    # type: ignore
-    model_config = await engine.get_model_config()  # type: ignore
-
-    await init_app_state(engine, vllm_config, app.state, args)  # type: ignore
+    model_config = getattr(engine, "model_config", None)
+    if model_config is None:
+        vllm_config = getattr(engine, "vllm_config", None)
+        model_config = getattr(vllm_config, "model_config", None)
+    await init_app_state(engine, app.state, args)
+
+    async def get_model_card() -> ModelCard:
+        return ModelCard(
+            model_description=model_name,
+            model_type=ModelType.HUGGINGFACE_LLM,
+            api_version=app_version,
+            model_card=_to_model_card_dict(model_config),
+        )
 
     async def generate_text(
         request: Request,
         prompt: Annotated[str, Body(description="The prompt to be sent to the model", media_type="text/plain")],
         max_tokens: Annotated[int, Query(description="The maximum number of tokens to generate", gt=0)] = 512
     ) -> StreamingResponse:
-        """
-        Custom endpoint for streaming text generation.
-        This endpoint takes a raw text prompt and streams back the generated text.
-        It applies a chat template to the prompt internally for model compatibility.
-        """
         messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
 
         params = SamplingParams(max_tokens=max_tokens)
 
-        conversation, _ = parse_chat_messages(messages, model_config, tokenizer, content_format="string")   # type: ignore
-        prompt_tokens = apply_hf_chat_template( # type: ignore
+        conversation, _, _ = parse_chat_messages(  # type: ignore
+            messages=messages,  # type: ignore[arg-type]
+            model_config=model_config,  # type: ignore[arg-type]
+            content_format="string",    # type: ignore[arg-type]
+        )
+        chat_template: Optional[str] = None
+        if args.chat_template:
+            chat_template = args.chat_template
+        else:
+            if getattr(tokenizer, "chat_template", None):  # type: ignore
+                chat_template = tokenizer.chat_template  # type: ignore
+            elif getattr(tokenizer, "default_chat_template", None):  # type: ignore
+                tokenizer.chat_template = tokenizer.default_chat_template  # type: ignore
+                chat_template = tokenizer.chat_template  # type: ignore
+            else:
+                chat_template = PromptFactory.create_chat_template()
+
+        prompt_text = apply_hf_chat_template(
             tokenizer,
             conversation=conversation,
             tools=None,
             add_generation_prompt=True,
             continue_final_message=False,
-            chat_template="{% for message in messages %}\n{% if message['role'] == 'user' %}\nUser: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}\nAssistant: {{ message['content'] }}\n{% endif %}\n{% endfor %}\nAssistant:",
-            tokenize=True,
+            model_config=model_config,  # type: ignore
+            chat_template=chat_template,
         )
-        prompt_obj = TokensPrompt(prompt_token_ids=prompt_tokens)   # type: ignore
+        prompt_token_ids = tokenizer(prompt_text, add_special_tokens=False).input_ids
+        prompt_obj = TokensPrompt(prompt_token_ids=prompt_token_ids)
 
         async def _stream() -> AsyncGenerator[bytes, None]:
             start = 0
@@ -389,9 +472,12 @@ async def _stream() -> AsyncGenerator[bytes, None]:
 
     router = APIRouter()
     endpoints = [
+        ["/info", get_model_card, ["GET"]],
         ["/generate", generate_text, ["POST"]],
-        ["/chat/completions", create_chat_completion, ["POST"]],
-        ["/models", show_available_models, ["GET"]],
+        ["/v1/chat/completions", create_chat_completion, ["POST"]],
+        ["/v1/completions", create_completion, ["POST"]],
+        ["/v1/models", show_available_models, ["GET"]],
+        ["/v1/version", show_version, ["GET"]],
     ]
 
     for route, endpoint, methods in endpoints:
@@ -405,3 +491,187 @@ async def _stream() -> AsyncGenerator[bytes, None]:
     app.include_router(router)
 
     return app
+
+
+async def init_sglang_engine(
+    app: FastAPI,
+    config: Settings,
+    model_dir_path: str,
+    model_name: str,
+    log_level: str = "info",
+    server_args: Optional[str] = None,
+) -> FastAPI:
+    """
+    Initialises the SGLang engine.
+
+    Args:
+        app (FastAPI): The FastAPI app instance.
+        config (Settings): Configuration settings for the model service.
+        model_dir_path (str): The path to the directory containing the model.
+        model_name (str): The name of the model.
+        log_level (str): The log level for the SGLang engine. Defaults to "info".
+        server_args (Optional[str]): The arguments to pass to the SGLang engine.
+    """
+
+    try:
+        from sglang.srt.entrypoints.engine import (
+            _launch_subprocesses,
+            init_tokenizer_manager,
+            run_detokenizer_process,
+            run_scheduler_process,
+        )
+        from fastapi import Depends
+        from sglang.srt.server_args import prepare_server_args
+        from sglang.srt.entrypoints.http_server import (
+            _GlobalState,
+            set_global_state,
+            generate_request,
+            openai_v1_completions,
+            openai_v1_chat_completions,
+            available_models,
+            validate_json_request,
+        )
+        from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
+        from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
+        from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
+        from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
+        from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
+        from sglang.srt.metrics.func_timer import enable_func_timer
+        from sglang.version import __version__ as sglang_version
+    except ImportError:
+        logger.error("Cannot import the SGLang engine. Please install it with `pip install '.[sglang]'`.")
+        raise ExtraDependencyRequiredException("Cannot import the SGLang engine. Please install it with `pip install '.[sglang]'`.")
+
+    add_api_key_middleware = None
+    add_prometheus_middleware = None
+    try:
+        from sglang.srt.utils import add_api_key_middleware, add_prometheus_middleware # type: ignore
+    except ImportError:
+        logger.warning(
+            "SGLang middleware helpers not available in this version; "
+            "API-key and Prometheus middleware setup will be skipped."
+        )
+
+    server_args = prepare_server_args((server_args.split() if server_args else []) + ["--model-path", model_dir_path])
+    server_args.served_model_name = model_name
+    server_args.log_level = log_level
+    server_args.log_level_http = log_level
+    server_args.tokenizer_worker_num = 1
+    server_args.skip_server_warmup = False
+    server_args.quantization = None # "bitsandbytes"
+    server_args.model_impl = "transformers"
+    server_args.mem_fraction_static = 0.9
+
+    if config.OVERRIDE_CHAT_TEMPLATE:
+        tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".jinja", delete=False)
+        tmp.write(config.OVERRIDE_CHAT_TEMPLATE)
+        tmp.flush()
+        server_args.chat_template = tmp.name
+
+    if has_turing_generation_gpu():
+        server_args.sampling_backend = "pytorch"
+        server_args.attention_backend = None if get_settings().ENABLE_SPDA_ATTN == "true" else "torch_native"
+        server_args.prefill_attention_backend =  None if get_settings().ENABLE_SPDA_ATTN == "true" else "torch_native"
+        server_args.decode_attention_backend =  None if get_settings().ENABLE_SPDA_ATTN == "true" else "torch_native"
+        server_args.disable_cuda_graph = True
+
+    result = _launch_subprocesses(
+        server_args=server_args,
+        init_tokenizer_manager_func=init_tokenizer_manager,
+        run_scheduler_process_func=run_scheduler_process,
+        run_detokenizer_process_func=run_detokenizer_process,
+    )
+
+    if len(result) == 4:
+        tokenizer_manager, template_manager, scheduler_infos, subprocess_watchdog = result
+        if not scheduler_infos:
+            raise ExtraDependencyRequiredException(
+                "SGLang engine started but scheduler_infos is empty; cannot build HTTP global state."
+            )
+        if tokenizer_manager is not None and subprocess_watchdog is not None:
+            tokenizer_manager._subprocess_watchdog = subprocess_watchdog
+    else:
+        raise ExtraDependencyRequiredException(
+            f"Unexpected _launch_subprocesses return length {len(result)}; expected 4."
+        )
+
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_infos[0],
+        )
+    )
+
+    if server_args.api_key and add_api_key_middleware is not None:
+        add_api_key_middleware(app, server_args.api_key)
+    elif server_args.api_key:
+        logger.warning("SGLang API key middleware is unavailable in this version.")
+    if server_args.enable_metrics and add_prometheus_middleware is not None:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+    elif server_args.enable_metrics:
+        logger.warning("SGLang Prometheus middleware is unavailable in this version.")
+
+    app.state.openai_serving_completion = OpenAIServingCompletion(tokenizer_manager, template_manager)
+    app.state.openai_serving_chat = OpenAIServingChat(tokenizer_manager, template_manager)
+    app.state.openai_serving_embedding = OpenAIServingEmbedding(tokenizer_manager, template_manager)
+    app.state.openai_serving_score = OpenAIServingScore(tokenizer_manager)
+    app.state.openai_serving_rerank = OpenAIServingRerank(tokenizer_manager)
+
+    async def get_model_card() -> ModelCard:
+        model_config = getattr(tokenizer_manager, "model_config", None)
+        return ModelCard(
+            model_description=model_name,
+            model_type=ModelType.HUGGINGFACE_LLM,
+            api_version=app_version,
+            model_card=_to_model_card_dict(model_config),
+        )
+
+    async def show_version() -> Dict[str, str]:
+        return {"version": sglang_version}
+
+    router = APIRouter()
+    endpoints = [
+        ["/info", get_model_card, ["GET"], None],
+        ["/generate", generate_request, ["POST"], None],
+        ["/v1/chat/completions", openai_v1_chat_completions, ["POST"], [Depends(validate_json_request)]],
+        ["/v1/completions", openai_v1_completions, ["POST"], [Depends(validate_json_request)]],
+        ["/v1/models", available_models, ["GET"], None],
+        ["/v1/version", show_version, ["GET"], None],
+    ]
+
+    for route, endpoint, methods, dependencies in endpoints:
+        router.add_api_route(
+            path=route,
+            endpoint=endpoint,
+            methods=methods,
+            dependencies=dependencies,
+            include_in_schema=True,
+            tags=[TagsGenerative.Generative.name],
+        )
+    app.include_router(router)
+
+    return app
+
+
+def _to_model_card_dict(model_config: Optional[Any]) -> Dict:
+    """
+    Converts a model config object into a serialisable dict when possible.
+    """
+
+    if model_config is None:
+        return {}
+    if hasattr(model_config, "to_dict"):
+        try:
+            return model_config.to_dict()  # type: ignore[no-any-return]
+        except Exception:
+            logger.exception("Failed to convert model config with to_dict().")
+    if isinstance(model_config, dict):
+        return model_config
+    if hasattr(model_config, "__dict__"):
+        try:
+            return dict(vars(model_config))
+        except Exception:
+            logger.exception("Failed to convert model config using vars().")
+    return {}
diff --git a/app/cli/cli.py b/app/cli/cli.py
index 8862e1f9..5d89121d 100644
--- a/app/cli/cli.py
+++ b/app/cli/cli.py
@@ -44,9 +44,10 @@
     get_stream_server,
     get_generative_server,
     get_vllm_server,
+    get_sglang_server,
     get_app_for_api_docs,
 )   # noqa
-from app.utils import get_settings, send_gelf_message, download_model_package, get_model_data_package_base_name  # noqa
+from app.utils import get_settings, send_gelf_message, download_model_package, quantize_and_save_model  # noqa
 from app.management.model_manager import ModelManager  # noqa
 from app.api.dependencies import ModelServiceDep, ModelManagerDep  # noqa
 from app.management.tracker_client import TrackerClient  # noqa
@@ -71,8 +72,12 @@ def serve_model(
     streamable: bool = typer.Option(False, help="Serve the streamable endpoints only"),
     device: Device = typer.Option(Device.DEFAULT.value, help="The device to serve the model on"),
     llm_engine: Optional[LlmEngine] = typer.Option(LlmEngine.CMS.value, help="The engine to use for text generation"),
+    llm_engine_args: Optional[str] = typer.Option(None, help="The arguments to pass to the LLM engine"),
     load_in_4bit: Optional[bool] = typer.Option(False, help="Load the model in 4-bit precision, used by 'huggingface_llm' models"),
     load_in_8bit: Optional[bool] = typer.Option(False, help="Load the model in 8-bit precision, used by 'huggingface_llm' models"),
+    with_sdpa: Optional[bool] = typer.Option(False, help="Attempt to use SDPA attention for 'huggingface_llm' model loading"),
+    assistant_model_path: Optional[str] = typer.Option("", help="The assistant model package for speculative decoding"),
+    chat_template: Optional[str] = typer.Option("", help="Override the chat template used for prompt formatting"),
     debug: Optional[bool] = typer.Option(None, help="Run in the debug mode"),
 ) -> None:
     """
@@ -89,9 +94,12 @@ def serve_model(
         model_name (Optional[str]): The optional string representation of the model name.
         streamable (bool): Serve the streamable endpoints only. Defaults to False.
         device (Device): The device to serve the model on. Defaults to Device.DEFAULT.
-        llm_engine (LlmEngine): The inference engine to use. Defaults to LlmEngine.CMS.
-        load_in_4bit (bool): Load the model in 4-bit precision, used by 'huggingface_llm' models. Defaults to False.
-        load_in_8bit (bool): Load the model in 8-bit precision, used by 'huggingface_llm' models. Defaults to False.
+        llm_engine (Optional[LlmEngine]): The inference engine to use. Defaults to LlmEngine.CMS.
+        llm_engine_args (Optional[str]): The arguments to pass to the LLM engine.
+        load_in_4bit (Optional[bool]): Load the model in 4-bit precision, used by 'huggingface_llm' models. Defaults to False.
+        load_in_8bit (Optional[bool]): Load the model in 8-bit precision, used by 'huggingface_llm' models. Defaults to False.
+        assistant_model_path (Optional[str]): The assistant model package for speculative decoding.
+        chat_template (Optional[str]): Override the chat template used for prompt formatting.
         debug (Optional[bool]): Run in debug mode if set to True.
     """
 
@@ -101,6 +109,9 @@ def serve_model(
     logger = _get_logger(debug, model_type, model_name)
     config = get_settings()
     config.DEVICE = device
+    config.ENABLE_SPDA_ATTN = "true" if with_sdpa else "false"
+    config.ASSISTANT_MODEL_FULL_PATH = assistant_model_path if assistant_model_path else ""
+    config.OVERRIDE_CHAT_TEMPLATE = chat_template if chat_template else ""
     if model_type in [
         ModelType.HUGGINGFACE_NER,
         ModelType.MEDCAT_DEID,
@@ -139,7 +150,7 @@ def serve_model(
                 logger.warning("Source and destination are the same model package file.")
                 pass
 
-    if llm_engine is not LlmEngine.VLLM:
+    if llm_engine not in [LlmEngine.VLLM, LlmEngine.SGLANG]:
         if model_path:
             model_service = model_service_dep()
             model_service.model_name = model_name
@@ -163,7 +174,16 @@ def serve_model(
                 config,
                 dst_model_path,
                 model_name,
-                log_level="debug" if debug else "info"
+                log_level="debug" if debug else "info",
+                server_args=llm_engine_args,
+            )
+        elif llm_engine == LlmEngine.SGLANG:
+            model_server_app = get_sglang_server(
+                config,
+                dst_model_path,
+                model_name,
+                log_level="debug" if debug else "info",
+                server_args=llm_engine_args,
             )
         else:
             logger.error("Unknown LLM engine: %s" % llm_engine)
@@ -451,9 +471,9 @@ def generate_api_doc_per_model(
     config.ENABLE_PREVIEWS_APIS = "true" if add_previews_apis else "false"
     config.AUTH_USER_ENABLED = "true" if add_user_authentication else "false"
 
-    model_service_dep = ModelServiceDep(model_type, config, model_name or model_type.value)
+    model_service_dep = ModelServiceDep(model_type, config, model_name or model_type)
     cms_globals.model_service_dep = model_service_dep
-    doc_name = f"{model_type.value}_model_apis.json"
+    doc_name = f"{model_name or model_type}_model_apis.json"
 
     if model_type == ModelType.HUGGINGFACE_LLM:
         app = get_generative_server(config)
@@ -476,6 +496,8 @@ def package_model(
     output_model_package: str = typer.Option("", help="The path where the model package will be saved, minus any format-specific extension, e.g., './model_packages/bert-base-cased'"),
     archive_format: ArchiveFormat = typer.Option(ArchiveFormat.ZIP.value, help="The archive format of the model package, e.g., 'zip' or 'gztar'"),
     remove_cached: bool = typer.Option(False, help="Whether to remove the downloaded cache after the model package is saved"),
+    load_in_4bit: bool = typer.Option(False, help="Whether to quantise the model in 4-bit precision"),
+    load_in_8bit: bool = typer.Option(False, help="Whether to quantise the model in 8-bit precision"),
 ) -> None:
     """
     Packages and saves a Hugging Face model into a specified archive format.
@@ -490,6 +512,8 @@ def package_model(
         output_model_package (str): The path where the model package will be saved, minus any format-specific extension, e.g., './model_packages/bert-base-cased'.
         archive_format (ArchiveFormat): The format of the archive for the model package, either 'zip' or 'gztar'. Defaults to 'zip'.
         remove_cached (bool): Whether to remove the downloaded cache after the model package is saved. Defaults to False.
+        load_in_4bit (bool): Whether to quantise the model in 4-bit precision. Defaults to False.
+        load_in_8bit (bool): Whether to quantise the model in 8-bit precision. Defaults to False.
     """
 
     if hf_repo_id == "" and cached_model_dir == "":
@@ -518,6 +542,13 @@ def package_model(
                         local_dir=tmp_dir,
                         local_dir_use_symlinks=False,
                     )
+                if load_in_4bit or load_in_8bit:
+                    download_path = quantize_and_save_model(
+                        hf_model_path=download_path,
+                        output_model_path=None,
+                        load_in_4bit=load_in_4bit,
+                        load_in_8bit=load_in_8bit,
+                    )
                 _make_archive_file(model_package_archive, archive_format.value, download_path)
         finally:
             if remove_cached and download_path:
@@ -600,7 +631,7 @@ def run_mcp_server(
     cms_base_url: str = typer.Option("http://127.0.0.1:8000", help="The base URL of the CMS API"),
     cms_api_key: str = typer.Option("Bearer", help="The API key for authenticating with the CMS API"),
     cms_mcp_api_keys: str = typer.Option("", help="Comma-separated API keys for authenticating CMS MCP clients"),
-    cms_mcp_oauth_enabled: Optional[bool] = typer.Option(None, help="Whether to enable OAuth2 authentication for MCP clients"),
+    cms_mcp_oauth_provider: str = typer.Option("", help="The OAuth2 provider to use ('github' or 'google')"),
     github_client_id: str = typer.Option("", help="The GitHub OAuth2 client ID"),
     github_client_secret: str = typer.Option("", help="The GitHub OAuth2 client secret"),
     google_client_id: str = typer.Option("", help="The Google OAuth2 client ID"),
@@ -620,11 +651,11 @@ def run_mcp_server(
         cms_base_url (str): The base URL of the CMS API endpoint. Defaults to "http://localhost:8000".
         cms_api_key (str): The API key for authenticating with the CMS API. Defaults to "Bearer".
         cms_mcp_api_keys (str): Comma-separated API keys for authenticating CMS MCP clients. Defaults to "".
-        cms_mcp_oauth_enabled (Optional[bool]): Whether to enable OAuth2 authentication for MCP clients. Defaults to None.
-        github_client_id (str): The GitHub OAuth2 client ID, required if cms_mcp_oauth_enabled is True. Defaults to "".
-        github_client_secret (str): The GitHub OAuth2 client secret, required if cms_mcp_oauth_enabled is True. Defaults to an "".
-        google_client_id (str): The Google OAuth2 client ID, required if cms_mcp_oauth_enabled is True. Defaults to an "".
-        google_client_secret (str): The Google OAuth2 client secret, required if cms_mcp_oauth_enabled is True. Defaults to an "".
+        cms_mcp_oauth_provider (str): The OAuth2 provider to use ('github' or 'google'). Defaults to "".
+        github_client_id (str): The GitHub OAuth2 client ID, required if cms_mcp_oauth_provider is set. Defaults to "".
+        github_client_secret (str): The GitHub OAuth2 client secret, required if cms_mcp_oauth_provider is set. Defaults to "".
+        google_client_id (str): The Google OAuth2 client ID, required if cms_mcp_oauth_provider is set. Defaults to "".
+        google_client_secret (str): The Google OAuth2 client secret, required if cms_mcp_oauth_provider is set. Defaults to "".
         debug (Optional[bool]): Run in debug mode if set to True.
     """
 
@@ -637,7 +668,7 @@ def run_mcp_server(
     os.environ["CMS_MCP_TRANSPORT"] = transport.lower()
     os.environ["CMS_API_KEY"] = cms_api_key
     os.environ["CMS_MCP_API_KEYS"] = cms_mcp_api_keys
-    os.environ["CMS_MCP_OAUTH_ENABLED"] = "true" if cms_mcp_oauth_enabled else "false"
+    os.environ["CMS_MCP_OAUTH_PROVIDER"] = cms_mcp_oauth_provider.lower()
     os.environ["GITHUB_CLIENT_ID"] = github_client_id
     os.environ["GITHUB_CLIENT_SECRET"] = github_client_secret
     os.environ["GOOGLE_CLIENT_ID"] = google_client_id
@@ -839,11 +870,11 @@ def _display_info_table(
     info_table.add_column(style="cyan", justify="left")
     info_table.add_column(style="dim", justify="left")
 
-    info_table.add_row("🤖", "Model Name:", model_name or "CMS model")
-    info_table.add_row("📦", "Model Type:", display_model_type)
-    info_table.add_row("📂", "Model Path:", model_path or mlflow_model_uri)
-    info_table.add_row("🔗", "Base URL:", server_url)
-    info_table.add_row("📚", "Docs:", f"{server_url}/docs")
+    info_table.add_row("🤖 ", "Model Name:", model_name or "CMS model")
+    info_table.add_row("📦 ", "Model Type:", display_model_type)
+    info_table.add_row("📂 ", "Model Path:", model_path or mlflow_model_uri)
+    info_table.add_row("🔗 ", "Base URL:", server_url)
+    info_table.add_row("📚 ", "Docs:", f"{server_url}/docs")
 
     panel_content = Group(
         Align.center(title_text),
@@ -906,7 +937,10 @@ def _get_logger(
         get_settings().DEBUG = "true" if debug else "false"
     if get_settings().DEBUG != "true":
         logging.getLogger().setLevel(logging.INFO)
+    else:
+        logging.getLogger().setLevel(logging.DEBUG)
     logger = logging.getLogger("cms")
+    logger.setLevel(logging.DEBUG if get_settings().DEBUG == "true" else logging.INFO)
 
     lrf = logging.getLogRecordFactory()
 
diff --git a/app/config.py b/app/config.py
index abd4e0cb..df020671 100644
--- a/app/config.py
+++ b/app/config.py
@@ -12,6 +12,7 @@ class Settings(BaseSettings):   # type: ignore
     DEVICE: str = "default"                           # the device literal, either "default", "cpu[:X]", "cuda[:X]" or "mps[:X]"
     INCLUDE_SPAN_TEXT: str = "false"                  # if "true", include the text of the entity in the NER output
     CONCAT_SIMILAR_ENTITIES: str = "true"             # if "true", merge adjacent entities of the same type into one span
+    CONFIDENCE_SCORE_THRESHOLD: float = 0.0            # the confidence score threshold for the NER output, between 0.0 and 1.0
     ENABLE_TRAINING_APIS: str = "false"               # if "true", enable the APIs for model training
     DISABLE_UNSUPERVISED_TRAINING: str = "false"      # if "true", disable the API for unsupervised training
     DISABLE_METACAT_TRAINING: str = "true"            # if "true", disable the API for metacat training
@@ -23,6 +24,8 @@ class Settings(BaseSettings):   # type: ignore
     SKIP_SAVE_TRAINING_DATASET: str = "true"          # if "true", the dataset used for training won't be saved
     PROCESS_RATE_LIMIT: str = "180/minute"            # the rate limit on the /process route
     PROCESS_BULK_RATE_LIMIT: str = "90/minute"        # the rate limit on the /process_bulk route
+    GENERATION_RATE_LIMIT: str = "10/minute"          # the rate limit on the text generation routes
+    GENERATION_TIMEOUT_SECONDS: int = 180             # the timeout in seconds on the text generation requests
     WS_IDLE_TIMEOUT_SECONDS: int = 60                 # the timeout in seconds on the WebSocket connection being idle
     TYPE_UNIQUE_ID_WHITELIST: str = ""                # the comma-separated TUIs used for filtering and if set to "", all TUIs are whitelisted
     AUTH_USER_ENABLED: str = "false"                  # if "true", enable user authentication on API access
@@ -34,11 +37,17 @@ class Settings(BaseSettings):   # type: ignore
     TRAINING_METRICS_LOGGING_INTERVAL: int = 5        # the number of steps after which training metrics will be collected
     TRAINING_SAFE_MODEL_SERIALISATION: str = "false"  # if "true", serialise the trained model using safe tensors
     TRAINING_CACHE_DIR: str = os.path.join(os.path.abspath(os.path.dirname(__file__)), "cms_cache")           # the directory to cache the intermediate files created during training
-    TRAINING_HF_TAGGING_SCHEME: str = "flat"          # the tagging scheme during the Hugging Face NER model training, either "flat", "iob" or "iobes"
-    HF_PIPELINE_AGGREGATION_STRATEGY: str = "simple"  # the strategy used for aggregating the predictions of the Hugging Face NER model
+    TRAINING_HF_NER_TAGGING_SCHEME: str = "flat"      # the tagging scheme during the Hugging Face NER model training, either "flat", "iob" or "iobes"
+    TRAINING_HF_NER_FROZEN_PARAM_NAMES: str = ""      # the comma-separated parameter names to freeze; supports special value "except_classifier" which freezes all parameters except the classification head
+    TRAINING_HF_NER_ENABLE_LORA: str = "false"        # if "true", wrap the supervised HuggingFace NER model with a LoRA adapter during training
+    HF_NER_AGGREGATION_STRATEGY: str = "simple"       # the strategy used for aggregating the predictions of the Hugging Face NER model
+    HF_NER_APPLY_VITERBI_DECODING: str = "false"      # if "true", apply Viterbi decoding for the Hugging Face NER model with "iobes" tagging scheme
+    HF_NER_BATCH_SIZE: int = 4                        # the batch size used for bulk processing of the Hugging Face NER model
     LOG_PER_CONCEPT_ACCURACIES: str = "false"         # if "true", per-concept accuracies will be exposed to the metrics scrapper. Switch this on with caution due to the potentially high number of concepts
     MEDCAT2_MAPPED_ONTOLOGIES: str = ""               # the comma-separated names of ontologies for MedCAT2 to map to
     ENABLE_SPDA_ATTN: str = "true"                    # if "true", attempt to use SPDA attention for HuggingFace LLM loading
+    ASSISTANT_MODEL_FULL_PATH: str = ""               # the full path to the assistant model package for speculative decoding
+    OVERRIDE_CHAT_TEMPLATE: str = ""                  # if set, override the chat template used for prompt formatting
     DEBUG: str = "false"                              # if "true", the debug mode is switched on
 
     class Config:
diff --git a/app/domain.py b/app/domain.py
index 7a03d1c5..7b9c1413 100644
--- a/app/domain.py
+++ b/app/domain.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict, Any, Union, Literal
 
 from fastapi import HTTPException
 from starlette.status import HTTP_400_BAD_REQUEST
@@ -28,6 +28,7 @@ class Tags(str, Enum):
     Authentication = "Authenticate registered users"
     Generative = "Generate text based on the input prompt"
     OpenAICompatible = "Operations compatible with OpenAI APIs"
+    OllamaCompatible = "Operations compatible with Ollama APIs"
 
 
 class TagsStreamable(str, Enum):
@@ -113,8 +114,9 @@ class TrackerBackend(Enum):
 
 
 class LlmEngine(Enum):
-    CMS = "CMS"
-    VLLM = "vLLM"
+    CMS = "cms"
+    VLLM = "vllm"
+    SGLANG = "sglang"
 
 
 class LlmRole(Enum):
@@ -160,7 +162,7 @@ class TextWithAnnotations(BaseModel):
 
 class TextWithPublicKey(BaseModel):
     text: str = Field(description="The plain text to be sent to the model for NER and redaction")
-    public_key_pem: str = Field(description="the public PEM key used for encrypting detected spans")
+    public_key_pem: str = Field(description="The public PEM key used for encrypting detected spans")
 
 
 class TextStreamItem(BaseModel):
@@ -211,9 +213,58 @@ class PromptMessage(BaseModel):
     content: str = Field(description="The actual text of the message")
 
 
+class GenerationResult(BaseModel):
+    text: str = Field(..., description="The generated text content")
+    prompt_token_num: int = Field(..., description="The number of tokens in the prompt text")
+    completion_token_num: int = Field(..., description="The number of tokens in the generated text")
+    ttft_ms: int = Field(default=-1, description="Time to first token in milliseconds")
+    tpot_ms: int = Field(default=-1, description="Average time per output token in milliseconds")
+
+
+class OpenAIStreamOptions(BaseModel):
+    include_usage: Optional[bool] = Field(
+        default=False, description="Whether to include token usage in stream response"
+    )
+
+    class Config:
+        extra = "allow"
+
+
+class OpenAIJsonSchemaWrapper(BaseModel):
+    name: Optional[str] = Field(default=None, description="The optional schema name")
+    schema_: Dict[str, Any] = Field(..., alias="schema", description="The actual JSON schema definition")
+
+    class Config:
+        allow_population_by_field_name = True
+
+
+class OpenAIResponseFormat(BaseModel):
+    type: Literal["json_schema"] = Field(..., description="The response format type")
+    json_schema: OpenAIJsonSchemaWrapper = Field(..., description="The JSON schema wrapper")
+
+
+class OpenAIFunctionTool(BaseModel):
+    name: str = Field(..., description="The name of the function tool")
+    description: Optional[str] = Field(default=None, description="The description of the function tool")
+    parameters: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="The JSON schema for the function parameters",
+    )
+
+
+class OpenAITool(BaseModel):
+    type: str = Field(default="function", description="The type of the tool")
+    function: OpenAIFunctionTool = Field(..., description="The function tool definition")
+
+
 class OpenAIChatCompletionsRequest(BaseModel):
     messages: List[PromptMessage] = Field(..., description="A list of messages to be sent to the model")
-    stream: bool = Field(..., description="Whether to stream the response")
+    tools: Optional[List[OpenAITool]] = Field(default=None, description="A list of tools available to the model")
+    stream: Optional[bool] = Field(default=False, description="Whether to stream the response")
+    stream_options: Optional[OpenAIStreamOptions] = Field(
+        default=None,
+        description="The extra options for streaming when it's turned on",
+    )
     max_tokens: int = Field(512, description="The maximum number of tokens to generate", gt=0)
     model: str = Field(..., description="The name of the model used for generating the completion")
     temperature: float = Field(0.7, description="The temperature of the generated text", ge=0.0, le=1.0)
@@ -222,6 +273,10 @@ class OpenAIChatCompletionsRequest(BaseModel):
         default=None,
         description="The single sequence or the list of sequences used to stop the generation",
     )
+    response_format: Optional[OpenAIResponseFormat] = Field(
+        default=None,
+        description="The optional response format configuration for structured outputs",
+    )
 
 
 class OpenAIChatCompletionsResponse(BaseModel):
@@ -237,8 +292,12 @@ class OpenAIChatCompletionsResponse(BaseModel):
 
 
 class OpenAICompletionsRequest(BaseModel):
-    prompt: Union[str, List[str]] = Field(..., description="Prompt text or list of prompts")
-    stream: bool = Field(False, description="Whether to stream the response")
+    prompt: Union[str, List[str]] = Field(..., description="The prompt text or list of prompts")
+    stream: Optional[bool] = Field(default=False, description="Whether to stream the response")
+    stream_options: Optional[OpenAIStreamOptions] = Field(
+        default=None,
+        description="The extra options for streaming when it's turned on",
+    )
     max_tokens: int = Field(512, description="The maximum number of tokens to generate", gt=0)
     model: str = Field(..., description="The name of the model used for generating the completion")
     temperature: float = Field(0.7, description="The temperature of the generated text", ge=0.0, le=1.0)
@@ -262,11 +321,56 @@ class OpenAICompletionsResponse(BaseModel):
 
 
 class OpenAIEmbeddingsRequest(BaseModel):
-    input: Union[str, List[str]] = Field(..., description="Input text or list of texts to embed")
+    input: Union[str, List[str]] = Field(..., description="The input text or list of texts to embed")
     model: str = Field(..., description="The name of the model used for creating the embeddings")
 
 
 class OpenAIEmbeddingsResponse(BaseModel):
     object: str = Field(..., description="The type of the response")
-    data: List[Dict[str, Any]] = Field(..., description="List of embedding objects")
+    data: List[Dict[str, Any]] = Field(..., description="The list of embedding objects")
     model: str = Field(..., description="The name of the model used for creating the embeddings")
+
+
+class OllamaMessage(BaseModel):
+    role: str = Field(..., description="The message role")
+    content: str = Field(..., description="The message content")
+
+
+class OllamaRequestOptions(BaseModel):
+    num_predict: int = Field(512, description="The maximum number of tokens to generate", gt=0)
+    temperature: Optional[float] = Field(default=0.7, description="The sampling temperature")
+    top_p: Optional[float] = Field(default=0.9, description="The nucleus sampling top_p")
+    stop: Optional[Union[str, List[str]]] = Field(
+        default=None,
+        description="The single sequence or list of sequences used to stop generation",
+    )
+
+class OllamaGenerateRequest(BaseModel):
+    model: str = Field(..., description="The model name")
+    prompt: str = Field(..., description="The prompt text")
+    stream: Optional[bool] = Field(default=False, description="Whether to stream the response")
+    system: Optional[str] = Field(default=None, description="The system prompt")
+    suffix: Optional[str] = Field(default=None, description="The suffix text after model response")
+    format: Optional[Dict[str, Any]] = Field(default=None, description="The response format")
+    keep_alive: Optional[Union[str, int]] = Field(default=None, description="The model keep-alive duration")
+    options: Optional[OllamaRequestOptions] = Field(default=None, description="The Ollama options")
+
+
+class OllamaChatRequest(BaseModel):
+    model: str = Field(..., description="The model name")
+    messages: List[OllamaMessage] = Field(..., description="The conversation messages")
+    stream: Optional[bool] = Field(default=False, description="Whether to stream the response")
+    format: Optional[Dict[str, Any]] = Field(default=None, description="The response format")
+    keep_alive: Optional[Union[str, int]] = Field(default=None, description="The model keep-alive duration")
+    options: Optional[OllamaRequestOptions] = Field(default=None, description="The Ollama options")
+
+
+class OllamaShowRequest(BaseModel):
+    model: str = Field(..., description="The model name")
+    verbose: Optional[bool] = Field(default=False, description="Whether to return verbose model info")
+
+
+class OllamaEmbedRequest(BaseModel):
+    model: str = Field(..., description="The model name")
+    input: Union[str, List[str]] = Field(..., description="The input text or list of texts to embed")
+    keep_alive: Optional[Union[str, int]] = Field(default=None, description="The model keep-alive duration")
diff --git a/app/envs/.env b/app/envs/.env
index a04b51cf..3f9db16f 100644
--- a/app/envs/.env
+++ b/app/envs/.env
@@ -10,6 +10,9 @@ INCLUDE_SPAN_TEXT=false
 # If "true", merge adjacent entities of the same type into one span
 CONCAT_SIMILAR_ENTITIES=true
 
+# The confidence score threshold for the NER output, between 0.0 and 1.0
+CONFIDENCE_SCORE_THRESHOLD=0.0
+
 # If "true", enable the APIs for model training
 ENABLE_TRAINING_APIS=true
 
@@ -49,6 +52,12 @@ PROCESS_RATE_LIMIT=180/minute
 # The rate limit on the /process_bulk* route
 PROCESS_BULK_RATE_LIMIT=90/minute
 
+# The rate limit on the text generation routes
+GENERATION_RATE_LIMIT=10/minute
+
+# The timeout in seconds on the text generation requests
+GENERATION_TIMEOUT_SECONDS=180
+
 # If "true", enable user authentication on API access
 AUTH_USER_ENABLED=false
 
@@ -68,13 +77,26 @@ SYSTEM_METRICS_LOGGING_INTERVAL_SECONDS=30
 TRAINING_CONCEPT_ID_WHITELIST=
 
 # If "true", serialise the trained model using safe tensors
-TRAINING_SAFE_MODEL_SERIALISATION=false
+TRAINING_SAFE_MODEL_SERIALISATION=true
 
 # The strategy used for aggregating the predictions of the Hugging Face NER model
-HF_PIPELINE_AGGREGATION_STRATEGY=simple
+HF_NER_AGGREGATION_STRATEGY=simple
+
+# If "true", apply Viterbi decoding for the Hugging Face NER model with "iobes" tagging scheme
+HF_NER_APPLY_VITERBI_DECODING=false
+
+# The batch size used for bulk processing of the Hugging Face NER model
+HF_NER_BATCH_SIZE=4
 
 # The tagging scheme during the Hugging Face NER model training, either "flat", "iob" or "iobes"
-TRAINING_HF_TAGGING_SCHEME=flat
+TRAINING_HF_NER_TAGGING_SCHEME=flat
+
+# The comma-separated names of parameters to freeze during the Hugging Face NER model training
+# This supports special value "except_classifier" which freezes all parameters except the classification head
+TRAINING_HF_NER_FROZEN_PARAM_NAMES=
+
+# If "true", wrap the supervised HuggingFace NER model with a LoRA adapter during training
+TRAINING_HF_NER_ENABLE_LORA=false
 
 # The comma-separated names of ontologies for MedCAT2 to map to
 MEDCAT2_MAPPED_ONTOLOGIES=opcs4,icd10
@@ -82,5 +104,11 @@ MEDCAT2_MAPPED_ONTOLOGIES=opcs4,icd10
 # If "true", attempt to use SPDA attention for Hugging Face LLM loading
 ENABLE_SPDA_ATTN=true
 
+# The full path to the assistant model package for speculative decoding
+ASSISTANT_MODEL_FULL_PATH=
+
+# If set, override the chat template used for prompt formatting
+OVERRIDE_CHAT_TEMPLATE=
+
 # If "true", the debug mode is switched on
 DEBUG=false
diff --git a/app/exception.py b/app/exception.py
index ddba71b7..4685bcc7 100644
--- a/app/exception.py
+++ b/app/exception.py
@@ -36,3 +36,7 @@ class DeviceNotAvailableError(RuntimeError):
 
 class ExtraDependencyRequiredException(Exception):
     """An exception raised when an extra dependency is required but not found."""
+
+
+class GenerationException(Exception):
+    """An exception raised due to generation errors."""
diff --git a/app/management/model_manager.py b/app/management/model_manager.py
index 90e8b724..b84a5ac4 100644
--- a/app/management/model_manager.py
+++ b/app/management/model_manager.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from typing import Type, Optional, Dict, Any, List, Iterator, final, Union
 from pandas import DataFrame
-from mlflow.pyfunc import PythonModel, PythonModelContext
+from mlflow.pyfunc import PythonModel, PythonModelContext   # type: ignore
 from mlflow.models.signature import ModelSignature
 from mlflow.types import DataType, Schema, ColSpec
 from app.model_services.base import AbstractModelService
@@ -28,20 +28,20 @@ class ModelManager(PythonModel):
     """
 
     input_schema = Schema([
-        ColSpec(DataType.string, "name", optional=True),
-        ColSpec(DataType.string, "text"),
+        ColSpec(DataType.string, "name", required=False),
+        ColSpec(DataType.string, "text", required=True),
     ])
 
     output_schema = Schema([
-        ColSpec(DataType.string, "doc_name"),
-        ColSpec(DataType.integer, "start"),
-        ColSpec(DataType.integer, "end"),
-        ColSpec(DataType.string, "label_name"),
-        ColSpec(DataType.string, "label_id"),
-        ColSpec(DataType.string, "categories", optional=True),
-        ColSpec(DataType.float, "accuracy", optional=True),
-        ColSpec(DataType.string, "text", optional=True),
-        ColSpec(DataType.string, "meta_anns", optional=True)
+        ColSpec(DataType.string, "doc_name", required=True),
+        ColSpec(DataType.integer, "start", required=True),
+        ColSpec(DataType.integer, "end", required=True),
+        ColSpec(DataType.string, "label_name", required=True),
+        ColSpec(DataType.string, "label_id", required=True),
+        ColSpec(DataType.string, "categories", required=False),
+        ColSpec(DataType.float, "accuracy", required=False),
+        ColSpec(DataType.string, "text", required=False),
+        ColSpec(DataType.string, "meta_anns", required=False),
     ])
 
     def __init__(self, model_service_type: Type, config: Settings) -> None:
@@ -58,7 +58,6 @@ def __init__(self, model_service_type: Type, config: Settings) -> None:
         self._model_signature = ModelSignature(
             inputs=ModelManager.input_schema,
             outputs=ModelManager.output_schema,
-            params=None,
         )
 
     @property
@@ -114,6 +113,7 @@ def retrieve_model_service_from_uri(
         """
 
         model_manager = ModelManager.retrieve_python_model_from_uri(mlflow_model_uri, config)
+        assert hasattr(model_manager, "model_service"), "Model manager has no model service initialised."
         model_service = model_manager.model_service
         config.BASE_MODEL_FULL_PATH = mlflow_model_uri
         model_service._config = config
@@ -195,6 +195,19 @@ def get_pip_requirements_from_file() -> Union[List[str], str]:
         else:
             raise ManagedModelException("Cannot find pip requirements.")
 
+    def __getstate__(self) -> dict:
+        return {
+            "_model_service_type": self._model_service_type,
+            "_config": self._config,
+            "_model_signature": self._model_signature,
+        }
+
+    def __setstate__(self, state: dict) -> None:
+        self._model_service_type = state["_model_service_type"]
+        self._config = state["_config"]
+        self._model_signature = state["_model_signature"]
+        self._model_service = None
+
     def save_model(self, local_dir: str, model_path: str) -> None:
         """
         Saves the model with the specified path into a local directory.
diff --git a/app/management/prometheus_metrics.py b/app/management/prometheus_metrics.py
index 78c5698d..631c47f1 100644
--- a/app/management/prometheus_metrics.py
+++ b/app/management/prometheus_metrics.py
@@ -55,3 +55,17 @@
     "Number of tokens used in the prompt and the completion",
     ["handler"],
 )
+
+# The histogram metric to track Time To First Token (TTFT) in milliseconds
+cms_ttft_milliseconds = Histogram(
+    "cms_ttft_milliseconds",
+    "Time to first generated token in milliseconds",
+    ["handler"],
+)
+
+# The histogram metric to track Time Per Output Token (TPOT) in milliseconds
+cms_tpot_milliseconds = Histogram(
+    "cms_tpot_milliseconds",
+    "Average time per output token in milliseconds",
+    ["handler"],
+)
diff --git a/app/management/tracker_client.py b/app/management/tracker_client.py
index a63a18c4..62575eb7 100644
--- a/app/management/tracker_client.py
+++ b/app/management/tracker_client.py
@@ -70,12 +70,17 @@ def start_tracking(
         """
         experiment_name = TrackerClient.get_experiment_name(model_name, training_type)
         experiment_id = TrackerClient._get_experiment_id(experiment_name)
+
+        if mlflow.active_run() is not None:
+            logger.warning("Detected an active run that did not end properly, marking it as FAILED before starting a new one.")
+            mlflow.end_run(RunStatus.to_string(RunStatus.FAILED))
+
         try:
             active_run = mlflow.start_run(
                 experiment_id=experiment_id,
+                run_name=run_name,
                 tags={
                     MLFLOW_SOURCE_NAME: socket.gethostname(),
-                    "mlflow.runName": run_name,
                     "mlflow.note.content": description or "",
                     "training.input_data.filename": input_file_name,
                     "training.base_model.origin": base_model_original,
@@ -335,6 +340,17 @@ def log_document_size(num_of_docs: int) -> None:
 
         mlflow.set_tag("training.document.size", str(num_of_docs))
 
+    @staticmethod
+    def log_training_token_count(token_count: int) -> None:
+        """
+        Logs the total number of tokens in the training dataset as a tag.
+
+        Args:
+            token_count (int): The total number of tokens used for training.
+        """
+
+        mlflow.set_tag("training.token.count", str(token_count))
+
     @staticmethod
     def log_model_config(config: Dict[str, str]) -> None:
         """
diff --git a/app/mcp/oauth/oauth.py b/app/mcp/oauth/oauth.py
index ebd66ca9..ad7bed8a 100644
--- a/app/mcp/oauth/oauth.py
+++ b/app/mcp/oauth/oauth.py
@@ -4,7 +4,7 @@
 from typing import Optional, Dict
 from dataclasses import dataclass
 from datetime import datetime, timedelta
-from starlette.responses import HTMLResponse, RedirectResponse, Response
+from starlette.responses import HTMLResponse, RedirectResponse, Response, JSONResponse
 from starlette.templating import Jinja2Templates
 from starlette.requests import Request
 from starlette.routing import Route
@@ -270,6 +270,81 @@ async def get_valid_token(self, session_id: str) -> Optional[OAuthToken]:
         return token if not token.is_expired() else None
 
     def create_oauth_routes(self) -> list:
+        async def well_known_index(request: Request) -> Response:
+            base_url = str(request.base_url).rstrip("/")
+            return JSONResponse(
+                content={
+                    "oauth_authorization_server": f"{base_url}/.well-known/oauth-authorization-server",
+                    "oauth_protected_resource": f"{base_url}/.well-known/oauth-protected-resource",
+                    "openid_configuration": f"{base_url}/.well-known/openid-configuration",
+                }
+            )
+
+        async def well_known_index_options(request: Request) -> Response:
+            return Response(status_code=204)
+
+        async def oauth_authorize_root(request: Request) -> Response:
+            provider = request.query_params.get("provider")
+            if not provider:
+                provider = os.getenv("CMS_MCP_OAUTH_PROVIDER", "").strip().lower()
+            if not provider:
+                return HTMLResponse(content="<h1>Missing provider</h1>", status_code=400)
+            request.path_params["provider"] = provider
+            return await oauth_authorize(request)
+
+        async def oauth_authorization_metadata(request: Request) -> Response:
+            base_url = str(request.base_url).rstrip("/")
+            metadata = {
+                "issuer": base_url,
+                "authorization_endpoint": f"{base_url}/authorize",
+                "token_endpoint": f"{base_url}/token",
+                "response_types_supported": ["code"],
+                "grant_types_supported": ["authorization_code", "refresh_token"],
+            }
+            return JSONResponse(content=metadata)
+
+        async def oauth_authorization_options(request: Request) -> Response:
+            return Response(status_code=204)
+
+        async def oauth_protected_resource_metadata(request: Request) -> Response:
+            base_url = str(request.base_url).rstrip("/")
+            metadata = {
+                "resource": base_url,
+                "authorization_servers": [base_url],
+            }
+            return JSONResponse(content=metadata)
+
+        async def oauth_protected_resource_options(request: Request) -> Response:
+            return Response(status_code=204)
+
+        async def oauth_protected_resource_sse(request: Request) -> Response:
+            base_url = str(request.base_url).rstrip("/")
+            metadata = {
+                "resource": f"{base_url}/sse",
+                "authorization_servers": [base_url],
+            }
+            return JSONResponse(content=metadata)
+
+        async def oauth_protected_resource_sse_options(request: Request) -> Response:
+            return Response(status_code=204)
+
+        async def openid_configuration(request: Request) -> Response:
+            base_url = str(request.base_url).rstrip("/")
+            metadata = {
+                "issuer": base_url,
+                "authorization_endpoint": f"{base_url}/authorize",
+                "token_endpoint": f"{base_url}/token",
+                "response_types_supported": ["code"],
+                "grant_types_supported": ["authorization_code", "refresh_token"],
+            }
+            return JSONResponse(content=metadata)
+
+        async def openid_configuration_options(request: Request) -> Response:
+            return Response(status_code=204)
+
+        async def oauth_register_options(request: Request) -> Response:
+            return Response(status_code=204)
+
         async def oauth_login(request: Request) -> Response:
             return templates.TemplateResponse("login.html", {"request": request})
 
@@ -396,7 +471,23 @@ async def oauth_logout(request: Request) -> Response:
             return response
 
         return [
+            Route("/.well-known", well_known_index),
+            Route("/.well-known/", well_known_index),
+            Route("/.well-known", well_known_index_options, methods=["OPTIONS"]),
+            Route("/.well-known/", well_known_index_options, methods=["OPTIONS"]),
+            Route("/.well-known/oauth-authorization-server", oauth_authorization_metadata),
+            Route("/.well-known/oauth-authorization-server", oauth_authorization_options, methods=["OPTIONS"]),
+            Route("/.well-known/oauth-protected-resource", oauth_protected_resource_metadata),
+            Route("/.well-known/oauth-protected-resource", oauth_protected_resource_options, methods=["OPTIONS"]),
+            Route("/.well-known/oauth-protected-resource/sse", oauth_protected_resource_sse),
+            Route("/.well-known/oauth-protected-resource/sse", oauth_protected_resource_sse_options, methods=["OPTIONS"]),
+            Route("/.well-known/openid-configuration", openid_configuration),
+            Route("/.well-known/openid-configuration", openid_configuration_options, methods=["OPTIONS"]),
+            Route("/register", oauth_register_options, methods=["OPTIONS"]),
+            Route("/oauth/register", oauth_register_options, methods=["OPTIONS"]),
+            Route("/authorize", oauth_authorize_root),
             Route("/oauth/login", oauth_login),
+            Route("/oauth/authorize", oauth_authorize_root),
             Route("/oauth/authorize/{provider}", oauth_authorize),
             Route("/oauth/callback/{provider}", oauth_callback),
             Route("/oauth/status", oauth_status),
diff --git a/app/mcp/server.py b/app/mcp/server.py
index fa1b752a..0781faf1 100644
--- a/app/mcp/server.py
+++ b/app/mcp/server.py
@@ -118,13 +118,20 @@ def create_server() -> Starlette:
 
     routes = []
     middleware = []
-    oauth_enabled = os.environ.get("CMS_MCP_OAUTH_ENABLED", "false").lower() == "true"
+    oauth_enabled = os.environ.get("CMS_MCP_OAUTH_PROVIDER", "") != ""
 
     if oauth_enabled:
         try:
             base_url = f"http://{host}:{port}"
             oauth_manager = OAuthManager(base_url)
             oauth_routes = oauth_manager.create_oauth_routes()
+            for route in oauth_routes:
+                route_path = getattr(route, "path", str(route))
+                route_methods = getattr(route, "methods", None)
+                if route_methods:
+                    logger.debug(f"Mounted OAuth route: {route_path} [{', '.join(sorted(route_methods))}]")
+                else:
+                    logger.debug(f"Mounted OAuth route: {route_path}")
             routes.extend(oauth_routes)
 
             middleware.append(
@@ -132,11 +139,10 @@ def create_server() -> Starlette:
                     OAuthMiddleware,
                     oauth_manager=oauth_manager,
                     public_paths=[
+                        "/authorize",
+                        "/favicon.ico",
                         "/oauth/",
-                        "/docs",
-                        "/openapi.json",
-                        "/redoc",
-                        "/health",
+                        "/.well-known",
                         "/.well-known/",
                     ]
                 )
@@ -174,8 +180,11 @@ def main() -> None:
 
     app = create_server()
 
-    if os.environ.get("CMS_MCP_OAUTH_ENABLED", "false").lower() == "true" and os.environ.get("CMS_MCP_TRANSPORT") != TransportType.STREAMABLE_HTTP.value:
-        logger.info(f"OAuth login: http://{host}:{port}/oauth/login")
+    if all([
+        os.environ.get("CMS_MCP_OAUTH_PROVIDER", "") != "",
+        os.environ.get("CMS_MCP_TRANSPORT") != TransportType.STREAMABLE_HTTP.value
+    ]):
+        logger.info(f"Please log in via OAuth: http://{host}:{port}/oauth/login")
 
     uvicorn.run(
         app,
diff --git a/app/mcp/utils.py b/app/mcp/utils.py
index 84b5c23a..d8684159 100644
--- a/app/mcp/utils.py
+++ b/app/mcp/utils.py
@@ -15,7 +15,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Union[Any, Awaitable[Any]]:
             )
         else:
             api_key = None
-        api_keys_env = os.environ.get("CMS_MCP_API_KEYS")
+        api_keys_env = os.environ.get("CMS_MCP_API_KEYS")   # NOTE: keys are read from an env var here; use Redis or a database in production
 
         if not api_keys_env:    # No API-key-based authentication required
             return tool_fn(*args, **kwargs)
diff --git a/app/model_services/base.py b/app/model_services/base.py
index a7b6323d..cb897b32 100644
--- a/app/model_services/base.py
+++ b/app/model_services/base.py
@@ -2,7 +2,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, List, Iterable, Tuple, final, Optional, Generic, TypeVar, Protocol, AsyncIterable, Union
 from app.config import Settings
-from app.domain import ModelCard, Annotation
+from app.domain import ModelCard, Annotation, GenerationResult
 
 class _TrainerCommon(Protocol):
     """A protocol for defining the common properties and methods that trainers should implement."""
@@ -168,7 +168,7 @@ def init_model(self, *args: Any, **kwargs: Any) -> None:
 
         raise NotImplementedError
 
-    def generate(self, prompt: str, *args: Any, **kwargs: Any) -> str:
+    def generate(self, prompt: str, *args: Any, **kwargs: Any) -> GenerationResult:
         """
         Generates a text based on a given prompt.
 
@@ -178,7 +178,7 @@ def generate(self, prompt: str, *args: Any, **kwargs: Any) -> str:
             **kwargs (Any): Additional keyword arguments to be passed to this method.
 
         Returns:
-            srt: The string containing the generated text.
+            GenerationResult: The generation result object.
 
         Raises:
             NotImplementedError: If the method is not implemented by the subclass.
@@ -310,3 +310,9 @@ def get_tracker_client(self) -> Optional[Any]:
             return self._metacat_trainer.tracker_client
         else:
             return None
+
+    def shutdown(self) -> None:
+        """
+        Shuts down the model service and releases any resources held by it.
+        """
+        pass
diff --git a/app/model_services/huggingface_llm_model.py b/app/model_services/huggingface_llm_model.py
index a74fea4e..d9027c1c 100644
--- a/app/model_services/huggingface_llm_model.py
+++ b/app/model_services/huggingface_llm_model.py
@@ -1,10 +1,13 @@
+import asyncio
 import os
 import logging
 import time
+import hashlib
+import json
 import re
 import torch
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, List, Optional, Tuple, Any, AsyncIterable, TextIO, Callable, Union
+from typing import Dict, List, Optional, Tuple, Any, AsyncIterable, TextIO, Callable, Union, TYPE_CHECKING
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -17,22 +20,29 @@
     StoppingCriteriaList,
 )
 from app import __version__ as app_version
-from app.exception import ConfigurationException
+from app.exception import ConfigurationException, GenerationException, ExtraDependencyRequiredException
 from app.model_services.base import AbstractModelService
 from app.trainers.huggingface_llm_trainer import HuggingFaceLlmSupervisedTrainer, HuggingFaceLlmUnsupervisedTrainer
-from app.domain import ModelCard, ModelType, Annotation, Device
+from app.domain import ModelCard, ModelType, Annotation, Device, GenerationResult
 from app.config import Settings
 from app.processors.data_batcher import MicroBatchScheduler
+from app.processors.prefix_cache import PrefixCache
 from app.utils import (
     get_settings,
     non_default_device_is_available,
     unpack_model_data_package,
     ensure_tensor_contiguity,
     get_model_data_package_base_name,
-    get_default_chat_template,
-    utilise_local_chat_template,
     ensure_pad_token,
+    dump_pydantic_object_to_dict,
+    extract_json_string,
+    has_turing_generation_gpu,
+    resolve_safe_max_model_length,
 )
+if TYPE_CHECKING:
+    from lmformatenforcer import JsonSchemaParser as JsonSchemaParserType
+else:
+    JsonSchemaParserType = Any
 
 logger = logging.getLogger("cms")
 
@@ -58,6 +68,13 @@ def __init__(
             model_name (Optional[str]): The name of the model. Defaults to None.
             base_model_file (Optional[str]): The model package file name. Defaults to None.
         """
+        try:
+            from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn
+        except ImportError:
+            logger.error("Cannot import JsonSchemaParser. Please install it with `pip install '.[llm]'`.")
+            raise ExtraDependencyRequiredException(
+                "Cannot import JsonSchemaParser. Please install it with `pip install '.[llm]'`."
+            )
 
         super().__init__(config)
         self._config = config
@@ -68,10 +85,14 @@ def __init__(
         self._enable_trainer = enable_trainer if enable_trainer is not None else config.ENABLE_TRAINING_APIS == "true"
         self._model: PreTrainedModel = None
         self._tokenizer: PreTrainedTokenizerBase = None
+        self._assistant_model: Optional[PreTrainedModel] = None
+        self._assistant_tokenizer: Optional[PreTrainedTokenizerBase] = None
         self._whitelisted_tuis = set([tui.strip() for tui in config.TYPE_UNIQUE_ID_WHITELIST.split(",")])
         self._text_generator = ThreadPoolExecutor(max_workers=10)
         self._sentence_endings = ".。!！?？:：;；\n"
-        self._generation_timeout_secs = 180
+        self._generation_timeout_secs = config.GENERATION_TIMEOUT_SECONDS or 180
+        self._prefix_kv_cache = PrefixCache()
+        self._build_transformers_prefix_allowed_tokens_fn = build_transformers_prefix_allowed_tokens_fn
         self._micro_batch_scheduler = MicroBatchScheduler(
             process_batch_fn=self._process_batched_requests,
             batch_key_fn=lambda request: request["batch_key"],
@@ -86,6 +107,7 @@ def __init__(
         )
         self.model_name = model_name or "HuggingFace LLM model"
         self.is_quantised = False
+        self.digest = "sha256:" + hashlib.sha256(f"{model_name} {base_model_file}".encode()).hexdigest()
 
     @property
     def model(self) -> PreTrainedModel:
@@ -199,7 +221,7 @@ def load_model(
                         bnb_config = BitsAndBytesConfig(
                             load_in_4bit=True,
                             bnb_4bit_quant_type="nf4",
-                            bnb_4bit_compute_dtype=torch.bfloat16,
+                            bnb_4bit_compute_dtype=torch.float16 if has_turing_generation_gpu() else torch.bfloat16,
                             bnb_4bit_use_double_quant=True,
                         )
                         if get_settings().DEVICE == Device.DEFAULT.value:
@@ -220,6 +242,7 @@ def load_model(
                     elif load_in_8bit:
                         bnb_config = BitsAndBytesConfig(
                             load_in_8bit=True,
+                            bnb_4bit_compute_dtype=torch.float16 if has_turing_generation_gpu() else torch.bfloat16,
                             llm_int8_threshold=6.0,
                             llm_int8_enable_fp32_cpu_offload=False
                         )
@@ -245,16 +268,20 @@ def load_model(
                                 model_path=model_path,
                                 device_map="auto",
                                 low_cpu_mem_usage=True,
+                                dtype=torch.float16 if has_turing_generation_gpu() else torch.bfloat16,
                             )
                         else:
                             model = HuggingFaceLlmModel._load_causal_lm(
                                 enable_sdpa_attn=enable_sdpa_attn,
                                 model_path=model_path,
                                 low_cpu_mem_usage=True,
+                                dtype=torch.float16 if has_turing_generation_gpu() else torch.bfloat16,
                             )
                 ensure_tensor_contiguity(model)
                 tokenizer = AutoTokenizer.from_pretrained(
-                    model_path, model_max_length=model.config.max_position_embeddings, do_lower_case=False
+                    model_path,
+                    model_max_length=resolve_safe_max_model_length(model.config),
+                    do_lower_case=False,
                 )
                 ensure_pad_token(model, tokenizer)
                 logger.info("Model package loaded from %s", os.path.normpath(model_file_path))
@@ -291,6 +318,18 @@ def init_model(self,
             self._model, self._tokenizer = self.load_model(
                 self._model_pack_path, load_in_4bit=load_in_4bit, load_in_8bit=load_in_8bit
             )
+            if self._config.OVERRIDE_CHAT_TEMPLATE:
+                self._tokenizer.chat_template = self._config.OVERRIDE_CHAT_TEMPLATE
+
+            if self._config.ASSISTANT_MODEL_FULL_PATH:
+                assistant_model_path = (
+                    self._config.ASSISTANT_MODEL_FULL_PATH
+                    if os.path.isabs(self._config.ASSISTANT_MODEL_FULL_PATH)
+                    else os.path.join(self._model_parent_dir, self._config.ASSISTANT_MODEL_FULL_PATH)
+                )
+                self._assistant_model, self._assistant_tokenizer = self.load_model(
+                    assistant_model_path, load_in_4bit=load_in_4bit, load_in_8bit=load_in_8bit
+                )
 
             if (non_default_device_is_available(get_settings().DEVICE) and
                 not (
@@ -299,6 +338,19 @@ def init_model(self,
                 )
             ):
                 self._model.to(get_settings().DEVICE)
+            if self._assistant_model is not None:
+                if self._assistant_tokenizer is not None and self._assistant_tokenizer is not self._tokenizer:
+                    if getattr(self._assistant_tokenizer, "vocab_size", None) != getattr(self._tokenizer, "vocab_size", None):
+                        logger.warning("Assistant model tokenizer vocab_size differs from the main tokenizer")
+                if hasattr(self._assistant_model, "generation_config"):
+                    self._assistant_model.generation_config.assistant_confidence_threshold = 0.4
+                if (non_default_device_is_available(get_settings().DEVICE) and
+                    not (
+                        getattr(self._assistant_model, "is_loaded_in_8bit", False) or
+                        getattr(self._assistant_model, "is_loaded_in_4bit", False)
+                    )
+                ):
+                    self._assistant_model.to(get_settings().DEVICE)
             if self._enable_trainer:
                 self._supervised_trainer = HuggingFaceLlmSupervisedTrainer(self)
                 self._unsupervised_trainer = HuggingFaceLlmUnsupervisedTrainer(self)
@@ -323,7 +375,7 @@ def annotate(self, text: str) -> List[Annotation]:
     def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
         raise NotImplementedError("Batch annotation is not yet implemented for HuggingFace Generative models")
 
-    def close(self) -> None:
+    def shutdown(self) -> None:
         """Stops background workers owned by this model service."""
         try:
             self._micro_batch_scheduler.stop()
@@ -343,9 +395,13 @@ def generate(
         temperature: float = 0.7,
         top_p: float = 0.9,
         stop_sequences: Optional[List[str]] = None,
-        report_tokens: Optional[Callable[[str], None]] = None,
+        report_tokens: Optional[Callable[..., None]] = None,
         ensure_full_sentences: bool = False,
-    ) -> str:
+        json_schema_parser: Optional[JsonSchemaParserType] = None,
+        prefix_prompt: Optional[str] = None,
+        *args: Tuple,
+        **kwargs: Dict[str, Any],
+    ) -> GenerationResult:
         """
         Generates text based on the prompt.
 
@@ -359,22 +415,40 @@ def generate(
             stop_sequences (Optional[List[str]]): List of strings that will stop generation when encountered. Defaults to None.
             report_tokens (Optional[Callable[[str], None]]): The callback function to send metrics. Defaults to None.
             ensure_full_sentences (bool): Whether to generate full sentences only. Defaults to False.
+            json_schema_parser (Optional[JsonSchemaParser]): The JSON schema parser for validating the generated text. Defaults to None.
+            prefix_prompt (Optional[str]): The prefix prompt to be used for generation. Defaults to None.
 
         Returns:
-            Any: The string containing the generated text.
+            GenerationResult: The generation result object.
         """
+        logger.debug("Prompt after chat template applied: %s", prompt[:200])
         max_tokens = max(min_tokens, max_tokens)
         request = {
             "prompt": prompt,
             "stop_sequences": stop_sequences,
             "ensure_full_sentences": ensure_full_sentences,
             "report_tokens": report_tokens,
-            "batch_key": (min_tokens, max_tokens, num_beams, temperature, top_p),
+            "json_schema_parser": json_schema_parser,
+            "prefix_prompt": prefix_prompt,
+            "batch_key": (
+                min_tokens,
+                max_tokens,
+                num_beams,
+                temperature,
+                top_p,
+                self._get_schema_hash(json_schema_parser),
+                (PrefixCache.key(prefix_prompt) if prefix_prompt else None),
+            ),
         }
         future = self._micro_batch_scheduler.submit(request)
-        generated_text = future.result()
+        try:
+            generation_result = future.result()
+        except Exception as e:
+            logger.error("Failed to generate text from the request")
+            logger.exception(e)
+            raise GenerationException(f"Failed to generate text from the request: {str(e)}") from e
         logger.debug("Response generation completed")
-        return generated_text
+        return generation_result
 
     async def generate_async(
         self,
@@ -385,9 +459,13 @@ async def generate_async(
         temperature: float = 0.7,
         top_p: float = 0.9,
         stop_sequences: Optional[List[str]] = None,
-        report_tokens: Optional[Callable[[str], None]] = None,
+        report_tokens: Optional[Callable[..., None]] = None,
         ensure_full_sentences: bool = False,
-    ) -> AsyncIterable:
+        json_schema_parser: Optional[JsonSchemaParserType] = None,
+        prefix_prompt: Optional[str] = None,
+        *args: Tuple,
+        **kwargs: Dict[str, Any],
+    ) -> AsyncIterable[Union[str, GenerationResult]]:
         """
         Asynchronously generates text stream based on the prompt.
 
@@ -401,15 +479,62 @@ async def generate_async(
             stop_sequences (Optional[List[str]]): List of strings that will stop generation when encountered. Defaults to None.
             report_tokens (Optional[Callable[[str], None]]): The callback function to send metrics. Defaults to None.
             ensure_full_sentences (bool): Whether to generate full sentences only. Defaults to False.
+            json_schema_parser (Optional[JsonSchemaParser]): The JSON schema parser used to constrain the generated tokens to the schema. Defaults to None.
+            prefix_prompt (Optional[str]): The prefix prompt to be used for generation. Defaults to None.
 
         Returns:
-            AsyncIterable: The stream containing the generated text.
+            AsyncIterable[Union[str, GenerationResult]]: The stream containing the generated text and the generation result object.
         """
 
         self.model.eval()
-        prompt_text = self._build_prompt_text(prompt)
-        inputs = self.tokenizer(prompt_text, add_special_tokens=False, return_tensors="pt")
-        inputs.to(self.model.device)
+        logger.debug("Prompt after chat template applied: %s", prompt[:200])
+        full_prompt_len = None
+        past_key_values = None
+        prefix_len = 0
+        prefix_text = prefix_prompt or ""
+        use_prefix_cache = bool(prefix_prompt) and prompt.startswith(prefix_text)
+        if use_prefix_cache:
+            prefix_entry = self._prefix_kv_cache.get_prefix_entry(
+                prefix_text,
+                self.model,
+                self.tokenizer,
+            )
+            suffix_text = prompt[len(prefix_text):]
+            if prefix_entry is None or not suffix_text:
+                use_prefix_cache = False
+            else:
+                inputs = self.tokenizer(suffix_text, add_special_tokens=False, return_tensors="pt")
+                inputs.to(self.model.device)
+                if inputs.input_ids.shape[1] == 0:
+                    use_prefix_cache = False
+                elif inputs.attention_mask.sum().item() == 0:
+                    use_prefix_cache = False
+                else:
+                    prefix_len = int(prefix_entry.input_ids.shape[1])
+                    full_prompt_len = prefix_len + int(inputs.attention_mask.sum().item())
+                    prefix_mask = torch.ones(
+                        (inputs.input_ids.shape[0], prefix_len),
+                        dtype=inputs.attention_mask.dtype,
+                        device=inputs.input_ids.device,
+                    )
+                    attention_mask = torch.cat([prefix_mask, inputs.attention_mask], dim=1)
+                    past_key_values = PrefixCache.expand_past_key_values(
+                        prefix_entry.past_key_values,
+                        inputs.input_ids.shape[0],
+                    )
+        prefix_len = prefix_len if use_prefix_cache else 0
+        if not use_prefix_cache:
+            inputs = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
+            inputs.to(self.model.device)
+            attention_mask = inputs.attention_mask
+            full_prompt_len = int(inputs.attention_mask.sum().item())
+        inputs, attention_mask, _prompt_lens, full_prompt_len, past_key_values = self._ensure_non_empty_inputs(
+            prompt=prompt,
+            inputs=inputs,
+            attention_mask=attention_mask,
+            full_prompt_lens=full_prompt_len,
+            past_key_values=past_key_values,
+        )
 
         streamer = AsyncTextIteratorStreamer(
             self.tokenizer,
@@ -419,44 +544,76 @@ async def generate_async(
             clean_up_tokenization_spaces=True,
         )
         max_tokens = max(min_tokens, max_tokens)
+        use_constrained_tokens = json_schema_parser is not None
         generation_kwargs = dict(
             inputs=inputs.input_ids,
-            attention_mask=inputs.attention_mask,
+            attention_mask=attention_mask,
             streamer=streamer,
             min_new_tokens=min_tokens,
             max_new_tokens=max_tokens,
             use_cache=True,
             num_beams=num_beams,
-            do_sample=(num_beams == 1),
+            do_sample=(num_beams == 1 and not use_constrained_tokens),
             temperature=temperature,
             top_p=top_p,
-            repetition_penalty=1.2,
-            no_repeat_ngram_size=3,
+            top_k=0,
+            repetition_penalty=(1.0 if use_constrained_tokens else 1.2),
+            no_repeat_ngram_size=(0 if use_constrained_tokens else 3),
             pad_token_id=self.tokenizer.pad_token_id,
             stopping_criteria=StoppingCriteriaList([TimeoutCriteria(float(self._generation_timeout_secs))]),
         )
+        if past_key_values is not None:
+            generation_kwargs["past_key_values"] = past_key_values
+            cache_position = torch.arange(
+                prefix_len,
+                prefix_len + inputs.input_ids.shape[1],
+                device=inputs.input_ids.device,
+            )
+            generation_kwargs["cache_position"] = cache_position
+        if use_constrained_tokens:
+            generation_kwargs["prefix_allowed_tokens_fn"] = self._build_transformers_prefix_allowed_tokens_fn(
+                self.tokenizer,
+                json_schema_parser,
+            )
+        if self._assistant_model is not None:
+            generation_kwargs["assistant_model"] = self._assistant_model
+            generation_kwargs["assistant_confidence_threshold"] = 0.4
+            generation_kwargs["num_assistant_tokens"] = 5
+            if self._assistant_tokenizer is not None:
+                if self._assistant_tokenizer.vocab_size != self._tokenizer.vocab_size:  # type: ignore
+                    generation_kwargs["assistant_tokenizer"] = self._assistant_tokenizer
 
         try:
+            generation_start = time.monotonic()
+            ttft_milliseconds = -1.0
             _ = self._text_generator.submit(self.model.generate, **generation_kwargs)
             buffer = ""
             full_output = ""
 
+            output_is_formatted = use_constrained_tokens
             if not ensure_full_sentences:
                 async for content in streamer:
+                    if ttft_milliseconds == -1.0 and content:
+                        ttft_milliseconds = (time.monotonic() - generation_start) * 1000.0
                     prev_output = full_output
                     full_output += content
                     if stop_sequences:
                         for stop_seq in stop_sequences:
                             if stop_seq in full_output:
                                 remaining = full_output[len(prev_output):full_output.find(stop_seq)]
-                                if remaining:
+                                if remaining and not output_is_formatted:
                                     for out_chunk in self._split_stream_chunk(remaining):
+                                        await asyncio.sleep(0.1)
                                         yield out_chunk
                                 return
-                    for out_chunk in self._split_stream_chunk(content):
-                        yield out_chunk
+                    if not output_is_formatted:
+                        for out_chunk in self._split_stream_chunk(content):
+                            await asyncio.sleep(0.1)
+                            yield out_chunk
             else:
                 async for content in streamer:
+                    if ttft_milliseconds == -1.0 and content:
+                        ttft_milliseconds = (time.monotonic() - generation_start) * 1000.0
                     buffer += content
 
                     if stop_sequences:
@@ -464,7 +621,8 @@ async def generate_async(
                         for stop_sequence in stop_sequences:
                             if stop_sequence in buffer:
                                 remaining = buffer[:buffer.find(stop_sequence)]
-                                if remaining:
+                                if remaining and not output_is_formatted:
+                                    await asyncio.sleep(0.1)
                                     yield remaining
                                     full_output += remaining
                                 stop_triggered = True
@@ -482,22 +640,45 @@ async def generate_async(
                     if last_sentence_ending != -1:
                         new_sentences = buffer[:last_sentence_ending + 1]
                         buffer = buffer[last_sentence_ending + 1:]
-                        yield new_sentences
+                        if not output_is_formatted:
+                            await asyncio.sleep(0.1)
+                            yield new_sentences
                         full_output += new_sentences
 
+            if output_is_formatted:
+                yield extract_json_string(full_output)
+
+            logger.debug("Decoded raw output: %s",full_output[:200])
+            prompt_token_num = (
+                full_prompt_len
+                if isinstance(full_prompt_len, int)
+                else inputs.input_ids.shape[-1]
+            )
+            completion_token_num = self.tokenizer(
+                full_output,
+                add_special_tokens=False,
+                return_tensors="pt"
+            ).input_ids.shape[-1]
+            total_generation_ms = (time.monotonic() - generation_start) * 1000.0
+            tpot_milliseconds = total_generation_ms / float(completion_token_num) if completion_token_num > 0 else -1
             if report_tokens:
                 report_tokens(
-                    prompt_token_num=inputs.input_ids.shape[-1],    # type: ignore
-                    completion_token_num=self.tokenizer(    # type: ignore
-                        full_output,
-                        add_special_tokens=False,
-                        return_tensors="pt"
-                    ).input_ids.shape[-1],
+                    prompt_token_num=prompt_token_num,  # type: ignore
+                    completion_token_num=completion_token_num,  # type: ignore
+                    ttft_milliseconds=int(ttft_milliseconds),  # type: ignore
+                    tpot_milliseconds=int(tpot_milliseconds),  # type: ignore
                 )
+            yield GenerationResult(
+                text=full_output,
+                prompt_token_num=prompt_token_num,
+                completion_token_num=completion_token_num,
+                ttft_ms=int(ttft_milliseconds),
+                tpot_ms=int(tpot_milliseconds),
+            )
         except Exception as e:
             logger.error("An error occurred while generating the response")
             logger.exception(e)
-            return
+            raise GenerationException(f"Failed to generate text from the request: {str(e)}") from e
         finally:
             logger.debug("Chat response generation completed")
 
@@ -531,11 +712,10 @@ def create_embeddings(
             inputs = self.tokenizer(txt, add_special_tokens=False, truncation=False, padding=False)
             input_ids = inputs["input_ids"]
             attention_mask = inputs["attention_mask"]
-            window_size = max(self.model.config.max_position_embeddings - 2, 1)
-            stride = window_size
+            window_size = max(resolve_safe_max_model_length(self.model.config) - 2, 1)
             chunk_embeddings = []
 
-            for start in range(0, len(input_ids), stride):
+            for start in range(0, len(input_ids), window_size):
                 end = min(start + window_size, len(input_ids))
                 chunk_inputs = {
                     "input_ids": torch.tensor(
@@ -667,10 +847,14 @@ def _load_causal_lm(
         if enable_sdpa_attn:
             try:
                 fa2_kwargs = dict(kwargs)
-                fa2_kwargs.setdefault("dtype", torch.bfloat16)
+                fa2_kwargs.setdefault("dtype", torch.float16 if has_turing_generation_gpu() else torch.bfloat16)
+                torch.backends.cuda.enable_flash_sdp(True)
+                torch.backends.cuda.enable_mem_efficient_sdp(False)
+                torch.backends.cuda.enable_math_sdp(False)
                 return AutoModelForCausalLM.from_pretrained(
                     model_path,
                     attn_implementation="sdpa",
+                    # dtype is already in fa2_kwargs; passing it again raised a duplicate-keyword TypeError.
                     **fa2_kwargs,
                 )
             except Exception as e:
@@ -679,22 +863,13 @@ def _load_causal_lm(
                 )
         return AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
 
-    def _build_prompt_text(self, prompt: str) -> str:
-        if hasattr(self.tokenizer, "chat_template") and self.tokenizer.chat_template is None:
-            logger.warning("The tokenizer does not have a chat template. Using the default one.")
-            self.tokenizer.chat_template = get_default_chat_template()
-        else:
-            if utilise_local_chat_template(self.model.config.model_type, self.tokenizer):
-                logger.debug(
-                    "Chat template overwritten by the prompt factory for %s", self.model.config.model_type
-                )
-            else:
-                logger.debug(f"Found a chat template in the tokenizer:\n {self.tokenizer.chat_template}")
-        return self.tokenizer.apply_chat_template(
-            [{"role": "user", "content": prompt}],
-            tokenize=False,
-            add_generation_prompt=True,
-        )
+    @staticmethod
+    def _get_schema_hash(json_schema_parser: Optional[JsonSchemaParserType]) -> Optional[str]:
+        """Return the SHA-256 hex digest of the JSON dump of the parser's model-class dict, or None if no parser."""
+        if json_schema_parser is None:
+            return None
+        schema_dict = dump_pydantic_object_to_dict(json_schema_parser.context.model_class)
+        return hashlib.sha256(json.dumps(schema_dict).encode("utf-8")).hexdigest()
 
     def _postprocess_generated_text(
             self,
@@ -757,48 +932,175 @@ def _split_stream_chunk(self, text: str, max_words_per_chunk: int = 4) -> List[s
     def _process_batched_requests(self, requests: List[Dict[str, Any]]) -> None:
         try:
             self.model.eval()
-            prompt_texts = [self._build_prompt_text(req["prompt"]) for req in requests]
-            inputs = self.tokenizer(prompt_texts, add_special_tokens=False, return_tensors="pt", padding=True)
-            inputs.to(self.model.device)
-
-            prompt_lens = [int(x) for x in inputs.attention_mask.sum(dim=1).tolist()]
-            min_tokens, max_tokens, num_beams, temperature, top_p = requests[0]["batch_key"]
+            prompt_texts = [req["prompt"] for req in requests]
+            prefix_prompt = requests[0].get("prefix_prompt")
+            prefix_text = prefix_prompt or ""
+            use_prefix_cache = bool(prefix_prompt) and all(
+                prompt.startswith(prefix_text) for prompt in prompt_texts
+            )
+            prefix_entry = None
+            suffix_inputs = None
+            prefix_len = 0
+
+            if use_prefix_cache:
+                prefix_entry = self._prefix_kv_cache.get_prefix_entry(
+                    prefix_text,
+                    self.model,
+                    self.tokenizer,
+                )
+                if prefix_entry is None:
+                    use_prefix_cache = False
+                else:
+                    assert prefix_entry is not None
+                    suffix_texts = [prompt[len(prefix_text):] for prompt in prompt_texts]
+                    if any(not suffix for suffix in suffix_texts):
+                        use_prefix_cache = False
+                    else:
+                        suffix_inputs = self.tokenizer(
+                            suffix_texts,
+                            add_special_tokens=False,
+                            return_tensors="pt",
+                            padding=True,
+                        )
+                        suffix_inputs.to(self.model.device)
+                        if any([
+                            suffix_inputs.input_ids.shape[1] == 0,
+                            (suffix_inputs.attention_mask.sum(dim=1) == 0).any(),
+                        ]):
+                            use_prefix_cache = False
+                        prefix_len = int(prefix_entry.input_ids.shape[1])
+
+            if not use_prefix_cache:
+                inputs = self.tokenizer(prompt_texts, add_special_tokens=False, return_tensors="pt", padding=True)
+                inputs.to(self.model.device)
+                prompt_lens = [int(x) for x in inputs.attention_mask.sum(dim=1).tolist()]
+                full_prompt_lens = prompt_lens
+                batch_input_ids = inputs.input_ids
+                attention_mask = inputs.attention_mask
+                past_key_values = None
+            else:
+                assert suffix_inputs is not None
+                suffix_lens = [int(x) for x in suffix_inputs.attention_mask.sum(dim=1).tolist()]
+                prompt_lens = suffix_lens
+                full_prompt_lens = [prefix_len + length for length in suffix_lens]
+                batch_input_ids = suffix_inputs.input_ids
+                prefix_mask = torch.ones(
+                    (batch_input_ids.shape[0], prefix_len),
+                    dtype=suffix_inputs.attention_mask.dtype,
+                    device=batch_input_ids.device,
+                )
+                attention_mask = torch.cat([prefix_mask, suffix_inputs.attention_mask], dim=1)
+                assert prefix_entry is not None
+                past_key_values = PrefixCache.expand_past_key_values(
+                    prefix_entry.past_key_values,
+                    batch_input_ids.shape[0],
+                )
+            prefix_len = prefix_len if use_prefix_cache else 0
+            (
+                batch_input_ids,
+                attention_mask,
+                new_prompt_lens,
+                new_full_prompt_lens,
+                past_key_values,
+            ) = self._ensure_non_empty_inputs(
+                prompt=prompt_texts,
+                inputs=batch_input_ids,
+                attention_mask=attention_mask,
+                prompt_lens=prompt_lens,
+                full_prompt_lens=full_prompt_lens,
+                past_key_values=past_key_values,
+            )
+            if new_prompt_lens is not None:
+                prompt_lens = new_prompt_lens
+            if isinstance(new_full_prompt_lens, list):
+                full_prompt_lens = new_full_prompt_lens
+            assert prompt_lens is not None
+            assert full_prompt_lens is not None
+            min_tokens, max_tokens, num_beams, temperature, top_p, _, _ = requests[0]["batch_key"]
+            json_schema_parser = requests[0].get("json_schema_parser")
+            use_constrained_tokens = json_schema_parser is not None
             generation_kwargs = dict(
-                inputs=inputs.input_ids,
-                attention_mask=inputs.attention_mask,
+                inputs=batch_input_ids,
+                attention_mask=attention_mask,
                 min_new_tokens=min_tokens,
                 max_new_tokens=max_tokens,
                 use_cache=True,
                 num_beams=num_beams,
-                do_sample=(num_beams == 1),
+                do_sample=(num_beams == 1 and not use_constrained_tokens),
                 temperature=temperature,
                 top_p=top_p,
-                repetition_penalty=1.2,
-                no_repeat_ngram_size=3,
+                top_k=0,
+                repetition_penalty=(1.0 if use_constrained_tokens else 1.2),
+                no_repeat_ngram_size=(0 if use_constrained_tokens else 3),
                 pad_token_id=self.tokenizer.pad_token_id,
                 stopping_criteria=StoppingCriteriaList([TimeoutCriteria(float(self._generation_timeout_secs))]),
             )
-
+            if past_key_values is not None:
+                generation_kwargs["past_key_values"] = past_key_values
+                cache_position = torch.arange(
+                    prefix_len,
+                    prefix_len + batch_input_ids.shape[1],
+                    device=batch_input_ids.device,
+                )
+                generation_kwargs["cache_position"] = cache_position
+            if use_constrained_tokens:
+                generation_kwargs["prefix_allowed_tokens_fn"] = self._build_transformers_prefix_allowed_tokens_fn(
+                    self.tokenizer,
+                    json_schema_parser,
+                )
+            if self._assistant_model is not None:
+                generation_kwargs["assistant_model"] = self._assistant_model
+                generation_kwargs["assistant_confidence_threshold"] = 0.4
+                generation_kwargs["num_assistant_tokens"] = 5
+                if self._assistant_tokenizer.vocab_size != self._tokenizer.vocab_size:  # type: ignore
+                    generation_kwargs["assistant_tokenizer"] = self._assistant_tokenizer
+
+            generation_start = time.monotonic()
             outputs = self.model.generate(**generation_kwargs)
+            total_generation_ms = (time.monotonic() - generation_start) * 1000.0
             for idx, req in enumerate(requests):
                 completion_ids = outputs[idx][prompt_lens[idx]:]
                 generated_text = self.tokenizer.decode(completion_ids, skip_special_tokens=True)
-                generated_text = self._postprocess_generated_text(
+                logger.debug("Decoded raw output (batched): %s",generated_text[:200])
+                if use_constrained_tokens:
+                    generated_text = extract_json_string(generated_text)
+                else:
+                    generated_text = self._postprocess_generated_text(
+                        generated_text,
+                        req["stop_sequences"],
+                        req["ensure_full_sentences"],
+                    )
+                prompt_token_num = full_prompt_lens[idx]
+                completion_token_num = self.tokenizer(
                     generated_text,
-                    req["stop_sequences"],
-                    req["ensure_full_sentences"],
+                    add_special_tokens=False,
+                    return_tensors="pt",
+                ).input_ids.shape[-1]
+                tpot_milliseconds = (
+                    total_generation_ms / completion_token_num
+                    if completion_token_num > 0
+                    else -1
+                )
+                ttft_milliseconds = (
+                    total_generation_ms - tpot_milliseconds * (completion_token_num - 1)
+                    if completion_token_num > 0
+                    else -1
                 )
                 if req["report_tokens"]:
                     req["report_tokens"](
-                        prompt_token_num=prompt_lens[idx],
-                        completion_token_num=self.tokenizer(
-                            generated_text,
-                            add_special_tokens=False,
-                            return_tensors="pt",
-                        ).input_ids.shape[-1],
+                        prompt_token_num=prompt_token_num,  # type: ignore
+                        completion_token_num=completion_token_num,  # type: ignore
+                        ttft_milliseconds=int(ttft_milliseconds),  # type: ignore
+                        tpot_milliseconds=int(tpot_milliseconds),  # type: ignore
                     )
                 if not req["future"].done():
-                    req["future"].set_result(generated_text)
+                    req["future"].set_result(GenerationResult(
+                        text=generated_text,
+                        prompt_token_num=prompt_token_num,
+                        completion_token_num=completion_token_num,
+                        ttft_ms=-1,
+                        tpot_ms=int(tpot_milliseconds),
+                    ))
         except Exception as e:
             logger.error("Batched generation failed")
             logger.exception(e)
@@ -806,11 +1108,56 @@ def _process_batched_requests(self, requests: List[Dict[str, Any]]) -> None:
                 if not req["future"].done():
                     req["future"].set_exception(e)
 
+    def _ensure_non_empty_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        inputs: Any,
+        attention_mask: torch.Tensor,
+        past_key_values: Any,
+        prompt_lens: Optional[List[int]] = None,
+        full_prompt_lens: Optional[Union[int, List[int]]] = None,
+    ) -> Tuple[Any, torch.Tensor, Optional[List[int]], Optional[Union[int, List[int]]], Any]:
+        """Return generation inputs, re-tokenising `prompt` when the supplied ones attend to nothing.
+
+        The arguments come back unchanged when the attention mask has a non-zero sum (in every row
+        for a list prompt) or when that check raises. Otherwise the prompt is tokenised again and
+        None is returned in place of `past_key_values`.
+        """
+        batched = isinstance(prompt, list)
+        try:
+            width = int(inputs.shape[1] if batched else inputs.input_ids.shape[1])
+        except Exception:
+            width = 1
+        # A zero-width input skips the mask check and is always rebuilt.
+        if width > 0:
+            try:
+                if batched:
+                    row_is_empty = attention_mask.sum(dim=1) == 0
+                    usable = not (hasattr(row_is_empty, "any") and bool(row_is_empty.any()))
+                else:
+                    usable = attention_mask.sum().item() > 0
+            except Exception:
+                usable = True
+            if usable:
+                kept_lens = prompt_lens if batched else None
+                return inputs, attention_mask, kept_lens, full_prompt_lens, past_key_values
+        # Rebuild from the raw prompt; the caller gets no key/value cache back.
+        if batched:
+            rebuilt = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt", padding=True)
+            rebuilt.to(self.model.device)
+            row_lens = [int(count) for count in rebuilt.attention_mask.sum(dim=1).tolist()]
+            return rebuilt.input_ids, rebuilt.attention_mask, row_lens, row_lens, None
+        rebuilt = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
+        rebuilt.to(self.model.device)
+        attended = int(rebuilt.attention_mask.sum().item())
+        return rebuilt, rebuilt.attention_mask, None, attended, None
+
 
 class TimeoutCriteria(StoppingCriteria):
     """Stop generation when the timeout is reached."""
 
     def __init__(self, timeout_in_secs: float) -> None:
+        self._timeout_in_secs = timeout_in_secs
         self._deadline = time.monotonic() + timeout_in_secs
 
     def __call__(
@@ -818,4 +1165,8 @@ def __call__(
         scores: torch.FloatTensor,
         **kwargs: Dict[str, Any]
     ) -> bool:
-        return time.monotonic() >= self._deadline
+        now = time.monotonic()
+        if now >= self._deadline:
+            logger.warning(f"Generation timed out after {str(self._timeout_in_secs)} seconds")
+            return True
+        return False
diff --git a/app/model_services/huggingface_ner_model.py b/app/model_services/huggingface_ner_model.py
index 9089ed87..dcd95539 100644
--- a/app/model_services/huggingface_ner_model.py
+++ b/app/model_services/huggingface_ner_model.py
@@ -2,7 +2,6 @@
 import logging
 import torch
 import pandas as pd
-
 from functools import partial
 from typing import Dict, List, Optional, Tuple, Any, TextIO, Union
 from transformers import (
@@ -27,8 +26,11 @@
     ensure_tensor_contiguity,
     get_model_data_package_base_name,
     load_pydantic_object_from_dict,
+    resolve_safe_max_model_length,
+    parse_label_into_id_and_name,
 )
 from app.processors.tagging import TagProcessor
+from app.processors.viterbi_decoder import ViterbiDecoder
 
 logger = logging.getLogger("cms")
 
@@ -43,7 +45,6 @@ def __init__(
         enable_trainer: Optional[bool] = None,
         model_name: Optional[str] = None,
         base_model_file: Optional[str] = None,
-        confidence_threshold: float = 0.7,
     ) -> None:
         """
         Initialises the HuggingFace NER model service with specified configurations.
@@ -54,7 +55,6 @@ def __init__(
             enable_trainer (Optional[bool]): The flag to enable or disable trainers. Defaults to None.
             model_name (Optional[str]): The name of the model. Defaults to None.
             base_model_file (Optional[str]): The model package file name. Defaults to None.
-            confidence_threshold (float): The threshold for the confidence score. Defaults to 0.7.
         """
 
         super().__init__(config)
@@ -65,9 +65,10 @@ def __init__(
         self._model: PreTrainedModel = None
         self._tokenizer: PreTrainedTokenizerBase = None
         self._ner_pipeline: Pipeline = None
+        self._viterbi_decoder: Optional[ViterbiDecoder] = None
         self._whitelisted_tuis = set([tui.strip() for tui in config.TYPE_UNIQUE_ID_WHITELIST.split(",")])
-        self._confidence_threshold = confidence_threshold
         self.model_name = model_name or "HuggingFace NER model"
+        torch.set_num_threads(1)
 
     @property
     def model(self) -> PreTrainedModel:
@@ -134,13 +135,14 @@ def from_model(cls, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase)
             task="ner",
             model=model_service.model,
             tokenizer=model_service.tokenizer,
-            stride=32,
-            aggregation_strategy=_config.HF_PIPELINE_AGGREGATION_STRATEGY,
+            stride=max(max(resolve_safe_max_model_length(model.config) - 2, 1) // 4, 1),
+            aggregation_strategy=_config.HF_NER_AGGREGATION_STRATEGY,
         )
         if non_default_device_is_available(_config.DEVICE):
             model_service._ner_pipeline = _pipeline(device=get_hf_pipeline_device_id(_config.DEVICE))
         else:
             model_service._ner_pipeline = _pipeline()
+        model_service._viterbi_decoder = ViterbiDecoder.from_id2label(model_service.model.config.id2label)
         return model_service
 
     @staticmethod
@@ -174,7 +176,7 @@ def load_model(model_file_path: str, *args: Tuple, **kwargs: Dict[str, Any]) ->
                 ensure_tensor_contiguity(model)
                 tokenizer = AutoTokenizer.from_pretrained(
                     model_path,
-                    model_max_length=model.config.max_position_embeddings,
+                    model_max_length=max(resolve_safe_max_model_length(model.config) - 2, 1),
                     do_lower_case=False,
                 )
                 logger.info("Model package loaded from %s", os.path.normpath(model_file_path))
@@ -207,13 +209,14 @@ def init_model(self, *args: Any, **kwargs: Any) -> None:
                 task="ner",
                 model=self._model,
                 tokenizer=self._tokenizer,
-                stride=32,
-                aggregation_strategy=self._config.HF_PIPELINE_AGGREGATION_STRATEGY,
+                stride=max(max(resolve_safe_max_model_length(self._model.config) - 2, 1) // 4, 1),
+                aggregation_strategy=self._config.HF_NER_AGGREGATION_STRATEGY,
             )
             if non_default_device_is_available(get_settings().DEVICE):
                 self._ner_pipeline = _pipeline(device=get_hf_pipeline_device_id(get_settings().DEVICE))
             else:
                 self._ner_pipeline = _pipeline()
+            self._viterbi_decoder = ViterbiDecoder.from_id2label(self._model.config.id2label)
             if self._enable_trainer:
                 self._supervised_trainer = HuggingFaceNerSupervisedTrainer(self)
                 self._unsupervised_trainer = HuggingFaceNerUnsupervisedTrainer(self)
@@ -242,9 +245,17 @@ def annotate(self, text: str) -> List[Annotation]:
         Returns:
             List[Annotation]: A list of annotations containing the extracted named entities.
         """
-
-        if TaggingScheme(self._config.TRAINING_HF_TAGGING_SCHEME.lower()) == TaggingScheme.IOBES:
-            entities = self._ner_pipeline(text, aggregation_strategy="none")
+        tagging_scheme = TaggingScheme(self._config.TRAINING_HF_NER_TAGGING_SCHEME.lower())
+        if tagging_scheme in (TaggingScheme.IOBES, TaggingScheme.IOB):
+            if self._config.HF_NER_APPLY_VITERBI_DECODING == "true":
+                entities = self._ner_pipeline(text, aggregation_strategy="none", ignore_labels=[])
+                if self._viterbi_decoder is not None:
+                    entities = self._viterbi_decoder.apply_viterbi_to_hf_pipeline_output(entities, self._model.config.id2label)
+                    logger.info("Viterbi decoding applied")
+                else:
+                    logger.warning("Viterbi decoding requested but no Viterbi decoder was detected.")
+            else:
+                entities = self._ner_pipeline(text, aggregation_strategy="none")
         else:
             entities = self._ner_pipeline(text)
         df = pd.DataFrame(entities)
@@ -252,12 +263,13 @@ def annotate(self, text: str) -> List[Annotation]:
         if df.empty:
             columns = ["label_name", "label_id", "start", "end", "accuracy"]
             df = pd.DataFrame(columns=(columns + ["text"]) if self._config.INCLUDE_SPAN_TEXT == "true" else columns)
-        elif TaggingScheme(self._config.TRAINING_HF_TAGGING_SCHEME.lower()) == TaggingScheme.IOBES:
+        elif tagging_scheme in (TaggingScheme.IOBES, TaggingScheme.IOB):
             aggregated_entities = TagProcessor.aggregate_bioes_predictions(
                 df,
                 text,
                 self._config.INCLUDE_SPAN_TEXT == "true",
             )
+            logger.debug("Aggregation applied for tagging scheme: %s", tagging_scheme.value)
             df = pd.DataFrame(aggregated_entities)
             if df.empty:
                 columns = ["label_name", "label_id", "start", "end", "accuracy"]
@@ -265,21 +277,94 @@ def annotate(self, text: str) -> List[Annotation]:
                     columns=(columns + ["text"]) if self._config.INCLUDE_SPAN_TEXT == "true" else columns
                 )
             else:
-                df = df[df["accuracy"] >= self._confidence_threshold]
+                df = df[df["accuracy"] >= self._config.CONFIDENCE_SCORE_THRESHOLD]
         else:
             for idx, row in df.iterrows():
-                df.loc[idx, "label_id"] = row["entity_group"]
+                df.loc[idx, "label_id"], df.loc[idx, "label_name"] = parse_label_into_id_and_name(row["entity_group"])
                 if self._config.INCLUDE_SPAN_TEXT == "true":
                     df.loc[idx, "text"] = text[row["start"]:row["end"]]
 
-            df.rename(columns={"entity_group": "label_name", "score": "accuracy"}, inplace=True)
-            df = df[df["accuracy"] >= self._confidence_threshold]
+            df.rename(columns={"score": "accuracy"}, inplace=True)
+            del df["entity_group"]
+            df = df[df["accuracy"] >= self._config.CONFIDENCE_SCORE_THRESHOLD]
 
+        if not df.empty:
+            df = df[(df["start"].notna()) & (df["end"].notna()) & (df["start"] < df["end"])]
         records = df.to_dict("records")
         return [load_pydantic_object_from_dict(Annotation, record) for record in records]
 
     def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
-        raise NotImplementedError("Batch annotation is not yet implemented for HuggingFace NER models")
+        """
+        Annotates texts in batches and returns a list of lists of annotations.
+
+        Args:
+            texts (List[str]): The list of texts to be annotated.
+
+        Returns:
+            List[List[Annotation]]: A list where each element is a list of annotations containing the extracted named entities.
+        """
+        if not texts:
+            return []
+
+        batch_size = max(int(self._config.HF_NER_BATCH_SIZE), 1)
+
+        tagging_scheme = TaggingScheme(self._config.TRAINING_HF_NER_TAGGING_SCHEME.lower())
+        if tagging_scheme in (TaggingScheme.IOBES, TaggingScheme.IOB):
+            if self._config.HF_NER_APPLY_VITERBI_DECODING == "true":
+                batch_entities = self._ner_pipeline(
+                    texts,
+                    aggregation_strategy="none",
+                    batch_size=batch_size,
+                    ignore_labels=[],
+                )
+                if self._viterbi_decoder is not None:
+                    batch_entities = [
+                        self._viterbi_decoder.apply_viterbi_to_hf_pipeline_output(entities, self._model.config.id2label)
+                        for entities in batch_entities
+                    ]
+                    logger.info("Viterbi decoding (batch) applied")
+                else:
+                    logger.warning("Viterbi decoding requested but no Viterbi decoder was detected.")
+            else:
+                batch_entities = self._ner_pipeline(texts, aggregation_strategy="none", batch_size=batch_size)
+        else:
+            batch_entities = self._ner_pipeline(texts, batch_size=batch_size)
+
+        annotations_list: List[List[Annotation]] = []
+        for text, entities in zip(texts, batch_entities):
+            df = pd.DataFrame(entities)
+            if df.empty:
+                columns = ["label_name", "label_id", "start", "end", "accuracy"]
+                df = pd.DataFrame(columns=(columns + ["text"]) if self._config.INCLUDE_SPAN_TEXT == "true" else columns)
+            elif tagging_scheme in (TaggingScheme.IOBES, TaggingScheme.IOB):
+                aggregated_entities = TagProcessor.aggregate_bioes_predictions(
+                    df,
+                    text,
+                    self._config.INCLUDE_SPAN_TEXT == "true",
+                )
+                df = pd.DataFrame(aggregated_entities)
+                if df.empty:
+                    columns = ["label_name", "label_id", "start", "end", "accuracy"]
+                    df = pd.DataFrame(
+                        columns=(columns + ["text"]) if self._config.INCLUDE_SPAN_TEXT == "true" else columns
+                    )
+                else:
+                    df = df[df["accuracy"] >= self._config.CONFIDENCE_SCORE_THRESHOLD]
+            else:
+                for idx, row in df.iterrows():
+                    df.loc[idx, "label_id"], df.loc[idx, "label_name"] = parse_label_into_id_and_name(row["entity_group"])
+                    if self._config.INCLUDE_SPAN_TEXT == "true":
+                        df.loc[idx, "text"] = text[row["start"]:row["end"]]
+                # Same label parsing and column clean-up as annotate(), so both paths return identical records.
+                df = df.drop(columns=["entity_group"]).rename(columns={"score": "accuracy"})
+                df = df[df["accuracy"] >= self._config.CONFIDENCE_SCORE_THRESHOLD]
+
+            if not df.empty:
+                df = df[(df["start"].notna()) & (df["end"].notna()) & (df["start"] < df["end"])]
+            records = df.to_dict("records")
+            annotations_list.append([load_pydantic_object_from_dict(Annotation, record) for record in records])
+
+        return annotations_list
 
     def create_embeddings(
         self,
diff --git a/app/model_services/medcat_model.py b/app/model_services/medcat_model.py
index 800e68dc..7c374849 100644
--- a/app/model_services/medcat_model.py
+++ b/app/model_services/medcat_model.py
@@ -178,7 +178,9 @@ def annotate(self, text: str) -> List[Annotation]:
 
         assert self.model is not None, "Model is not initialised"
         doc = self.model.get_entities(text)
-        return [load_pydantic_object_from_dict(Annotation, record) for record in self.get_records_from_doc(doc)]
+        records = self.get_records_from_doc(doc)
+        records = [r for r in records if r.get("accuracy", 0.0) >= self._config.CONFIDENCE_SCORE_THRESHOLD]
+        return [load_pydantic_object_from_dict(Annotation, record) for record in records]
 
     def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
         """
@@ -202,8 +204,10 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
         docs = dict(sorted(docs.items(), key=lambda x: x[0]))
         annotations_list = []
         for _, doc in docs.items():
+            records = self.get_records_from_doc(doc) # type: ignore
+            records = [r for r in records if r.get("accuracy", 0.0) >= self._config.CONFIDENCE_SCORE_THRESHOLD]
             annotations_list.append([
-                load_pydantic_object_from_dict(Annotation, record) for record in self.get_records_from_doc(doc) # type: ignore
+                load_pydantic_object_from_dict(Annotation, record) for record in records
             ])
         return annotations_list
 
diff --git a/app/model_services/medcat_model_deid.py b/app/model_services/medcat_model_deid.py
index c401d7ec..f19a4fbe 100644
--- a/app/model_services/medcat_model_deid.py
+++ b/app/model_services/medcat_model_deid.py
@@ -92,6 +92,7 @@ def annotate(self, text: str) -> List[Annotation]:
                 entity["type_ids"] = ["PII"]
 
         records = self.get_records_from_doc({"entities": doc["entities"]})  # type: ignore
+        records = [r for r in records if r.get("accuracy", 0.0) >= self._config.CONFIDENCE_SCORE_THRESHOLD]
         return [load_pydantic_object_from_dict(Annotation, record) for record in records]
 
     def annotate_with_local_chunking(self, text: str) -> List[Annotation]:
@@ -158,6 +159,7 @@ def annotate_with_local_chunking(self, text: str) -> List[Annotation]:
         assert processed_char_len == (len(text) + leading_ws_len), f"{len(text) + leading_ws_len - processed_char_len} characters were not processed:\n{text}"
 
         records = self.get_records_from_doc({"entities": aggregated_entities})
+        records = [r for r in records if r.get("accuracy", 0.0) >= self._config.CONFIDENCE_SCORE_THRESHOLD]
         return [load_pydantic_object_from_dict(Annotation, record) for record in records]
 
     def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
@@ -178,8 +180,10 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
             for _, entity in entities["entities"].items():
                 entity = cast(Dict[str, Any], entity)
                 entity["type_ids"] = ["PII"]
+            records = self.get_records_from_doc(entities)    # type: ignore
+            records = [r for r in records if r.get("accuracy", 0.0) >= self._config.CONFIDENCE_SCORE_THRESHOLD]
             annotations_list.append([
-                load_pydantic_object_from_dict(Annotation, record) for record in self.get_records_from_doc(entities)    # type: ignore
+                load_pydantic_object_from_dict(Annotation, record) for record in records
             ])
 
         return annotations_list
@@ -298,7 +302,7 @@ def init_model(self, *args: Any, **kwargs: Any) -> None:
                     task="ner",
                     tokenizer=ner.tokenizer.hf_tokenizer,
                     device=get_hf_pipeline_device_id(self._config.DEVICE),
-                    aggregation_strategy=self._config.HF_PIPELINE_AGGREGATION_STRATEGY,
+                    aggregation_strategy=self._config.HF_NER_AGGREGATION_STRATEGY,
                 )
             else:
                 if self._config.DEVICE != "default":
diff --git a/app/processors/lora_adaptor.py b/app/processors/lora_adaptor.py
new file mode 100644
index 00000000..963243d1
--- /dev/null
+++ b/app/processors/lora_adaptor.py
@@ -0,0 +1,101 @@
+import logging
+from typing import List, Tuple, Optional, Dict, Any
+from transformers import PreTrainedModel
+from peft import LoraConfig, get_peft_model # type: ignore
+from peft.utils.constants import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
+from app.exception import ManagedModelException
+
+logger = logging.getLogger("cms")
+
+
+class LoraAdaptor:
+    """Helper for wrapping Hugging Face models with PEFT LoRA adapters."""
+
+    @staticmethod
+    def apply(
+        model: PreTrainedModel,
+        task_type: str,
+        target_modules: Optional[List[str]] = None,
+        r: int = 8,
+        lora_alpha: int = 32,
+        lora_dropout: float = 0.1,
+    ) -> Tuple[PreTrainedModel, Any]:
+        """
+        Applies LoRA adaptation to the given model.
+
+        Args:
+            model (PreTrainedModel): The model to apply LoRA adaptation to.
+            task_type (str): The type of task to apply LoRA adaptation for.
+            target_modules (Optional[List[str]]): The names of the modules to apply LoRA adaptation to.
+                When omitted, they are looked up in the PEFT mapping and then inferred from module names.
+            r (int): The rank of the LoRA adaptation.
+            lora_alpha (int): The alpha parameter for the LoRA adaptation.
+            lora_dropout (float): The dropout rate for the LoRA adaptation.
+
+        Returns:
+            Tuple[PreTrainedModel, LoraConfig]: The adapted model and the LoRA configuration.
+
+        Raises:
+            ManagedModelException: If no LoRA target modules could be determined for the model.
+        """
+        resolved_target_modules = target_modules or LoraAdaptor._get_target_modules_from_mapping(
+            model,
+            TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
+        ) or LoraAdaptor._infer_target_modules(model)
+        if not resolved_target_modules:
+            raise ManagedModelException(
+                "Could not determine LoRA target modules from PEFT mapping or model module names."
+            )
+
+        lora_kwargs = {"task_type": task_type, "r": r, "lora_alpha": lora_alpha, "lora_dropout": lora_dropout}
+        lora_config = LoraConfig(target_modules=resolved_target_modules, **lora_kwargs)
+        try:
+            peft_model = get_peft_model(model, lora_config)
+        except Exception:
+            detected_target_modules = LoraAdaptor._infer_target_modules(model)
+            # Retrying is pointless when detection yields nothing new; surface the original error instead.
+            if not detected_target_modules or detected_target_modules == resolved_target_modules:
+                raise
+            logger.warning(
+                "Cannot get the PEFT model with target modules %s; retrying with detected modules %s",
+                resolved_target_modules, detected_target_modules, exc_info=True,
+            )
+            lora_config = LoraConfig(target_modules=detected_target_modules, **lora_kwargs)
+            peft_model = get_peft_model(model, lora_config)
+
+        return peft_model, lora_config
+
+    @staticmethod
+    def _get_target_modules_from_mapping(
+        model: PreTrainedModel,
+        target_modules_mapping: Dict[str, List[str]],
+    ) -> List[str]:
+        """Returns the mapping entry for the model's `config.model_type`, or an empty list if there is none."""
+        model_type = getattr(getattr(model, "config", None), "model_type", None)
+        return list(target_modules_mapping.get(model_type, [])) if model_type else []
+
+    @classmethod
+    def _infer_target_modules(cls, model: PreTrainedModel) -> List[str]:
+        """Picks known attention module names found among the model's leaf modules, or an empty list."""
+        target_module_candidates = [
+            ["query", "key", "value"],
+            ["q_proj", "k_proj", "v_proj"],
+            ["q_lin", "k_lin", "v_lin"],
+            ["c_attn"],
+        ]
+        leaf_module_names = {
+            module_name.split(".")[-1]
+            for module_name, module in model.named_modules()
+            if module_name and next(module.children(), None) is None
+        }
+        # First pass: prefer a naming convention whose modules are all present.
+        for candidate_group in target_module_candidates:
+            matched = [name for name in candidate_group if name in leaf_module_names]
+            if len(matched) == len(candidate_group):
+                return matched
+        # Second pass: fall back to the first convention with at least one module present.
+        for candidate_group in target_module_candidates:
+            matched = [name for name in candidate_group if name in leaf_module_names]
+            if matched:
+                return matched
+        return []
diff --git a/app/processors/metrics_collector.py b/app/processors/metrics_collector.py
index 84f74dac..108b9aa6 100644
--- a/app/processors/metrics_collector.py
+++ b/app/processors/metrics_collector.py
@@ -2,11 +2,12 @@
 import json
 import hashlib
 import pandas as pd
-from typing import Tuple, Dict, List, Set, Union, Optional, IO, Any
+from typing import Tuple, Dict, List, Set, Union, Optional, IO, Any, Iterator
 from collections import defaultdict
 from sklearn.metrics import cohen_kappa_score
 from tqdm.autonotebook import tqdm
 from app.model_services.base import AbstractModelService
+from app.domain import Annotation
 from app.exception import AnnotationException
 
 
@@ -73,30 +74,77 @@ def sanity_check_model_with_trainer_export(
         false_positives[project["id"]] = {}
         false_negatives[project["id"]] = {}
 
-        for document in tqdm(documents, desc="Evaluating documents", total=len(documents), leave=False):
-            true_positives[project["id"]][document["id"]] = {}
-            false_positives[project["id"]][document["id"]] = {}
-            false_negatives[project["id"]][document["id"]] = {}
-
-            annotations = model_service.annotate(document["text"])
-            predictions[document["id"]] = []
-            for annotation in annotations:
-                predictions[document["id"]].append([annotation.start, annotation.end, annotation.label_id])
-                concept_names[annotation.label_id] = annotation.label_name
-                concept_anchors[annotation.label_id] = concept_anchors.get(annotation.label_id, [])
-                concept_anchors[annotation.label_id].append(f"P{project['id']}/D{document['id']}/S{annotation.start}/E{ annotation.end}")
-
-            predicted = {tuple(x) for x in predictions[document["id"]]}
-            actual = {tuple(x) for x in correct_cuis[project["id"]][document["id"]]}
-            doc_tps = list(predicted.intersection(actual))
-            doc_fps = list(predicted.difference(actual))
-            doc_fns = list(actual.difference(predicted))
-            true_positives[project["id"]][document["id"]] = doc_tps
-            false_positives[project["id"]][document["id"]] = doc_fps
-            false_negatives[project["id"]][document["id"]] = doc_fns
-            true_positive_count += len(doc_tps)
-            false_positive_count += len(doc_fps)
-            false_negative_count += len(doc_fns)
+        texts = [document["text"] for document in documents]
+        use_batch_annotate = False
+        annotations_iter: Optional[Iterator] = None
+        try:
+            batch_probe = model_service.batch_annotate(texts[:1])
+            if (
+                batch_probe
+                and isinstance(batch_probe, list)
+                and len(batch_probe) > 0
+                and isinstance(batch_probe[0], Annotation)
+            ):
+                annotations_iter = iter(model_service.batch_annotate(texts))
+                use_batch_annotate = True
+        except NotImplementedError:
+            use_batch_annotate = False
+
+        if use_batch_annotate:
+            assert annotations_iter is not None
+            for document in documents:
+                true_positives[project["id"]][document["id"]] = {}
+                false_positives[project["id"]][document["id"]] = {}
+                false_negatives[project["id"]][document["id"]] = {}
+
+                annotations = next(annotations_iter)
+                predictions[document["id"]] = []
+                for annotation in annotations:
+                    predictions[document["id"]].append([annotation.start, annotation.end, annotation.label_id])
+                    concept_names[annotation.label_id] = annotation.label_name
+                    concept_anchors[annotation.label_id] = concept_anchors.get(annotation.label_id, [])
+                    concept_anchors[annotation.label_id].append(
+                        f"P{project['id']}/D{document['id']}/S{annotation.start}/E{ annotation.end}"
+                    )
+
+                predicted = {tuple(x) for x in predictions[document["id"]]}
+                actual = {tuple(x) for x in correct_cuis[project["id"]][document["id"]]}
+                doc_tps = list(predicted.intersection(actual))
+                doc_fps = list(predicted.difference(actual))
+                doc_fns = list(actual.difference(predicted))
+                true_positives[project["id"]][document["id"]] = doc_tps
+                false_positives[project["id"]][document["id"]] = doc_fps
+                false_negatives[project["id"]][document["id"]] = doc_fns
+                true_positive_count += len(doc_tps)
+                false_positive_count += len(doc_fps)
+                false_negative_count += len(doc_fns)
+        else:
+            for document in tqdm(documents, desc="Evaluating documents", total=len(documents), leave=False):
+                true_positives[project["id"]][document["id"]] = {}
+                false_positives[project["id"]][document["id"]] = {}
+                false_negatives[project["id"]][document["id"]] = {}
+
+                annotations = model_service.annotate(document["text"])
+                predictions[document["id"]] = []
+                for annotation in annotations:
+                    predictions[document["id"]].append([annotation.start, annotation.end, annotation.label_id])
+                    concept_names[annotation.label_id] = annotation.label_name
+                    concept_anchors[annotation.label_id] = concept_anchors.get(annotation.label_id, [])
+                    concept_anchors[annotation.label_id].append(
+                        f"P{project['id']}/D{document['id']}/S{annotation.start}/E{ annotation.end}"
+                    )
+
+                predicted = {tuple(x) for x in predictions[document["id"]]}
+                actual = {tuple(x) for x in correct_cuis[project["id"]][document["id"]]}
+                doc_tps = list(predicted.intersection(actual))
+                doc_fps = list(predicted.difference(actual))
+                doc_fns = list(actual.difference(predicted))
+                true_positives[project["id"]][document["id"]] = doc_tps
+                false_positives[project["id"]][document["id"]] = doc_fps
+                false_negatives[project["id"]][document["id"]] = doc_fns
+                true_positive_count += len(doc_tps)
+                false_positive_count += len(doc_fps)
+                false_negative_count += len(doc_fns)
 
     precision = true_positive_count / (true_positive_count + false_positive_count) if (true_positive_count + false_positive_count) != 0 else 0
     recall = true_positive_count / (true_positive_count + false_negative_count) if (true_positive_count + false_negative_count) != 0 else 0
@@ -126,12 +174,19 @@ def sanity_check_model_with_trainer_export(
             for span in spans:
                 tp_counts[span[2]] += 1
 
-    for cui in tp_counts.keys():
-        per_cui_prec[cui] = tp_counts[cui] / (tp_counts[cui] + fp_counts[cui])
-        per_cui_rec[cui] = tp_counts[cui] / (tp_counts[cui] + fn_counts[cui])
-        per_cui_f1[cui] = 2*(per_cui_prec[cui]*per_cui_rec[cui]) / (per_cui_prec[cui] + per_cui_rec[cui])
-        per_cui_name[cui] = concept_names[cui]
-        per_cui_anchors[cui] = ANCHOR_DELIMITER.join(concept_anchors[cui])
+    all_cuis = set(tp_counts.keys()) | set(fp_counts.keys()) | set(fn_counts.keys())
+    for cui in all_cuis:
+        tp = tp_counts.get(cui, 0)
+        fp = fp_counts.get(cui, 0)
+        fn = fn_counts.get(cui, 0)
+        tp_fp_sum = tp + fp
+        tp_fn_sum = tp + fn
+        per_cui_prec[cui] = tp / tp_fp_sum if tp_fp_sum > 0 else 0
+        per_cui_rec[cui] = tp / tp_fn_sum if tp_fn_sum > 0 else 0
+        prec_rec_sum = per_cui_prec[cui] + per_cui_rec[cui]
+        per_cui_f1[cui] = 2 * (per_cui_prec[cui] * per_cui_rec[cui]) / prec_rec_sum if prec_rec_sum > 0 else 0
+        per_cui_name[cui] = concept_names.get(cui, cui)
+        per_cui_anchors[cui] = ANCHOR_DELIMITER.join(concept_anchors.get(cui, []))
 
     if return_df:
         df = pd.DataFrame({
diff --git a/app/processors/prefix_cache.py b/app/processors/prefix_cache.py
new file mode 100644
index 00000000..efe375ce
--- /dev/null
+++ b/app/processors/prefix_cache.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import Any, Optional, Tuple
+import hashlib
+
+import torch
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+
+@dataclass
+class PrefixCacheEntry:
+    input_ids: torch.LongTensor  # tokenizer output for the prefix prompt, as stored by PrefixCache.get_prefix_entry
+    attention_mask: torch.LongTensor  # attention mask returned by the tokenizer alongside input_ids
+    past_key_values: Any  # `past_key_values` of the model's forward pass over the prefix (use_cache=True)
+    device: torch.device  # model device at the time the entry was computed
+
+
+class PrefixCache:
+    def __init__(self, max_entries: int = 10) -> None:
+        self._max_entries = max_entries  # entries beyond this count are evicted after each insertion
+        self._cache: "OrderedDict[str, PrefixCacheEntry]" = OrderedDict()
+
+    @staticmethod
+    def key(prefix_prompt: str) -> str:
+        """
+        Creates the cache key for the prefix prompt: the SHA-256 hex digest of its UTF-8 bytes.
+
+        Args:
+            prefix_prompt (str): The prefix prompt to hash.
+
+        Returns:
+            str: The hash key for the prefix prompt.
+        """
+        return hashlib.sha256(prefix_prompt.encode("utf-8")).hexdigest()
+
+    @staticmethod
+    def expand_past_key_values(past_key_values: Any, batch_size: int) -> Tuple:
+        """
+        Expands the first dimension of each cached tensor to the batch size (clones when it is 1).
+
+        Args:
+            past_key_values (Any): An iterable of per-layer iterables of tensors to expand.
+            batch_size (int): The batch size to expand to.
+
+        Returns:
+            Tuple: The expanded past key values as a tuple of per-layer tuples.
+        """
+        if batch_size == 1:
+            return tuple(
+                tuple(
+                    t.clone().contiguous() for t in layer
+                ) for layer in past_key_values
+            )
+        else:
+            return tuple(
+                tuple(
+                    tensor.expand(batch_size, *tensor.shape[1:]).contiguous() for tensor in layer
+                ) for layer in past_key_values
+            )
+
+    def get_prefix_entry(
+        self,
+        prefix_prompt: str,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
+    ) -> Optional[PrefixCacheEntry]:
+        """
+        Gets the prefix entry from the cache (marking it most recently used) or computes and caches it.
+
+        Args:
+            prefix_prompt (str): The prefix prompt to get the entry for.
+            model (PreTrainedModel): The model to create the prefix entry for.
+            tokenizer (PreTrainedTokenizerBase): The tokenizer to use.
+
+        Returns:
+            Optional[PrefixCacheEntry]: The entry, or None for an empty prompt or if the model returns no cache.
+        """
+        if not prefix_prompt:
+            return None
+        key = PrefixCache.key(prefix_prompt)
+        cached = self._cache.get(key)
+        if cached is not None and cached.device == model.device:  # a device mismatch forces a recompute
+            self._cache.move_to_end(key)
+            return cached
+
+        inputs = tokenizer(prefix_prompt, add_special_tokens=False, return_tensors="pt")
+        inputs.to(model.device)  # NOTE(review): relies on `.to()` moving the encoding in place - confirm
+        with torch.no_grad():
+            outputs = model(
+                input_ids=inputs.input_ids,
+                attention_mask=inputs.attention_mask,
+                use_cache=True,
+            )
+        if outputs.past_key_values is None:
+            return None
+
+        entry = PrefixCacheEntry(
+            input_ids=inputs.input_ids,
+            attention_mask=inputs.attention_mask,
+            past_key_values=outputs.past_key_values,
+            device=model.device,
+        )
+        self._cache[key] = entry
+        self._cache.move_to_end(key)
+        while len(self._cache) > self._max_entries:  # drop entries from the least recently used end
+            self._cache.popitem(last=False)
+        return entry
+
+    def clear(self) -> None:
+        """Removes all entries from the prefix cache."""
+        self._cache.clear()
diff --git a/app/processors/tagging.py b/app/processors/tagging.py
index 3b45bc44..81b2fa3d 100644
--- a/app/processors/tagging.py
+++ b/app/processors/tagging.py
@@ -3,6 +3,7 @@
 from torch import nn, mean, cat
 from transformers import PreTrainedModel
 from app.domain import TaggingScheme
+from app.utils import parse_label_into_id_and_name
 
 
 class TagProcessor:
@@ -13,69 +14,39 @@ def update_model_by_tagging_scheme(
         concepts: List[str],
         tagging_scheme: TaggingScheme,
     ) -> PreTrainedModel:
-        avg_weight = mean(model.classifier.weight, dim=0, keepdim=True)
-        avg_bias = mean(model.classifier.bias, dim=0, keepdim=True)
+        """
+        Updates the token classification head of the model by appending new labels according to the tagging scheme.
+
+        Args:
+            model (PreTrainedModel): The Hugging Face token classification model to be updated.
+            concepts (List[str]): The list of concept names to be added as new labels.
+            tagging_scheme (TaggingScheme): The tagging scheme used for the model, either "flat", "iob" or "iobes".
+
+        Returns:
+            PreTrainedModel: The updated model with new labels added to the classification head.
+        """
+        head_name, head = TagProcessor._get_classification_head(model)
+        avg_weight = mean(head.weight, dim=0, keepdim=True)
+        avg_bias = mean(head.bias, dim=0, keepdim=True)
         if tagging_scheme == TaggingScheme.IOB:
             for concept in concepts:
                 b_label = f"B-{concept}"
                 i_label = f"I-{concept}"
-                if b_label not in model.config.label2id.keys():
-                    model.config.label2id[b_label] = len(model.config.label2id)
-                    model.config.id2label[len(model.config.id2label)] = b_label
-                    model.classifier.weight = nn.Parameter(cat((model.classifier.weight, avg_weight), 0))
-                    model.classifier.bias = nn.Parameter(cat((model.classifier.bias, avg_bias), 0))
-                    model.classifier.out_features += 1
-                    model.num_labels += 1
-                if i_label not in model.config.label2id.keys():
-                    model.config.label2id[i_label] = len(model.config.label2id)
-                    model.config.id2label[len(model.config.id2label)] = i_label
-                    model.classifier.weight = nn.Parameter(cat((model.classifier.weight, avg_weight), 0))
-                    model.classifier.bias = nn.Parameter(cat((model.classifier.bias, avg_bias), 0))
-                    model.classifier.out_features += 1
-                    model.num_labels += 1
+                head = TagProcessor._append_label_to_head(model, head_name, head, b_label, avg_weight, avg_bias)
+                head = TagProcessor._append_label_to_head(model, head_name, head, i_label, avg_weight, avg_bias)
         elif tagging_scheme == TaggingScheme.IOBES:
             for concept in concepts:
                 s_label = f"S-{concept}"
                 b_label = f"B-{concept}"
                 i_label = f"I-{concept}"
                 e_label = f"E-{concept}"
-                if s_label not in model.config.label2id.keys():
-                    model.config.label2id[s_label] = len(model.config.label2id)
-                    model.config.id2label[len(model.config.id2label)] = s_label
-                    model.classifier.weight = nn.Parameter(cat((model.classifier.weight, avg_weight), 0))
-                    model.classifier.bias = nn.Parameter(cat((model.classifier.bias, avg_bias), 0))
-                    model.classifier.out_features += 1
-                    model.num_labels += 1
-                if b_label not in model.config.label2id.keys():
-                    model.config.label2id[b_label] = len(model.config.label2id)
-                    model.config.id2label[len(model.config.id2label)] = b_label
-                    model.classifier.weight = nn.Parameter(cat((model.classifier.weight, avg_weight), 0))
-                    model.classifier.bias = nn.Parameter(cat((model.classifier.bias, avg_bias), 0))
-                    model.classifier.out_features += 1
-                    model.num_labels += 1
-                if i_label not in model.config.label2id.keys():
-                    model.config.label2id[i_label] = len(model.config.label2id)
-                    model.config.id2label[len(model.config.id2label)] = i_label
-                    model.classifier.weight = nn.Parameter(cat((model.classifier.weight, avg_weight), 0))
-                    model.classifier.bias = nn.Parameter(cat((model.classifier.bias, avg_bias), 0))
-                    model.classifier.out_features += 1
-                    model.num_labels += 1
-                if e_label not in model.config.label2id.keys():
-                    model.config.label2id[e_label] = len(model.config.label2id)
-                    model.config.id2label[len(model.config.id2label)] = e_label
-                    model.classifier.weight = nn.Parameter(cat((model.classifier.weight, avg_weight), 0))
-                    model.classifier.bias = nn.Parameter(cat((model.classifier.bias, avg_bias), 0))
-                    model.classifier.out_features += 1
-                    model.num_labels += 1
+                head = TagProcessor._append_label_to_head(model, head_name, head, s_label, avg_weight, avg_bias)
+                head = TagProcessor._append_label_to_head(model, head_name, head, b_label, avg_weight, avg_bias)
+                head = TagProcessor._append_label_to_head(model, head_name, head, i_label, avg_weight, avg_bias)
+                head = TagProcessor._append_label_to_head(model, head_name, head, e_label, avg_weight, avg_bias)
         else:
             for concept in concepts:
-                if concept not in model.config.label2id.keys():
-                    model.config.label2id[concept] = len(model.config.label2id)
-                    model.config.id2label[len(model.config.id2label)] = concept
-                    model.classifier.weight = nn.Parameter(cat((model.classifier.weight, avg_weight), 0))
-                    model.classifier.bias = nn.Parameter(cat((model.classifier.bias, avg_bias), 0))
-                    model.classifier.out_features += 1
-                    model.num_labels += 1
+                head = TagProcessor._append_label_to_head(model, head_name, head, concept, avg_weight, avg_bias)
         return model
 
     @staticmethod
@@ -91,6 +62,24 @@ def generate_chuncks_by_tagging_scheme(
         window_size: int,
         stride: int,
     ) -> Iterable[Dict[str, Any]]:
+        """
+        Generates chunks of tokenized input along with corresponding labels and attention masks according to the tagging scheme.
+
+        Args:
+            annotations (List[Dict]): A list of annotations containing the entity spans and their corresponding CUIs.
+            tokenized (Dict[str, List]): The tokenized input containing "input_ids", "attention_mask" and "offset_mapping".
+            delfault_label_id (int): The label ID to be used for background tokens.
+            pad_token_id (int): The token ID used for padding the input sequences.
+            pad_label_id (int): The label ID used for padding the label sequences.
+            max_length (int): The maximum length of the input sequences after tokenization.
+            model (PreTrainedModel): The Hugging Face token classification model.
+            tagging_scheme (TaggingScheme): The tagging scheme used for the model, either "flat", "iob" or "iobes".
+            window_size (int): The size of the sliding window for chunking the input sequences.
+            stride (int): The stride of the sliding window for chunking the input sequences.
+
+        Yields:
+            Dict[str, Any]: A dictionary containing the chunked "input_ids", "labels" and "attention_mask" for the input sequence.
+        """
         if tagging_scheme == TaggingScheme.IOB:
             labels = [delfault_label_id] * len(tokenized["input_ids"])
             for annotation in annotations:
@@ -103,7 +92,9 @@ def generate_chuncks_by_tagging_scheme(
                 i_label_id = model.config.label2id.get(i_label, delfault_label_id)
                 first_token = True
                 for idx, offset_mapping in enumerate(tokenized["offset_mapping"]):
-                    if start <= offset_mapping[0] and offset_mapping[1] <= end:
+                    if offset_mapping[0] == offset_mapping[1]:
+                        continue
+                    if start < offset_mapping[1] and offset_mapping[0] < end:
                         if first_token:
                             labels[idx] = b_label_id
                             first_token = False
@@ -129,21 +120,22 @@ def generate_chuncks_by_tagging_scheme(
         elif tagging_scheme == TaggingScheme.IOBES:
             labels = [delfault_label_id] * len(tokenized["input_ids"])
             for annotation in annotations:
-                ann_start = annotation["start"]
-                ann_end = annotation["end"]
+                start = annotation["start"]
+                end = annotation["end"]
                 cui = annotation["cui"]
 
-                covered_indices = [
-                    idx for idx, off in enumerate(tokenized["offset_mapping"])
-                    if ann_start <= off[0] and off[1] <= ann_end
+                span_token_indices = [
+                    idx for idx, offset_mapping in enumerate(tokenized["offset_mapping"])
+                    if offset_mapping[0] != offset_mapping[1] and start < offset_mapping[1] and offset_mapping[0] < end
                 ]
-                if not covered_indices:
+                if not span_token_indices:
                     continue
+                span_token_indices = list(range(span_token_indices[0], span_token_indices[-1] + 1))
 
-                if len(covered_indices) == 1:
+                if len(span_token_indices) == 1:
                     s_label = f"S-{cui}"
                     s_id = model.config.label2id.get(s_label, delfault_label_id)
-                    labels[covered_indices[0]] = s_id
+                    labels[span_token_indices[0]] = s_id
                 else:
                     b_label = f"B-{cui}"
                     i_label = f"I-{cui}"
@@ -152,10 +144,10 @@ def generate_chuncks_by_tagging_scheme(
                     i_id = model.config.label2id.get(i_label, delfault_label_id)
                     e_id = model.config.label2id.get(e_label, delfault_label_id)
 
-                    labels[covered_indices[0]] = b_id
-                    for mid_idx in covered_indices[1:-1]:
+                    labels[span_token_indices[0]] = b_id
+                    for mid_idx in span_token_indices[1:-1]:
                         labels[mid_idx] = i_id
-                    labels[covered_indices[-1]] = e_id
+                    labels[span_token_indices[-1]] = e_id
 
             for start in range(0, len(tokenized["input_ids"]), stride):
                 end = min(start + window_size, len(tokenized["input_ids"]))
@@ -180,11 +172,13 @@ def generate_chuncks_by_tagging_scheme(
                 chunked_labels = [0] * len(chunked_input_ids)
                 chunked_attention_mask = tokenized["attention_mask"][start:end]
                 for annotation in annotations:
-                    annotation_start = annotation["start"]
-                    annotation_end = annotation["end"]
+                    start = annotation["start"]
+                    end = annotation["end"]
                     label_id = model.config.label2id.get(annotation["cui"], delfault_label_id)
                     for idx, offset_mapping in enumerate(chunked_offsets_mapping):
-                        if annotation_start <= offset_mapping[0] and offset_mapping[1] <= annotation_end:
+                        if offset_mapping[0] == offset_mapping[1]:
+                            continue
+                        if start < offset_mapping[1] and offset_mapping[0] < end:
                             chunked_labels[idx] = label_id
                 padding_length = max(0, max_length - len(chunked_input_ids))
                 chunked_input_ids += [pad_token_id] * padding_length
@@ -203,6 +197,17 @@ def aggregate_bioes_predictions(
         text: str,
         include_span_text: bool = False,
     ) -> List[Dict[str, Any]]:
+        """
+        Aggregates token-level predictions into entity-level predictions according to the IOB/IOBES tagging scheme.
+
+        Args:
+            df (pd.DataFrame): A DataFrame containing the token-level predictions.
+            text (str): The original input text from which the tokens were derived.
+            include_span_text (bool): If True, include the text of the entity span in the output. Defaults to False.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing the aggregated entity-level predictions.
+        """
         aggregated_entities = []
         current_entity = None
         current_label = None
@@ -218,7 +223,7 @@ def aggregate_bioes_predictions(
             if entity_tag.upper() == "O" or entity_tag == "":
                 if current_entity is not None:
                     aggregated_entities.append(
-                        TagProcessor._get_composed_entitiy(
+                        TagProcessor._get_composed_entity(
                             text,
                             current_entity,
                             current_label,
@@ -243,7 +248,7 @@ def aggregate_bioes_predictions(
             if prefix == "B":
                 if current_entity is not None:
                     aggregated_entities.append(
-                        TagProcessor._get_composed_entitiy(
+                        TagProcessor._get_composed_entity(
                             text,
                             current_entity,
                             current_label,
@@ -270,7 +275,7 @@ def aggregate_bioes_predictions(
                         token_count += 1
                     else:
                         aggregated_entities.append(
-                            TagProcessor._get_composed_entitiy(
+                            TagProcessor._get_composed_entity(
                                 text,
                                 current_entity,
                                 current_label,
@@ -288,7 +293,9 @@ def aggregate_bioes_predictions(
                 if current_entity is None:
                     single_ent = {"start": start, "end": end}
                     aggregated_entities.append(
-                        TagProcessor._get_composed_entitiy(text, single_ent, label, score, 1, include_span_text)
+                        TagProcessor._get_composed_entity(
+                            text, single_ent, label, score, 1, include_span_text
+                        )
                     )
                 else:
                     if label == current_label:
@@ -296,7 +303,7 @@ def aggregate_bioes_predictions(
                         current_score += score
                         token_count += 1
                         aggregated_entities.append(
-                            TagProcessor._get_composed_entitiy(
+                            TagProcessor._get_composed_entity(
                                 text,
                                 current_entity,
                                 current_label,
@@ -311,7 +318,7 @@ def aggregate_bioes_predictions(
                         token_count = 0
                     else:
                         aggregated_entities.append(
-                            TagProcessor._get_composed_entitiy(
+                            TagProcessor._get_composed_entity(
                                 text,
                                 current_entity,
                                 current_label,
@@ -320,10 +327,8 @@ def aggregate_bioes_predictions(
                                 include_span_text,
                             )
                         )
-                        single_ent = {"start": start, "end": end}
-                        aggregated_entities.append(
-                            TagProcessor._get_composed_entitiy(text, single_ent, label, score, 1, include_span_text)
-                        )
+
+                        # Close current entity and discard the stray E- with mismatched label
                         current_entity = None
                         current_label = None
                         current_score = 0.0
@@ -332,7 +337,7 @@ def aggregate_bioes_predictions(
             elif prefix == "S" or prefix is None:
                 if current_entity is not None:
                     aggregated_entities.append(
-                        TagProcessor._get_composed_entitiy(
+                        TagProcessor._get_composed_entity(
                             text,
                             current_entity,
                             current_label,
@@ -347,13 +352,13 @@ def aggregate_bioes_predictions(
                     token_count = 0
                 single_ent = {"start": start, "end": end}
                 aggregated_entities.append(
-                    TagProcessor._get_composed_entitiy(text, single_ent, label, score, 1, include_span_text)
+                    TagProcessor._get_composed_entity(text, single_ent, label, score, 1, include_span_text)
                 )
 
             else:
                 if current_entity is not None:
                     aggregated_entities.append(
-                        TagProcessor._get_composed_entitiy(
+                        TagProcessor._get_composed_entity(
                             text,
                             current_entity,
                             current_label,
@@ -369,7 +374,7 @@ def aggregate_bioes_predictions(
 
         if current_entity is not None:
             aggregated_entities.append(
-                TagProcessor._get_composed_entitiy(
+                TagProcessor._get_composed_entity(
                     text,
                     current_entity,
                     current_label,
@@ -382,7 +387,7 @@ def aggregate_bioes_predictions(
         return aggregated_entities
 
     @staticmethod
-    def _get_composed_entitiy(
+    def _get_composed_entity(
         text: str,
         entity: Dict,
         label: Optional[str],
@@ -390,13 +395,62 @@ def _get_composed_entitiy(
         token_count: int,
         include_span_text: bool,
     ) -> Dict[str, Any]:
+        label_id, label_name = parse_label_into_id_and_name(label)
         return {
             "entity_group": label,
-            "label_name": label,
-            "label_id": label,
+            "label_name": label_name,
+            "label_id": label_id,
             "start": entity["start"],
             "end": entity["end"],
             "score": score / token_count,
             "accuracy": score / token_count,
             "text": text[entity["start"]:entity["end"]] if include_span_text else None
         }
+
+    @staticmethod
+    def _get_classification_head(model: PreTrainedModel) -> tuple[str, nn.Linear]:
+        """Return the (name, module) of the last nn.Linear whose out_features equals config.num_labels."""
+        found_name, found_head = "", None
+        for candidate_name, candidate in model.named_modules():
+            if isinstance(candidate, nn.Linear) and candidate.out_features == model.config.num_labels:
+                found_name, found_head = candidate_name, candidate
+
+        if found_name and found_head is not None:
+            return found_name, found_head
+
+        # Fallback for test doubles/mocks that do not implement named_modules() like real HF models:
+        # try the conventional head attributes in order before giving up.
+        for fallback_name in ("classifier", "score"):
+            if hasattr(model, fallback_name):
+                return fallback_name, getattr(model, fallback_name)
+        raise AttributeError("Unable to locate the token classification head from model.named_modules().")
+
+    @staticmethod
+    def _set_module_by_name(model: PreTrainedModel, module_name: str, module: nn.Module) -> None:
+        """Assign `module` at the (possibly dotted) attribute path `module_name` under `model`."""
+        # Split off the final attribute; everything before the last dot names the owning submodule.
+        owner_path, dot, attr_name = module_name.rpartition(".")
+        # Without a dot the attribute lives directly on the model itself.
+        owner = model.get_submodule(owner_path) if dot else model
+        setattr(owner, attr_name, module)
+
+    @staticmethod
+    def _append_label_to_head(
+        model: PreTrainedModel, head_name: str, head: nn.Linear, label: str, avg_weight: Any, avg_bias: Any
+    ) -> nn.Linear:
+        """
+        Registers a new label on the model config and grows the head by one output for it.
+
+        Returns the head untouched when the label is already present in label2id.
+        """
+        label2id, id2label = model.config.label2id, model.config.id2label
+        if label not in label2id:
+            label2id[label] = len(label2id)
+            id2label[len(id2label)] = label
+            head.weight = nn.Parameter(cat((head.weight, avg_weight), 0))
+            head.bias = nn.Parameter(cat((head.bias, avg_bias), 0))
+            if hasattr(head, "out_features"):
+                head.out_features += 1
+            model.num_labels += 1
+            TagProcessor._set_module_by_name(model, head_name, head)
+        return head
diff --git a/app/processors/viterbi_decoder.py b/app/processors/viterbi_decoder.py
new file mode 100644
index 00000000..2919c383
--- /dev/null
+++ b/app/processors/viterbi_decoder.py
@@ -0,0 +1,499 @@
+import torch
+from typing import Any, Dict, List, Sequence, Optional
+from dataclasses import dataclass
+from typing import Mapping
+
+VITERBI_BIAS_KEYS = (  # Bias names; each matches a ViterbiDecoder.__init__ keyword and a key read by from_id2label.
+    "transition_bias_background_stay",
+    "transition_bias_background_to_start",
+    "transition_bias_inside_to_continue",
+    "transition_bias_inside_to_end",
+    "transition_bias_end_to_background",
+    "transition_bias_end_to_start",
+)
+
+
+@dataclass(frozen=True)
+class LabelInfo:
+    """Label-space mappings used for inference and span decoding (field notes describe what from_id2label stores)."""
+
+    boundary_label_lookup: Mapping[str, Mapping[str, int]]  # base label -> {boundary prefix ("B"/"I"/"E"/"S") -> token label id}
+    token_to_span_label: Mapping[int, int]  # token label id -> span class index
+    token_boundary_tags: Mapping[int, str | None]  # token label id -> boundary prefix, or None for background-like labels
+    span_class_names: tuple[str, ...]  # span class names by span index; from_id2label puts "O" at index 0
+    span_label_lookup: Mapping[str, int]  # span class name -> span class index
+    background_token_label: int  # token label id treated as background
+    background_span_label: int  # span class index of the background class
+
+
+class ViterbiDecoder:
+    """CRF-style Viterbi decoder for BIOES token classification."""
+
+    def __init__(
+        self,
+        label_info: LabelInfo,
+        transition_bias_background_stay: float = 0.0,
+        transition_bias_background_to_start: float = 0.0,
+        transition_bias_inside_to_continue: float = 0.0,
+        transition_bias_inside_to_end: float = 0.0,
+        transition_bias_end_to_background: float = 0.0,
+        transition_bias_end_to_start: float = 0.0,
+    ):  # Stores the label mappings and the six transition biases, then builds the score tables.
+        self.label_info = label_info
+        self.transition_bias_background_stay = transition_bias_background_stay
+        self.transition_bias_background_to_start = transition_bias_background_to_start
+        self.transition_bias_inside_to_continue = transition_bias_inside_to_continue
+        self.transition_bias_inside_to_end = transition_bias_inside_to_end
+        self.transition_bias_end_to_background = transition_bias_end_to_background
+        self.transition_bias_end_to_start = transition_bias_end_to_start
+
+        boundary_tags = set(self.label_info.token_boundary_tags.values())
+        self._has_explicit_end_labels = "E" in boundary_tags  # True when at least one label carries an "E" prefix
+        self._has_explicit_single_labels = "S" in boundary_tags  # True when at least one label carries an "S" prefix
+        self._precompute_scores()  # reads the biases and flags assigned above, so it must run last
+
+    @classmethod
+    def from_id2label(
+        cls,
+        id2label: Dict[int, str],
+        viterbi_biases: Optional[Dict[str, float]] = None,
+    ) -> Optional["ViterbiDecoder"]:
+        """
+        Constructs a Viterbi decoder from an id2label mapping.
+
+        Args:
+            id2label (Dict[int, str]): A mapping of label ids to label names; ids are coerced with int().
+            viterbi_biases (Optional[Dict[str, float]]): A mapping of Viterbi bias keys to bias values; missing keys default to 0.0.
+
+        Returns:
+            Optional[ViterbiDecoder]: A configured Viterbi decoder, or None if no background label is found, a base label lacks B or I, or a base label has only one of E and S.
+        """
+        O_LABEL = "O"
+        BOUNDARY_PREFIXES = {"B", "I", "E", "S"}
+
+        span_class_names: List[str] = [O_LABEL]  # span index 0 is reserved for the background class
+        span_label_lookup: Dict[str, int] = {O_LABEL: 0}
+        boundary_label_lookup: Dict[str, Dict[str, int]] = {}
+        token_to_span_label: Dict[int, int] = {}
+        token_boundary_tags: Dict[int, str | None] = {}
+        background_idx: Optional[int] = None
+
+        for idx, name in id2label.items():
+            idx = int(idx)  # tolerate string ids
+            if name == O_LABEL:  # an exact "O" always becomes the background token, overriding any earlier stand-in
+                background_idx = idx
+                token_to_span_label[idx] = span_label_lookup[O_LABEL]
+                token_boundary_tags[idx] = None
+                continue
+
+            parts = name.split("-", 1)
+            if len(parts) != 2:  # no "-" separator: not a prefixed tag, so map it to the background span
+                if background_idx is None:  # the first such label stands in as background unless "O" is seen
+                    background_idx = idx
+                token_to_span_label[idx] = span_label_lookup[O_LABEL]
+                token_boundary_tags[idx] = None
+                continue
+
+            boundary, base_label = parts
+            if boundary not in BOUNDARY_PREFIXES:  # has a "-" but an unknown prefix: handled like the untagged case
+                if background_idx is None:
+                    background_idx = idx
+                token_to_span_label[idx] = span_label_lookup[O_LABEL]
+                token_boundary_tags[idx] = None
+                continue
+
+            span_idx = span_label_lookup.get(base_label)
+            if span_idx is None:  # first time this base label is seen: allocate the next span index
+                span_idx = len(span_class_names)
+                span_class_names.append(base_label)
+                span_label_lookup[base_label] = span_idx
+
+            token_to_span_label[idx] = span_idx
+            token_boundary_tags[idx] = boundary
+            mapping = boundary_label_lookup.setdefault(base_label, {})
+            mapping[boundary] = idx
+
+        if background_idx is None:  # no background label at all: caller gets no decoder
+            return None
+
+        for base_label, mapping in boundary_label_lookup.items():
+            present = set(mapping)  # boundary prefixes available for this base label
+            if "B" not in present or "I" not in present:  # B and I are both required
+                return None
+            if "E" in present and "S" not in present:  # E and S must appear together ...
+                return None
+            if "S" in present and "E" not in present:  # ... or not at all
+                return None
+
+        label_info = LabelInfo(
+            boundary_label_lookup={key: dict(value) for key, value in boundary_label_lookup.items()},
+            token_to_span_label=dict(token_to_span_label),
+            token_boundary_tags=dict(token_boundary_tags),
+            span_class_names=tuple(span_class_names),
+            span_label_lookup=dict(span_label_lookup),
+            background_token_label=background_idx,
+            background_span_label=span_label_lookup[O_LABEL],
+        )
+        biases = viterbi_biases or {}  # None or an empty mapping means every bias is 0.0
+        return cls(
+            label_info=label_info,
+            transition_bias_background_stay=biases.get("transition_bias_background_stay", 0.0),
+            transition_bias_background_to_start=biases.get("transition_bias_background_to_start", 0.0),
+            transition_bias_inside_to_continue=biases.get("transition_bias_inside_to_continue", 0.0),
+            transition_bias_inside_to_end=biases.get("transition_bias_inside_to_end", 0.0),
+            transition_bias_end_to_background=biases.get("transition_bias_end_to_background", 0.0),
+            transition_bias_end_to_start=biases.get("transition_bias_end_to_start", 0.0),
+        )
+
+    def apply_viterbi_to_hf_pipeline_output(
+        self,
+        pipeline_output: List[Dict[str, Any]],
+        id2label: Dict[int, str],
+    ) -> List[Dict[str, Any]]:
+        """
+        Applies Viterbi decoding to a list of pipeline results.
+
+        Args:
+            pipeline_output (List[Dict[str, Any]]): Per-token results; the "entity" and "score" keys are read.
+            id2label (Dict[int, str]): A mapping of label ids to label names; string ids (e.g. "3") are accepted.
+
+        Returns:
+            List[Dict[str, Any]]: Shallow copies of the results with "entity" replaced by the decoded label.
+        """
+        # Normalise ids to int (as from_id2label does) so they can index the tensor and match decoded ids.
+        id2label = {int(label_id): label for label_id, label in id2label.items()}
+        label2id = {label: label_id for label_id, label in id2label.items()}
+        log_probs = torch.full((len(pipeline_output), len(id2label)), -1e9)
+
+        def _resolve_label_id(entity_label: str) -> int:
+            if entity_label in label2id:
+                return label2id[entity_label]
+            suffix = entity_label.split("-")[-1]
+            if suffix.isdigit():
+                suffix_id = int(suffix)
+                if suffix_id in id2label:
+                    return suffix_id
+            return label2id.get("O", 0)
+
+        for i, result in enumerate(pipeline_output):
+            label_id = _resolve_label_id(result.get("entity", "O"))
+            score = result.get("score", 0.5)
+            log_probs[i, label_id] = torch.log(torch.tensor(score))
+
+        viterbi_ids = self.decode(log_probs)
+        corrected_results = []
+        for result, viterbi_id in zip(pipeline_output, viterbi_ids):
+            corrected_result = result.copy()
+            corrected_result["entity"] = id2label.get(viterbi_id, "O")
+            corrected_results.append(corrected_result)
+        return corrected_results
+
+    def decode(self, token_logprobs: torch.Tensor) -> List[int]:
+        """
+        Decodes one log probability tensor into label ids.
+
+        Args:
+            token_logprobs (torch.Tensor): A log probability tensor with the shape (seq_len, num_classes).
+        Returns:
+            List[int]: The list of label ids with length of (seq_len); per-token argmax is used when no path score is finite.
+        """
+        if token_logprobs.ndim != 2:
+            raise ValueError("Token logprobs must have shape (seq_len, num_classes)")
+
+        seq_len, num_classes = token_logprobs.shape
+        if seq_len == 0:
+            return []
+
+        device = token_logprobs.device
+        dtype = token_logprobs.dtype
+        if self._start_scores.device == device and self._start_scores.dtype == dtype:  # precomputed tables already match the input
+            start_scores = self._start_scores
+            end_scores = self._end_scores
+            transition_scores = self._transition_scores
+        else:
+            device_index = device.index if device.index is not None else -1  # -1 stands in for "no explicit index" in the cache key
+            cache_key = (device.type, device_index, dtype)
+            cached_scores = self._score_cache.get(cache_key)
+            if cached_scores is None:  # convert the tables once per (device, dtype) and reuse them afterwards
+                cached_scores = (
+                    self._start_scores.to(device=device, dtype=dtype),
+                    self._end_scores.to(device=device, dtype=dtype),
+                    self._transition_scores.to(device=device, dtype=dtype),
+                )
+                self._score_cache[cache_key] = cached_scores
+            start_scores, end_scores, transition_scores = cached_scores
+
+        scores = token_logprobs[0] + start_scores  # best score per label after the first token
+        backpointers = torch.empty((seq_len - 1, num_classes), device=device, dtype=torch.int64)
+
+        for idx in range(1, seq_len):
+            transitions = scores.unsqueeze(1) + transition_scores  # entry [prev, next] = score at prev + transition prev -> next
+            best_scores, best_paths = transitions.max(dim=0)  # for each next label, keep the best previous label
+            scores = best_scores + token_logprobs[idx]
+            backpointers[idx - 1] = best_paths
+
+        if not torch.isfinite(scores).any():  # no label has a finite score: fall back to independent per-token argmax
+            return token_logprobs.argmax(dim=1).tolist()
+
+        scores = scores + end_scores  # end scores are applied only at the final position
+        last_label = scores.argmax()
+        path = torch.empty((seq_len,), device=device, dtype=torch.int64)
+        path[-1] = last_label
+        for idx in range(seq_len - 2, -1, -1):  # walk the backpointers from the last token to the first
+            last_label = backpointers[idx, last_label]
+            path[idx] = last_label
+        return path.tolist()
+
+    def decode_many(
+        self,
+        token_logprobs_list: Sequence[torch.Tensor],
+        device: Optional[torch.device] = None,
+        batch_size: int = 128,
+    ) -> List[List[int]]:
+        """
+        Decodes multiple log probability tensors into a list of label ids.
+
+        Args:
+            token_logprobs_list (Sequence[torch.Tensor]): A list of log probability tensors with the shape (seq_len, num_classes).
+            device (Optional[torch.device]): The device to run the decoding on; batching is only used for CUDA devices.
+            batch_size (int): The batch size used for GPU decoding; must be positive. Defaults to 128.
+
+        Returns:
+            List[List[int]]: One list of label ids per input tensor, in input order.
+
+        """
+        if not token_logprobs_list:
+            return []
+        if batch_size <= 0:
+            raise ValueError("batch_size must be positive")
+        if device is None or device.type != "cuda":
+            return [self.decode(scores) for scores in token_logprobs_list]
+
+        if any(scores.ndim != 2 for scores in token_logprobs_list):  # validate rank before reading shape[0]
+            raise ValueError("decode_many expects [seq_len, num_classes] tensors")
+        lengths = [int(scores.shape[0]) for scores in token_logprobs_list]
+        if any(length <= 0 for length in lengths):  # empty sequences cannot be padded into a batch; decode one by one
+            return [self.decode(scores) for scores in token_logprobs_list]
+
+        order = sorted(  # longest first, so each batch groups sequences of similar length
+            range(len(token_logprobs_list)),
+            key=lambda idx: lengths[idx],
+            reverse=True,
+        )
+        results: List[List[int] | None] = [None] * len(token_logprobs_list)
+        for start in range(0, len(order), batch_size):
+            batch_indices = order[start:start + batch_size]
+            batch_scores = [token_logprobs_list[idx] for idx in batch_indices]
+            batch_lengths = [lengths[idx] for idx in batch_indices]
+            current_batch_size = len(batch_scores)  # kept separate so the `batch_size` argument is never overwritten
+            if current_batch_size == 0:
+                continue
+            num_classes = int(batch_scores[0].shape[1])
+            max_len = int(max(batch_lengths))
+            dtype = batch_scores[0].dtype
+            for scores in batch_scores:
+                if int(scores.shape[1]) != num_classes:
+                    raise ValueError("All decode_many tensors must share the same class dimension")
+
+            emissions = torch.full(  # padded positions stay at -inf
+                (current_batch_size, max_len, num_classes),
+                -float("inf"),
+                device=device,
+                dtype=dtype,
+            )
+            for row, (scores, length) in enumerate(zip(batch_scores, batch_lengths)):
+                if scores.device != device or scores.dtype != dtype:
+                    scores = scores.to(device=device, dtype=dtype)
+                emissions[row, :length] = scores
+
+            lengths_t = torch.tensor(batch_lengths, device=device, dtype=torch.long)
+            device_index = device.index if device.index is not None else -1
+            cache_key = (device.type, device_index, dtype)
+            cached_scores = self._score_cache.get(cache_key)
+            if cached_scores is None:  # convert the score tables once per (device, dtype)
+                cached_scores = (
+                    self._start_scores.to(device=device, dtype=dtype),
+                    self._end_scores.to(device=device, dtype=dtype),
+                    self._transition_scores.to(device=device, dtype=dtype),
+                )
+                self._score_cache[cache_key] = cached_scores
+            start_scores, end_scores, transition_scores = cached_scores
+
+            scores = emissions[:, 0, :] + start_scores[None, :]
+            backpointer_dtype = torch.int16 if num_classes <= 32767 else torch.int32
+            backpointers = torch.zeros(
+                (max_len - 1, current_batch_size, num_classes),
+                device=device,
+                dtype=backpointer_dtype,
+            )
+            batch_arange = torch.arange(current_batch_size, device=device, dtype=torch.long)
+
+            for step in range(1, max_len):
+                active = lengths_t > step  # rows that still have a token at this step
+                if not bool(active.any().item()):
+                    break
+                active_idx = batch_arange[active]
+                transitions = scores[active_idx].unsqueeze(2) + transition_scores
+                best_scores, best_paths = transitions.max(dim=1)
+                scores[active_idx] = best_scores + emissions[active_idx, step, :]
+                backpointers[step - 1, active_idx] = best_paths.to(backpointer_dtype)
+
+            bad_rows = ~torch.isfinite(scores).any(dim=1)  # rows with no finite score fall back to argmax below
+            scores = scores + end_scores[None, :]
+            last_labels = scores.argmax(dim=1)
+            paths = torch.zeros((current_batch_size, max_len), device=device, dtype=torch.long)
+            paths[batch_arange, lengths_t - 1] = last_labels
+            for step in range(max_len - 2, -1, -1):
+                active = lengths_t > (step + 1)  # only rows whose sequence extends past this step are backtracked
+                if not bool(active.any().item()):
+                    continue
+                active_idx = batch_arange[active]
+                next_labels = paths[active_idx, step + 1]
+                prev = backpointers[step, active_idx, next_labels].to(torch.long)
+                paths[active_idx, step] = prev
+
+            if bool(bad_rows.any().item()):
+                fallback_paths = emissions.argmax(dim=2)
+                bad_idx = batch_arange[bad_rows]
+                for idx in bad_idx.tolist():
+                    length = int(lengths_t[idx].item())
+                    paths[idx, :length] = fallback_paths[idx, :length]
+
+            decoded_batch: List[List[int]] = []
+            for row, length in enumerate(batch_lengths):
+                decoded_batch.append(paths[row, :length].tolist())  # drop the padded tail
+
+            for original_idx, decoded_seq in zip(batch_indices, decoded_batch):
+                results[original_idx] = decoded_seq  # restore the caller's ordering
+
+        # Every index in `order` is decoded above, so a remaining None signals an internal error.
+        output: List[List[int]] = []
+        for decoded in results:
+            if decoded is None:
+                raise RuntimeError("Internal decode_many failure: missing decoded sequence")
+            output.append(decoded)
+        return output
+
+    def _is_valid_transition(  # Decides whether label `prev` may be followed by label `next` under the tagging scheme.
+        self,
+        prev_tag: str | None,
+        prev_span: int | None,
+        next_tag: str | None,
+        next_span: int | None,
+        background_token_idx: int,
+        background_span_idx: int,
+        next_idx: int,
+    ) -> bool:
+        next_is_background = next_span == background_span_idx or next_idx == background_token_idx
+        if (next_span is None or next_tag is None) and not next_is_background:  # untagged/unmapped target that is not background
+            return False
+
+        if prev_span is None or prev_tag is None:  # previous label has no boundary tag: only background or a span start may follow
+            return next_is_background or next_tag in {"B", "S"}
+
+        prev_is_background = prev_span == background_span_idx
+
+        if prev_is_background:  # same rule when the previous label maps to the background span
+            return next_is_background or next_tag in {"B", "S"}
+
+        if prev_tag in {"E", "S"}:  # a span just closed: go to background or open a new span
+            return next_is_background or next_tag in {"B", "S"}
+
+        if prev_tag in {"B", "I"}:  # a span is open
+            same_span = prev_span == next_span
+            if same_span and next_tag in {"I", "E"}:  # continue or close the same span
+                return True
+            if not self._has_explicit_end_labels:  # without "E" labels an open span may end implicitly
+                return next_is_background or next_tag in {"B", "S"}
+            return False
+
+        return False  # any other previous tag value is rejected
+
+    def _precompute_scores(self) -> None:  # Builds the start, end and transition score tables and an empty per-device cache.
+        num_classes = len(self.label_info.token_to_span_label)  # assumes label ids are exactly 0..num_classes-1 — TODO confirm
+        self._start_scores = torch.full((num_classes,), -1e9, dtype=torch.float32)  # -1e9 = large finite penalty; allowed entries are overwritten below
+        self._end_scores = torch.full((num_classes,), -1e9, dtype=torch.float32)
+        self._transition_scores = torch.full((num_classes, num_classes), -1e9, dtype=torch.float32)  # indexed [prev, next]
+        self._score_cache: dict[  # (device type, device index, dtype) -> (start, end, transition) tables converted for that target
+            tuple[str, int, torch.dtype],
+            tuple[torch.Tensor, torch.Tensor, torch.Tensor],
+        ] = {}
+
+        background_token_idx = self.label_info.background_token_label
+        background_span_idx = self.label_info.background_span_label
+        token_boundary_tags = self.label_info.token_boundary_tags
+        token_to_span_label = self.label_info.token_to_span_label
+
+        for idx in range(num_classes):
+            tag = token_boundary_tags.get(idx)
+            span_label = token_to_span_label.get(idx)
+            if tag in {"B", "S"} or idx == background_token_idx:  # a sequence may start on background or on a span start
+                self._start_scores[idx] = 0.0
+            if tag in {"E", "S"} or idx == background_token_idx or (  # a sequence may end on background, a closed span, ...
+                not self._has_explicit_end_labels and tag in {"B", "I"}  # ... or an open span when the scheme has no "E" labels
+            ):
+                self._end_scores[idx] = 0.0
+            elif span_label == background_span_idx:  # NOTE: pairs with the end-score `if` above; covers other labels mapped to the background span
+                self._start_scores[idx] = 0.0
+                self._end_scores[idx] = 0.0
+
+            for next_idx in range(num_classes):
+                next_tag = token_boundary_tags.get(next_idx)
+                next_span_label = token_to_span_label.get(next_idx)
+                if self._is_valid_transition(  # invalid transitions keep the -1e9 fill value
+                    prev_tag=tag,
+                    prev_span=span_label,
+                    next_tag=next_tag,
+                    next_span=next_span_label,
+                    background_token_idx=background_token_idx,
+                    background_span_idx=background_span_idx,
+                    next_idx=next_idx,
+                ):
+                    self._transition_scores[idx, next_idx] = self._transition_bias(  # valid transitions score their configured bias (0.0 by default)
+                        prev_tag=tag,
+                        prev_span=span_label,
+                        next_tag=next_tag,
+                        next_span=next_span_label,
+                        background_token_idx=background_token_idx,
+                        background_span_idx=background_span_idx,
+                        prev_idx=idx,
+                        next_idx=next_idx,
+                    )
+
+    def _transition_bias(
+        self,
+        *,
+        prev_tag: str | None,
+        prev_span: int | None,
+        next_tag: str | None,
+        next_span: int | None,
+        background_token_idx: int,
+        background_span_idx: int,
+        prev_idx: int,
+        next_idx: int,
+    ) -> float:
+        """Return the configured bias for the prev -> next transition, or 0.0 when none applies."""
+        from_background = prev_idx == background_token_idx or prev_span == background_span_idx
+        to_background = next_idx == background_token_idx or next_span == background_span_idx
+        to_start = next_tag in {"B", "S"}
+        within_same_span = prev_span == next_span
+        bias = 0.0
+        if from_background:
+            # Staying in background or opening a span from it.
+            if to_background:
+                bias = self.transition_bias_background_stay
+            elif to_start:
+                bias = self.transition_bias_background_to_start
+        elif prev_tag in {"B", "I"}:
+            # Inside an open span: only same-span continue/end transitions carry a bias.
+            if within_same_span and next_tag == "I":
+                bias = self.transition_bias_inside_to_continue
+            elif within_same_span and next_tag == "E":
+                bias = self.transition_bias_inside_to_end
+        elif prev_tag in {"E", "S"}:
+            # A span has just been closed.
+            if to_background:
+                bias = self.transition_bias_end_to_background
+            elif to_start:
+                bias = self.transition_bias_end_to_start
+        return bias
diff --git a/app/trainers/huggingface_llm_trainer.py b/app/trainers/huggingface_llm_trainer.py
index 41900ff6..caf1b683 100644
--- a/app/trainers/huggingface_llm_trainer.py
+++ b/app/trainers/huggingface_llm_trainer.py
@@ -23,9 +23,9 @@
     DataCollatorForLanguageModeling,
     Trainer,
 )
-from peft import LoraConfig, get_peft_model # type: ignore
 from app.management.model_manager import ModelManager
 from app.management.tracker_client import TrackerClient
+from app.processors.lora_adaptor import LoraAdaptor
 from app.utils import (
     reset_random_seed,
     non_default_device_is_available,
@@ -35,6 +35,7 @@
     get_default_chat_template,
     get_default_system_prompt,
     get_model_data_package_base_name,
+    save_model_to_clean_directory,
 )
 from app.trainers.base import SupervisedTrainer, UnsupervisedTrainer
 from app.domain import (
@@ -249,9 +250,10 @@ def make_conversation(example: Dict[str, Any]) -> Dict[str, Any]:
                 }
             elif "question" in example and "answer" in example:
                 # Question/Answer format
+                instruction = example["instruction"] if "instruction" in example else None
                 return {
                     "prompt": [
-                        {"role": "system", "content": system_prompt},
+                        {"role": "system", "content": instruction if instruction is not None else system_prompt},
                         {"role": "user", "content": example.get("question")},
                     ],
                     "answer": example["answer"],
@@ -357,7 +359,23 @@ def run(
                 else:
                     logger.debug(f"Found a chat template in the tokenizer:\n {tokenizer.chat_template}")
 
-                lora_config = LoraConfig(
+                tokenized_lengths = train_dataset.map(
+                    lambda x: {
+                        "tokens": tokenizer.apply_chat_template(
+                            x["prompt"],
+                            add_generation_prompt=True,
+                            tokenize=True,
+                        )
+                    },
+                    batched=True,
+                ).map(lambda x: {"length": len(x["tokens"])})["length"]
+                max_prompt_length = max(tokenized_lengths) + 1
+                training_token_count = sum(tokenized_lengths)
+                logger.debug(f"Total training tokens: {training_token_count}")
+                self._tracker_client.log_training_token_count(training_token_count)
+
+                peft_model, _ = LoraAdaptor.apply(
+                    model=model,
                     task_type="CAUSAL_LM",
                     r=8,
                     lora_alpha=32,
@@ -368,23 +386,11 @@ def run(
                     ],
                 )
 
-                peft_model = get_peft_model(model, lora_config)
-
                 mlflow_logging_callback = MLflowLoggingCallback(self._tracker_client)
                 cancel_event_check_callback = CancelEventCheckCallback(self._cancel_event)
                 trainer_callbacks = [mlflow_logging_callback, cancel_event_check_callback]
 
                 trainer_type = training_params.get("trainer_type", LlmTrainerType.GRPO.value).lower()
-                max_prompt_length = max(train_dataset.map(
-                    lambda x: {
-                        "tokens": tokenizer.apply_chat_template(
-                            x["prompt"],
-                            add_generation_prompt=True,
-                            tokenize=True
-                        )
-                    },
-                    batched=True,
-                ).map(lambda x: {"length": len(x["tokens"])})["length"]) + 1
                 if trainer_type == LlmTrainerType.PPO.value:
                     raise NotImplementedError("PPO training is not yet supported for HuggingFace LLM models")
                 elif trainer_type == LlmTrainerType.GRPO.value:
@@ -435,11 +441,12 @@ def run(
                     model_pack_file_name = f"{ModelType.HUGGINGFACE_LLM.value}_{run_id}{model_pack_file_ext}"
                     retrained_model_pack_path = os.path.join(self._retrained_models_dir, model_pack_file_name)
                     model = peft_model.merge_and_unload()
-                    model.save_pretrained(
+                    save_model_to_clean_directory(
+                        model,
+                        tokenizer,
                         trained_model_directory,
                         safe_serialization=(self._config.TRAINING_SAFE_MODEL_SERIALISATION == "true"),
                     )
-                    tokenizer.save_pretrained(trained_model_directory)
                     create_model_data_package(trained_model_directory, retrained_model_pack_path)
                     model_uri = self._tracker_client.save_model(
                         retrained_model_pack_path,
@@ -689,7 +696,12 @@ def _evaluate_with_rewards(
             messages = example["prompt"]
             answer = example.get("answer", "")
 
-            prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            prompt_text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                enable_thinking=False,
+            )
             inputs = tokenizer(prompt_text, return_tensors="pt")
             input_ids = inputs["input_ids"]
             attention_mask = inputs.get("attention_mask")
@@ -792,7 +804,8 @@ def run(
                 copied_model_directory = None
                 if self._model_service.is_quantised:
                     logger.info("Use the LoRA adaptor for the quantised model...")
-                    lora_config = LoraConfig(
+                    model, _ = LoraAdaptor.apply(
+                        model=self._model_service.model,
                         task_type="CAUSAL_LM",
                         r=8,
                         lora_alpha=32,
@@ -802,7 +815,6 @@ def run(
                             "gate_proj", "up_proj", "down_proj",
                         ],
                     )
-                    model = get_peft_model(self._model_service.model, lora_config)
                     tokenizer = self._model_service.tokenizer
                 else:
                     logger.info("Loading a new model copy for training...")
@@ -851,6 +863,10 @@ def run(
                     remove_columns=["text"],
                 )
 
+                training_token_count = sum(len(example["input_ids"]) for example in train_dataset)
+                logger.debug(f"Total training tokens: {training_token_count}")
+                self._tracker_client.log_training_token_count(training_token_count)
+
                 data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
                 training_args = TrainingArguments(
@@ -897,20 +913,19 @@ def run(
                         os.path.dirname(retrained_model_pack_path),
                         get_model_data_package_base_name(retrained_model_pack_path),
                     )
-                    if hasattr(model, "merge_and_unload"):
+                    use_trained_directory = hasattr(model, "merge_and_unload")
+                    if use_trained_directory:
                         model = model.merge_and_unload()
-                        model.save_pretrained(
-                            trained_model_directory,
-                            safe_serialization=(self._config.TRAINING_SAFE_MODEL_SERIALISATION == "true"),
-                        )
-                        tokenizer.save_pretrained(trained_model_directory)
-                        create_model_data_package(trained_model_directory, retrained_model_pack_path)
-                    else:
-                        model.save_pretrained(
-                            copied_model_directory, # type: ignore
-                            safe_serialization=(self._config.TRAINING_SAFE_MODEL_SERIALISATION == "true"),
-                        )
-                        create_model_data_package(copied_model_directory, retrained_model_pack_path)    # type: ignore
+                    save_model_to_clean_directory(
+                        model,
+                        tokenizer,
+                        trained_model_directory if use_trained_directory else copied_model_directory, # type: ignore
+                        safe_serialization=(self._config.TRAINING_SAFE_MODEL_SERIALISATION == "true"),
+                    )
+                    create_model_data_package(
+                        trained_model_directory if use_trained_directory else copied_model_directory, # type: ignore
+                        retrained_model_pack_path,
+                    )
 
                     self._tracker_client.log_model_config(model.config.to_dict())   # type: ignore
 
diff --git a/app/trainers/huggingface_ner_trainer.py b/app/trainers/huggingface_ner_trainer.py
index 74f691cc..9502786a 100644
--- a/app/trainers/huggingface_ner_trainer.py
+++ b/app/trainers/huggingface_ner_trainer.py
@@ -1,6 +1,7 @@
 import os
 import logging
 import math
+import inspect
 import torch
 import gc
 import json
@@ -17,7 +18,6 @@
 from sklearn.metrics import precision_recall_fscore_support, accuracy_score as sklearn_accuracy_score
 from sklearn.utils.class_weight import compute_class_weight
 from seqeval.metrics import classification_report, accuracy_score as seqeval_accuracy_score
-from scipy.special import softmax
 from transformers import __version__ as transformers_version
 from transformers import (
     AutoModelForMaskedLM,
@@ -37,6 +37,7 @@
 from app.management.model_manager import ModelManager
 from app.management.tracker_client import TrackerClient
 from app.processors.metrics_collector import get_stats_from_trainer_export, sanity_check_model_with_trainer_export
+from app.processors.lora_adaptor import LoraAdaptor
 from app.utils import (
     filter_by_concept_ids,
     reset_random_seed,
@@ -45,6 +46,8 @@
     get_model_data_package_extension,
     ensure_tensor_contiguity,
     get_model_data_package_base_name,
+    freeze_hf_model_params_by_names,
+    save_model_to_clean_directory,
 )
 from app.trainers.base import UnsupervisedTrainer, SupervisedTrainer
 from app.domain import ModelType, DatasetSplit, HfTransformerBackbone, Device, TrainerBackend, TaggingScheme
@@ -57,6 +60,7 @@
 
 
 class _HuggingFaceNerTrainerCommon(object):
+    TRAINING_CONTEXT_CAP = 512
 
     @staticmethod
     def deploy_model(
@@ -71,6 +75,66 @@ def deploy_model(
         model_service.tokenizer = tokenizer
         logger.info("Retrained model deployed")
 
+    @staticmethod
+    def _create_training_arguments(**kwargs: Any) -> TrainingArguments:
+        """Build ``TrainingArguments`` from only the keywords its ``__init__`` signature accepts."""
+        accepted = inspect.signature(TrainingArguments.__init__).parameters
+        # The evaluation-strategy keyword is spelt differently across transformers versions, so a
+        # spelling the signature lacks is renamed to the other one before filtering.
+        for given, other in (("eval_strategy", "evaluation_strategy"), ("evaluation_strategy", "eval_strategy")):
+            if given in kwargs and given not in accepted:
+                kwargs[other] = kwargs.pop(given)
+
+        supported = {name: value for name, value in kwargs.items() if name in accepted}
+        unsupported = kwargs.keys() - supported.keys()
+        if unsupported:
+            logger.debug(
+                "Ignoring unsupported training arguments for this transformers installed: %s",sorted(unsupported)
+            )
+        return TrainingArguments(**supported)
+
+    @staticmethod
+    def _resolve_safe_max_length(model: PreTrainedModel, tokenizer: Optional[PreTrainedTokenizerBase]) -> int:
+        """Return the smallest of the model, tokenizer and ``TRAINING_CONTEXT_CAP`` limits, never below 1."""
+        cap = _HuggingFaceNerTrainerCommon.TRAINING_CONTEXT_CAP
+        # A missing or falsy ``max_position_embeddings`` falls back to the cap.
+        model_limit = int(getattr(model.config, "max_position_embeddings", None) or cap)
+        # Without a tokenizer (or without its ``model_max_length``) the model limit is reused.
+        tokenizer_limit = model_limit if tokenizer is None else int(getattr(tokenizer, "model_max_length", model_limit))
+        return max(1, min(model_limit, tokenizer_limit, cap))
+
+    @staticmethod
+    def _calculate_batch_sizes(training_params: Dict, device_config: str) -> Dict[str, int]:
+        """
+        Derive the worker count, per-device batch sizes and accumulation steps for a training run.
+
+        Args:
+            training_params (Dict): Training parameters; an optional "scaling_factor" entry (floored at 1) is read.
+            device_config (str): The configured device, lower-cased and prefix-matched against the Device values.
+
+        Returns:
+            Dict[str, int]: The worker count, per-device train/eval batch sizes and accumulation steps.
+        """
+        factor = max(1, int(training_params.get("scaling_factor", 1)))
+        cores = os.cpu_count() or 1
+        device_name = device_config.lower()
+        on_cuda = device_name.startswith(Device.GPU.value) and torch.cuda.is_available()
+        on_mps = device_name.startswith(Device.MPS.value) and torch.backends.mps.is_available()
+        if on_cuda or on_mps:
+            workers, cap, micro_batch = max(1, min(4, cores)), (16 if on_cuda else 8), max(1, factor * 2)
+        else:
+            workers = max(1, min(4, cores // factor))
+            cap, micro_batch = 16, max(1, cores // workers)
+        per_device = min(cap, micro_batch)
+        accumulation_steps = max(1, math.ceil(16 / per_device))  # 16 is the effective train/eval batch size
+        return {
+            "workers": workers,
+            "per_device_train_batch_size": per_device,
+            "per_device_eval_batch_size": per_device,
+            "eval_accumulation_steps": accumulation_steps,
+            "gradient_accumulation_steps": accumulation_steps,
+        }
+
 
 @final
 class HuggingFaceNerUnsupervisedTrainer(UnsupervisedTrainer, _HuggingFaceNerTrainerCommon):
@@ -92,7 +156,7 @@ def __init__(self, model_service: "HuggingFaceNerModel") -> None:
             self._model_name.replace(" ", "_"),
         )
         self._model_manager = ModelManager(type(model_service), model_service._config)
-        self._max_length = model_service.model.config.max_position_embeddings
+        self._max_length = self._resolve_safe_max_length(model_service.model, model_service.tokenizer)
         os.makedirs(self._retrained_models_dir, exist_ok=True)
 
     def run(
@@ -125,7 +189,7 @@ def run(
 
         eval_mode = training_params["nepochs"] == 0
         window_size = max(self._max_length - 2, 1)
-        stride = max(window_size // 2, 1)
+        stride = min(max(int(window_size * 0.75), 1), window_size - 1)
         self._tracker_client.log_trainer_mode(not eval_mode)
         if not eval_mode:
             try:
@@ -194,22 +258,33 @@ def run(
                 train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
                 eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
 
+                training_token_count = sum(len(example["input_ids"]) for example in train_dataset)
+                logger.debug(f"Total training tokens: {training_token_count}")
+                self._tracker_client.log_training_token_count(training_token_count)
+
                 data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
 
-                training_args = TrainingArguments(
+                batch_sizes = self._calculate_batch_sizes(training_params, self._config.DEVICE)
+                per_device_train_batch_size = batch_sizes["per_device_train_batch_size"]
+                per_device_eval_batch_size = batch_sizes["per_device_eval_batch_size"]
+                gradient_accumulation_steps = batch_sizes["gradient_accumulation_steps"]
+                torch.set_num_threads(batch_sizes["workers"])
+
+                training_args = self._create_training_arguments(
                     output_dir=results_path,
                     logging_dir=logs_path,
                     eval_strategy="epoch",
                     save_strategy="epoch",
                     overwrite_output_dir=True,
                     num_train_epochs=training_params["nepochs"],
-                    per_device_train_batch_size=8,
-                    per_device_eval_batch_size=8,
-                    gradient_accumulation_steps=2,
+                    per_device_train_batch_size=per_device_train_batch_size,
+                    per_device_eval_batch_size=per_device_eval_batch_size,
+                    gradient_accumulation_steps=gradient_accumulation_steps,
                     logging_steps=log_frequency,
                     save_steps=1000,
                     load_best_model_at_end=True,
                     save_total_limit=3,
+                    report_to="none",
                     use_cpu=self._config.DEVICE.lower() == Device.CPU.value if non_default_device_is_available(self._config.DEVICE) else False,
                 )
 
@@ -245,7 +320,9 @@ def run(
                     model_pack_file_ext = get_model_data_package_extension(self._config.BASE_MODEL_FILE)
                     model_pack_file_name = f"{ModelType.HUGGINGFACE_NER.value}_{run_id}{model_pack_file_ext}"
                     retrained_model_pack_path = os.path.join(self._retrained_models_dir, model_pack_file_name)
-                    model.save_pretrained(
+                    save_model_to_clean_directory(
+                        model,
+                        tokenizer,
                         copied_model_directory,
                         safe_serialization=(self._config.TRAINING_SAFE_MODEL_SERIALISATION == "true"),
                     )
@@ -541,7 +618,7 @@ def __init__(self, model_service: "HuggingFaceNerModel") -> None:
         self._retrained_models_dir = os.path.join(model_service._model_parent_dir, "retrained",
                                                   self._model_name.replace(" ", "_"))
         self._model_manager = ModelManager(type(model_service), model_service._config)
-        self._max_length = model_service.model.config.max_position_embeddings
+        self._max_length = self._resolve_safe_max_length(model_service.model, model_service.tokenizer)
         os.makedirs(self._retrained_models_dir, exist_ok=True)
 
     class _LocalDataCollator:
@@ -590,9 +667,9 @@ def run(
         reset_random_seed()
         eval_mode = training_params["nepochs"] == 0
         window_size = max(self._max_length - 2, 1)
-        stride = max(window_size // 2, 1)
+        stride = min(max(int(window_size * 0.75), 1), window_size - 1)
         self._tracker_client.log_trainer_mode(not eval_mode)
-        tagging_scheme = TaggingScheme(self._model_service._config.TRAINING_HF_TAGGING_SCHEME.lower())
+        tagging_scheme = TaggingScheme(self._model_service._config.TRAINING_HF_NER_TAGGING_SCHEME.lower())
         if not eval_mode:
             try:
                 logger.info("Loading a new model copy for training...")
@@ -609,6 +686,9 @@ def run(
                 filtered_training_data, filtered_concepts = self._filter_training_data_and_concepts(data_file)
                 logger.debug(f"Filtered concepts: {filtered_concepts}")
                 model = self._update_model_with_concepts(model, filtered_concepts, tagging_scheme)
+                model = self._apply_lora_adapter_if_enabled(model)
+
+                self._freeze_params_or_classifier(model, self._config.TRAINING_HF_NER_FROZEN_PARAM_NAMES)
 
                 test_size = 0.2 if training_params.get("test_size") is None else training_params["test_size"]
                 if test_size < 0:
@@ -621,7 +701,6 @@ def run(
                 else:
                     documents = [document for project in filtered_training_data["projects"] for document in project["documents"]]
                     random.shuffle(documents)
-                    test_size = 0.2 if training_params.get("test_size") is None else training_params["test_size"]
                     train_documents = [document for document in documents[:int(len(documents) * (1 - test_size))]]
                     eval_documents = [document for document in documents[int(len(documents) * (1 - test_size)):]]
 
@@ -662,6 +741,10 @@ def run(
                 train_dataset.set_format(type=None, columns=["input_ids", "labels", "attention_mask"])
                 eval_dataset.set_format(type=None, columns=["input_ids", "labels", "attention_mask"])
 
+                training_token_count = sum(len(example["input_ids"]) for example in train_dataset)
+                logger.debug(f"Total training tokens: {training_token_count}")
+                self._tracker_client.log_training_token_count(training_token_count)
+
                 data_collator = self._LocalDataCollator(max_length=self._max_length, pad_token_id=tokenizer.pad_token_id)
                 training_args = self._get_training_args(results_path, logs_path, training_params, log_frequency)
                 if training_params.get("lr_override") is not None:
@@ -674,14 +757,20 @@ def run(
                 if early_stopping_patience > 0:
                     trainer_callbacks.append(EarlyStoppingCallback(early_stopping_patience=early_stopping_patience))
 
-                train_labels = []
+                train_labels: List[str] = []
                 weights = torch.ones(model.num_labels, dtype=torch.float)
                 for example in train_dataset:
-                    train_labels.extend([label for label in example["labels"] if label != HuggingFaceNerSupervisedTrainer.PAD_LABEL_ID])
+                    train_labels.extend(label for label in example["labels"] if label != HuggingFaceNerSupervisedTrainer.PAD_LABEL_ID)
                 unique_labels = np.unique(train_labels)
-                class_weight_vect = compute_class_weight("balanced", classes=unique_labels, y=train_labels)
+                class_weight_vect = compute_class_weight(
+                    class_weight="balanced",
+                    classes=unique_labels,
+                    y=train_labels,
+                )
                 for label_id, weight in zip(unique_labels, class_weight_vect):
                     weights[label_id] = weight
+                weights = torch.sqrt(weights)
+                weights = torch.clamp(weights, max=10.0)
 
                 if non_default_device_is_available(self._config.DEVICE):
                     weights = weights.to(self._config.DEVICE)
@@ -694,7 +783,11 @@ def _compute_loss(
                     num_items_in_batch: Optional[torch.Tensor] = None,
                 ) -> torch.Tensor:
                     logits = outputs.get("logits")
-                    loss_func = nn.CrossEntropyLoss(weight=weights, ignore_index=HuggingFaceNerSupervisedTrainer.PAD_LABEL_ID)
+                    loss_weights = weights.to(device=logits.device, dtype=logits.dtype) # type: ignore
+                    loss_func = nn.CrossEntropyLoss(
+                        weight=loss_weights,
+                        ignore_index=HuggingFaceNerSupervisedTrainer.PAD_LABEL_ID,
+                    )
                     loss = loss_func(logits.view(-1, model.num_labels), labels.view(-1))    # type: ignore
                     return loss
 
@@ -729,12 +822,15 @@ def _compute_loss(
                 self._save_trained_concepts(cui_counts, cui_unique_counts, cui_ignorance_counts, model)
                 self._tracker_client.log_classes_and_names(model.config.id2label)
                 self._sanity_check_model_and_save_results(data_file.name, self._model_service.from_model(model, tokenizer))
+                model = self._merge_lora_if_enabled(model)
 
                 if not skip_save_model:
                     model_pack_file_ext = get_model_data_package_extension(self._config.BASE_MODEL_FILE)
                     model_pack_file_name = f"{ModelType.HUGGINGFACE_NER.value}_{run_id}{model_pack_file_ext}"
                     retrained_model_pack_path = os.path.join(self._retrained_models_dir, model_pack_file_name)
-                    model.save_pretrained(
+                    save_model_to_clean_directory(
+                        model,
+                        tokenizer,
                         copied_model_directory,
                         safe_serialization=(self._config.TRAINING_SAFE_MODEL_SERIALISATION == "true"),
                     )
@@ -803,8 +899,7 @@ def _compute_loss(
                 )
                 eval_dataset.set_format(type=None, columns=["input_ids", "labels", "attention_mask"])
                 data_collator = self._LocalDataCollator(max_length=self._max_length, pad_token_id=self._model_service.tokenizer.pad_token_id)
-                training_args = self._get_training_args(results_path, logs_path, training_params, log_frequency)
-                training_args.eval_strategy = "no"
+                training_args = self._get_training_args(results_path, logs_path, training_params, log_frequency, "no")
                 hf_trainer = Trainer(
                     model=self._model_service.model,
                     args=training_args,
@@ -837,6 +932,64 @@ def _compute_loss(
                 with self._training_lock:
                     self._training_in_progress = False
 
+    def _apply_lora_adapter_if_enabled(self, model: PreTrainedModel) -> PreTrainedModel:
+        """Wrap the model with LoRA adapters when TRAINING_HF_NER_ENABLE_LORA is "true"; otherwise return it as is."""
+        lora_enabled = self._config.TRAINING_HF_NER_ENABLE_LORA.lower() == "true"
+        if not lora_enabled:
+            return model
+
+        logger.info("Applying LoRA adapters for supervised training...")
+        lora_settings = {"task_type": "TOKEN_CLS", "r": 8, "lora_alpha": 32, "lora_dropout": 0.1}
+        # Only the first element of the pair returned by LoraAdaptor.apply is needed here.
+        adapted_model, _ = LoraAdaptor.apply(model=model, **lora_settings)
+
+        # Report the trainable parameters only if the returned object offers that helper.
+        if hasattr(adapted_model, "print_trainable_parameters"):
+            adapted_model.print_trainable_parameters()
+        return cast(PreTrainedModel, adapted_model)
+
+    def _merge_lora_if_enabled(self, model: PreTrainedModel) -> PreTrainedModel:
+        """Return the model with LoRA adapters merged in when LoRA is enabled and merging is supported."""
+        lora_enabled = self._config.TRAINING_HF_NER_ENABLE_LORA.lower() == "true"
+        # Anything without ``merge_and_unload`` is handed back untouched.
+        if not (lora_enabled and hasattr(model, "merge_and_unload")):
+            return model
+
+        logger.info("Merging LoRA adapters into the base model...")
+        return cast(PreTrainedModel, model.merge_and_unload())
+
+    @staticmethod
+    def _freeze_params_or_classifier(model: PreTrainedModel, params_names_csv: str) -> None:
+        """
+        Freeze model parameters via freeze_hf_model_params_by_names from a comma-separated list of names.
+
+        Args:
+            model (PreTrainedModel): The model passed on to freeze_hf_model_params_by_names.
+            params_names_csv (str): Comma-separated names; blank entries are ignored and the special
+                entry "except_classifier" selects "classifier,score" with include=False instead.
+        """
+        requested = [part.strip() for part in params_names_csv.split(",") if part.strip()]
+        if not requested:
+            return
+
+        # "except_classifier" takes precedence over any other listed names.
+        head_only = "except_classifier" in requested
+        frozen_count, total_count = freeze_hf_model_params_by_names(
+            model=model,
+            params_names_csv="classifier,score" if head_only else params_names_csv,
+            include=not head_only,
+        )
+
+        if head_only:
+            message = "Configured training on classification head only: %s; %s/%s parameter tensors remain trainable"
+            reported_names = ["classifier", "score"]
+        else:
+            message = "Configured frozen parameters by names: %s; %s/%s parameter tensors remain trainable"
+            reported_names = requested
+        logger.info(
+            message, reported_names, total_count - frozen_count, total_count,
+        )
+
     @staticmethod
     def _filter_training_data_and_concepts(data_file: TextIO) -> Tuple[Dict, List]:
         with open(data_file.name, "r") as f:
@@ -903,79 +1056,121 @@ def _compute_metrics(
         model_name: str,
         token_level: bool,
     ) -> Dict[str, Any]:
-        predictions = np.argmax(softmax(eval_pred.predictions, axis=2), axis=2)
+
+        predictions = np.argmax(eval_pred.predictions, axis=2)
         label_ids = eval_pred.label_ids
-        non_padding_indices = np.where(label_ids != HuggingFaceNerSupervisedTrainer.PAD_LABEL_ID)
-        non_padding_predictions = predictions[non_padding_indices].flatten()
-        non_padding_label_ids = label_ids[non_padding_indices].flatten()
+
         labels = list(id2label.values())
+        ignored_labels = {"O", "X"}
+
+        metric_indices = [
+            idx for idx, label_id in enumerate(list(id2label.keys()))
+            if id2label[label_id] not in ignored_labels
+        ]
 
         if token_level:
             # Get token level metrics
-            precision, recall, f1, support = precision_recall_fscore_support(non_padding_label_ids, non_padding_predictions, labels=list(id2label.keys()), average=None)
-            filtered_predictions, filtered_label_ids = zip(*[(a, b) for a, b in zip(non_padding_predictions, non_padding_label_ids) if not (a == b == HuggingFaceNerSupervisedTrainer.DEFAULT_LABEL_ID)])
-            accuracy = sklearn_accuracy_score(filtered_label_ids, filtered_predictions)
+            pred_labels = []
+            true_labels = []
+
+            for i in range(label_ids.shape[0]):
+                for j in range(label_ids.shape[1]):
+                    if label_ids[i, j] == HuggingFaceNerSupervisedTrainer.PAD_LABEL_ID:
+                        continue
+
+                    pred_labels.append(predictions[i, j])
+                    true_labels.append(label_ids[i, j])
+
+            precision, recall, f1, support = precision_recall_fscore_support(
+                np.array(true_labels),
+                np.array(pred_labels),
+                labels=list(id2label.keys()),
+                average=None
+            )
+
+            accuracy = sklearn_accuracy_score(np.array(true_labels), np.array(pred_labels))
+
             metrics = {
                 "accuracy": accuracy,
-                "f1_avg": np.average(f1[2:]),
-                "precision_avg": np.average(precision[2:]),
-                "recall_avg": np.average(recall[2:]),
-                "support_avg": np.average(support[2:]),
+                "f1_avg": np.average([f1[idx] for idx in metric_indices]) if metric_indices else 0.0,
+                "precision_avg": np.average([precision[idx] for idx in metric_indices]) if metric_indices else 0.0,
+                "recall_avg": np.average([recall[idx] for idx in metric_indices]) if metric_indices else 0.0,
+                "support_avg": np.average([support[idx] for idx in metric_indices]) if metric_indices else 0.0,
             }
+
             aggregated_labels = []
             aggregated_metrics = []
 
+            metric_rows = [
+                (labels[idx], precision[idx], recall[idx], f1[idx], support[idx])
+                for idx in metric_indices
+            ]
+
             # Limit the number of labels to avoid excessive metrics logging
-            for idx, (label, precision, recall, f1, support) in enumerate(zip(labels[2:HuggingFaceNerSupervisedTrainer.MAX_CONCEPTS_TO_TRACK+2],
-                                                                            precision[2:HuggingFaceNerSupervisedTrainer.MAX_CONCEPTS_TO_TRACK+2],
-                                                                            recall[2:HuggingFaceNerSupervisedTrainer.MAX_CONCEPTS_TO_TRACK+2],
-                                                                            f1[2:HuggingFaceNerSupervisedTrainer.MAX_CONCEPTS_TO_TRACK+2],
-                                                                            support[2:HuggingFaceNerSupervisedTrainer.MAX_CONCEPTS_TO_TRACK+2])):
-                if support == 0:  # The concept has no true labels
+            for _, (label, p, r, f1_, support_) in enumerate(
+                metric_rows[:HuggingFaceNerSupervisedTrainer.MAX_CONCEPTS_TO_TRACK]
+            ):
+                if support_ == 0:
                     continue
-                metrics[f"{label}/precision"] = precision if precision is not None else 0.0
-                metrics[f"{label}/recall"] = recall if recall is not None else 0.0
-                metrics[f"{label}/f1"] = f1 if f1 is not None else 0.0
-                metrics[f"{label}/support"] = support if support is not None else 0.0
+
+                metrics[f"{label}/precision"] = p
+                metrics[f"{label}/recall"] = r
+                metrics[f"{label}/f1"] = f1_
+                metrics[f"{label}/support"] = support_
 
                 aggregated_labels.append(label)
                 aggregated_metrics.append({
-                    "per_concept_p": metrics[f"{label}/precision"],
-                    "per_concept_r": metrics[f"{label}/recall"],
-                    "per_concept_f1": metrics[f"{label}/f1"],
+                    "per_concept_p": p,
+                    "per_concept_r": r,
+                    "per_concept_f1": f1_,
                 })
         else:
             # Get entity level metrics
-            y_true = []
-            y_pred = []
+            true_label_sequences = []
+            pred_label_sequences = []
+
             for i in range(label_ids.shape[0]):
-                true_labels = []
-                pred_labels = []
+
+                entity_true_labels: List = []
+                entity_pred_labels: List = []
+
                 for j in range(label_ids.shape[1]):
-                    if label_ids[i, j] != HuggingFaceNerSupervisedTrainer.PAD_LABEL_ID:
-                        true_labels.append(id2label[label_ids[i, j]])
-                        pred_labels.append(id2label[predictions[i, j]])
-                    else:
+
+                    if label_ids[i, j] == HuggingFaceNerSupervisedTrainer.PAD_LABEL_ID:
                         break
-                y_true.append(true_labels)
-                y_pred.append(pred_labels)
-            report = classification_report(y_true, y_pred, output_dict=True)
-            accuracy = seqeval_accuracy_score(y_true, y_pred)
+
+                    entity_true_labels.append(id2label[label_ids[i, j]])
+                    entity_pred_labels.append(id2label[predictions[i, j]])
+
+                true_label_sequences.append(entity_true_labels)
+                pred_label_sequences.append(entity_pred_labels)
+
+            report = classification_report(true_label_sequences, pred_label_sequences, output_dict=True)
+            accuracy = seqeval_accuracy_score(true_label_sequences, pred_label_sequences)
+
+            target_labels = [
+                key for key in report.keys()
+                if key not in {"weighted avg", "macro avg", "micro avg", "accuracy"}
+                and key not in ignored_labels
+            ]
+
             metrics = {
                 "accuracy": accuracy,
-                "f1_avg": np.mean([report[label]["f1-score"] for label in report]),
-                "precision_avg": np.mean([report[label]["precision"] for label in report]),
-                "recall_avg": np.mean([report[label]["recall"] for label in report]),
-                "support_avg": np.mean([report[label]["support"] for label in report]),
+                "f1_avg": np.mean([report[label]["f1-score"] for label in target_labels]) if target_labels else 0.0,
+                "precision_avg": np.mean([report[label]["precision"] for label in target_labels]) if target_labels else 0.0,
+                "recall_avg": np.mean([report[label]["recall"] for label in target_labels]) if target_labels else 0.0,
+                "support_avg": np.mean([report[label]["support"] for label in target_labels]) if target_labels else 0.0,
             }
+
             aggregated_labels = []
             aggregated_metrics = []
 
             # Limit the number of labels to avoid excessive metrics logging
-            label_keys = [k for k in report.keys() if k not in ['weighted avg', 'macro avg', 'micro avg']]
-            for _, label in enumerate(label_keys[:HuggingFaceNerSupervisedTrainer.MAX_CONCEPTS_TO_TRACK]):
-                if label not in report or report[label]['support'] == 0:  # The label has no true labels
+            for label in target_labels[:HuggingFaceNerSupervisedTrainer.MAX_CONCEPTS_TO_TRACK]:
+
+                if label not in report or report[label]["support"] == 0:
                     continue
+
                 metrics[f"{label}/precision"] = report[label]["precision"]
                 metrics[f"{label}/recall"] = report[label]["recall"]
                 metrics[f"{label}/f1"] = report[label]["f1-score"]
@@ -983,9 +1178,9 @@ def _compute_metrics(
 
                 aggregated_labels.append(label)
                 aggregated_metrics.append({
-                    "per_concept_p": metrics[f"{label}/precision"],
-                    "per_concept_r": metrics[f"{label}/recall"],
-                    "per_concept_f1": metrics[f"{label}/f1"],
+                    "per_concept_p": report[label]["precision"],
+                    "per_concept_r": report[label]["recall"],
+                    "per_concept_f1": report[label]["f1-score"],
                 })
 
         HuggingFaceNerSupervisedTrainer._save_metrics_plot(
@@ -994,6 +1189,7 @@ def _compute_metrics(
             tracker_client,
             model_name,
         )
+
         logger.debug("Evaluation metrics: %s", metrics)
         return metrics
 
@@ -1021,19 +1217,26 @@ def _get_training_args(
         logs_path: str,
         training_params: Dict,
         log_frequency: int,
+        eval_strategy: str = "epoch",
     ) -> TrainingArguments:
-        scaling_factor = 2
-        cpu_count = os.cpu_count() or 1
-        effective_batch_size = cpu_count * scaling_factor
-        workers = max(1, cpu_count // scaling_factor)
-        per_device_train_batch_size = max(1, effective_batch_size // workers)
-        per_device_eval_batch_size = max(1, effective_batch_size // workers)
-        eval_accumulation_steps = max(1, per_device_eval_batch_size // scaling_factor)
+        batch_sizes = self._calculate_batch_sizes(training_params, self._config.DEVICE)
+        workers = batch_sizes["workers"]
+        per_device_train_batch_size = batch_sizes["per_device_train_batch_size"]
+        per_device_eval_batch_size = batch_sizes["per_device_eval_batch_size"]
+        eval_accumulation_steps = batch_sizes["eval_accumulation_steps"]
+        gradient_accumulation_steps = batch_sizes["gradient_accumulation_steps"]
         torch.set_num_threads(workers)
-        return TrainingArguments(
+        logger.debug("Training scaling arguments:")
+        logger.debug("  - CPU workers: %d", workers)
+        logger.debug("  - Per device train batch size: %d", per_device_train_batch_size)
+        logger.debug("  - Per device eval batch size: %d", per_device_eval_batch_size)
+        logger.debug("  - Eval accumulation steps: %d", eval_accumulation_steps)
+        logger.debug("  - Gradient accumulation steps: %d", gradient_accumulation_steps)
+
+        return self._create_training_arguments(
             output_dir=results_path,
             logging_dir=logs_path,
-            eval_strategy="epoch",
+            eval_strategy=eval_strategy,
             do_eval=True,
             save_strategy="epoch",
             logging_strategy="epoch",
@@ -1042,15 +1245,17 @@ def _get_training_args(
             per_device_train_batch_size=per_device_train_batch_size,
             per_device_eval_batch_size=per_device_eval_batch_size,
             eval_accumulation_steps=eval_accumulation_steps,
-            gradient_accumulation_steps=1,
+            gradient_accumulation_steps=gradient_accumulation_steps,
             weight_decay=0.01,
             warmup_ratio=0.08,
+            max_grad_norm=1.0,
             logging_steps=log_frequency,
             save_steps=1000,
             metric_for_best_model="eval_f1_avg",
             greater_is_better=True,
             load_best_model_at_end=True,
             save_total_limit=3,
+            report_to="none",
             use_cpu=self._config.DEVICE.lower() == Device.CPU.value if non_default_device_is_available(self._config.DEVICE) else False,
         )
 
@@ -1062,7 +1267,17 @@ def _save_trained_concepts(
         model: PreTrainedModel,
     ) -> None:
         if len(training_concepts.keys()) != 0:
-            unknown_concepts = set(training_concepts.keys()) - set(model.config.label2id.keys())
+            labels = set(model.config.label2id.keys())
+            model_concepts = set()
+            for label in labels:
+                if label in {"O", "X"}:
+                    continue
+                if len(label) > 2 and label[1] == "-" and label[0] in {"B", "I", "E", "S"}:
+                    model_concepts.add(label[2:])
+                else:
+                    model_concepts.add(label)
+
+            unknown_concepts = set(training_concepts.keys()) - model_concepts
             unknown_concept_pct = round(len(unknown_concepts) / len(training_concepts.keys()) * 100, 2)
             self._tracker_client.send_model_stats({
                 "unknown_concept_count": len(unknown_concepts),
diff --git a/app/trainers/medcat_deid_trainer.py b/app/trainers/medcat_deid_trainer.py
index 34d27490..577b9d6a 100644
--- a/app/trainers/medcat_deid_trainer.py
+++ b/app/trainers/medcat_deid_trainer.py
@@ -95,7 +95,7 @@ def run(
                 if training_params.get("test_size") is not None:
                     ner.config.general.test_size = training_params["test_size"]
                 # This default evaluation strategy is "epoch"
-                # ner.training_arguments.evaluation_strategy = "steps"
+                # ner.training_arguments.eval_strategy = "steps"
                 # ner.training_arguments.eval_steps = 1
                 logger.info("Performing supervised training...")
                 model.config.meta.description = description or model.config.meta.description
diff --git a/app/utils.py b/app/utils.py
index 4768248a..f5a57bf8 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -1,3 +1,5 @@
+import shutil
+
 import json
 import socket
 import random
@@ -13,6 +15,8 @@
 import torch
 import tarfile
 import zipfile
+import re
+import uuid
 import numpy as np
 import pandas as pd
 from packaging.markers import Marker
@@ -20,12 +24,20 @@
 from spacy.lang.en import English
 from spacy.util import filter_spans
 from safetensors.torch import load_file
-from transformers import PreTrainedModel, PreTrainedTokenizer
+from transformers import (
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    PretrainedConfig,
+    BitsAndBytesConfig,
+    AutoModel,
+    AutoTokenizer,
+)
 from urllib.parse import ParseResult
 from functools import lru_cache
-from typing import List, Optional, Dict, Callable, Any, Union, Type, TypeVar
+from typing import List, Optional, Dict, Callable, Any, Union, Type, TypeVar, Tuple
 from app.config import Settings
-from app.domain import Annotation, Entity, CodeType, ModelType, Device, PromptMessage, PromptRole
+from app.domain import Annotation, Entity, CodeType, ModelType, Device, PromptMessage, PromptRole, OpenAIFunctionTool
 from app.exception import ManagedModelException
 from app.processors.prompt_factory import PromptFactory
 
@@ -38,10 +50,16 @@ def get_settings() -> Settings:
     Returns:
         Settings: An instance of the configuration settings.
     """
+    if torch.cuda.is_available():
+        torch.cuda.set_per_process_memory_fraction(0.9)
 
     settings = Settings()
     os.environ["DISABLE_MLFLOW_INTEGRATION"] = "TRUE"
     os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    os.environ["OMP_NUM_THREADS"] = "1"
+    os.environ["MKL_NUM_THREADS"] = "1"
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
     if settings.SYSTEM_METRICS_LOGGING_INTERVAL_SECONDS > 0:
         os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
         os.environ["MLFLOW_SYSTEM_METRICS_SAMPLING_INTERVAL"] = str(settings.SYSTEM_METRICS_LOGGING_INTERVAL_SECONDS)
@@ -239,7 +257,7 @@ def filter_by_concept_ids(
             if extra_excluded is not None and len(extra_excluded) > 0:
                 document["annotations"] = [anno for anno in document.get("annotations", []) if anno.get("cui") not in extra_excluded]
 
-    if model_type in [ModelType.TRANSFORMERS_DEID, ModelType.MEDCAT_DEID, ModelType.ANONCAT]:
+    if model_type in [ModelType.TRANSFORMERS_DEID, ModelType.MEDCAT_DEID, ModelType.ANONCAT, ModelType.HUGGINGFACE_NER]:
         # special preprocessing for the DeID annotations and consider removing this.
         for project in filtered["projects"]:
             for document in project["documents"]:
@@ -792,6 +810,59 @@ def download_model_package(
             retry_delay *= 2
 
 
+def quantize_and_save_model(
+    hf_model_path: str,
+    output_model_path: Optional[str] = None,
+    load_in_4bit: bool = False,
+    load_in_8bit: bool = True,
+) -> str:
+    """
+    Quantises and saves a Hugging Face model using the specified precision.
+
+    Args:
+        hf_model_path (str): The path to the Hugging Face model to be quantised.
+        output_model_path (Optional[str]): Where to save the quantised model. Defaults to None, which overwrites `hf_model_path`.
+        load_in_4bit (bool): Whether to quantise the model in 4-bit precision; takes precedence over 8-bit. Defaults to False.
+        load_in_8bit (bool): Whether to quantise the model in 8-bit precision. Defaults to True.
+
+    Returns:
+        str: The path to the quantised model (nothing is loaded or saved when both flags are False).
+
+    Raises:
+        ManagedModelException: If there is an error during quantisation or saving of the model.
+    """
+    target_path = hf_model_path if output_model_path is None else output_model_path
+    try:
+        if load_in_4bit:
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16 if has_turing_generation_gpu() else torch.bfloat16,
+                bnb_4bit_use_double_quant=True,
+            )
+        elif load_in_8bit:
+            bnb_config = BitsAndBytesConfig(
+                load_in_8bit=True,
+                bnb_4bit_compute_dtype=torch.float16 if has_turing_generation_gpu() else torch.bfloat16,
+                llm_int8_threshold=6.0,
+                llm_int8_enable_fp32_cpu_offload=False
+            )
+        else:
+            bnb_config = None
+        if bnb_config is not None:
+            model = AutoModel.from_pretrained(
+                hf_model_path,
+                quantization_config=bnb_config,
+                device_map="auto"
+            )
+            tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
+            model.save_pretrained(target_path)
+            tokenizer.save_pretrained(target_path)
+        return target_path
+    except Exception as e:
+        raise ManagedModelException(f"Error during quantisation and saving of the model: {e}") from e
+
+
 def get_default_chat_template() -> str:
     """
     Gets the default chat template.
@@ -860,9 +931,12 @@ def get_default_system_prompt() -> str:
 
 
 def get_prompt_from_messages(
-        tokenizer: PreTrainedTokenizer,
-        messages: List[PromptMessage],
-        override_template: Optional[str] = None,
+    tokenizer: PreTrainedTokenizer,
+    messages: List[PromptMessage],
+    tools: Optional[List[Union[OpenAIFunctionTool, Dict[Any, Any]]]] = None,
+    override_template: Optional[str] = None,
+    max_input_tokens: Optional[int] = None,
+    add_generation_prompt: bool = True,
 ) -> str:
     """
     Generates a prompt from a list of prompt messages.
@@ -870,30 +944,60 @@ def get_prompt_from_messages(
     Args:
         tokenizer (PreTrainedTokenizer): The tokenizer to use for applying the chat template.
         messages (List[PromptMessage]): The list of prompt messages to use for generating the prompt.
+        tools (Optional[List[OpenAIFunctionTool]]): An optional list of tools to include in the prompt.
         override_template (str): The name of the chat template to use for generating the prompt.
+        max_input_tokens (Optional[int]): The maximum number of input tokens to include in the prompt.
+        add_generation_prompt (bool): Whether or not to include the generation prompt.
 
     Returns:
         str: The generated prompt.
     """
-    if override_template is None:
-        if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
-            prompt = tokenizer.apply_chat_template(
-                [dump_pydantic_object_to_dict(message) for message in messages],
-                tokenize=False,
-                add_generation_prompt=True,
-            )
-        elif hasattr(tokenizer, "default_chat_template") and tokenizer.default_chat_template:
-            # This largely depends on how older versions of HF tokenizers behave and may not work universally
-            tokenizer.chat_template = tokenizer.default_chat_template
-            prompt = tokenizer.apply_chat_template(
-                [dump_pydantic_object_to_dict(message) for message in messages],
-                tokenize=False,
-                add_generation_prompt=True,
-            )
-        else:
+    def _build_prompt(
+        prompt_messages: List[PromptMessage],
+        tools: Optional[List[Union[OpenAIFunctionTool, Dict[Any, Any]]]],
+    ) -> str:
+        if override_template is None:
+            if all([
+                hasattr(tokenizer, "apply_chat_template"),
+                hasattr(tokenizer, "chat_template"),
+                tokenizer.chat_template,
+            ]):
+                tool_payloads = None
+                if tools is not None:
+                    tool_payloads = [
+                        dump_pydantic_object_to_dict(tool) if not isinstance(tool, dict) else tool
+                        for tool in tools
+                    ]
+                return tokenizer.apply_chat_template(
+                    [dump_pydantic_object_to_dict(message) for message in prompt_messages],
+                    tools=tool_payloads,
+                    tokenize=False,
+                    add_generation_prompt=add_generation_prompt,
+                    enable_thinking=False,
+                )
+            if all([
+                hasattr(tokenizer, "apply_chat_template"),
+                hasattr(tokenizer, "default_chat_template"),
+                tokenizer.default_chat_template,
+            ]):
+                # This largely depends on how older versions of HF tokenizers behave and may not work universally
+                tokenizer.chat_template = tokenizer.default_chat_template
+                tool_payloads = None
+                if tools is not None:
+                    tool_payloads = [
+                        dump_pydantic_object_to_dict(tool) if not isinstance(tool, dict) else tool
+                        for tool in tools
+                    ]
+                return tokenizer.apply_chat_template(
+                    [dump_pydantic_object_to_dict(message) for message in prompt_messages],
+                    tools=tool_payloads,
+                    tokenize=False,
+                    add_generation_prompt=add_generation_prompt,
+                    enable_thinking=False,
+                )
             system_content = ""
             prompt_parts: List[str] = []
-            for message in messages:
+            for message in prompt_messages:
                 content = message.content.strip()
                 if message.role == PromptRole.SYSTEM:
                     system_content = content
@@ -905,16 +1009,310 @@ def get_prompt_from_messages(
                 prompt = f"<|system|>\n{system_content}</s>\n" + "\n".join(prompt_parts)
             else:
                 prompt = "\n".join(prompt_parts)
-            prompt += "\n<|assistant|>\n"
-    else:
+            if add_generation_prompt:
+                return prompt + "\n<|assistant|>\n"
+            return prompt
+
         tokenizer.chat_template = PromptFactory.create_chat_template(tmpl_name=override_template)
-        prompt = tokenizer.apply_chat_template(
-            [dump_pydantic_object_to_dict(message) for message in messages],
+        return tokenizer.apply_chat_template(
+            [dump_pydantic_object_to_dict(message) for message in prompt_messages],
             tokenize=False,
-            add_generation_prompt=True,
+            add_generation_prompt=add_generation_prompt,
+            enable_thinking=False,
+        )
+
+    prompt = _build_prompt(messages, tools)
+    if max_input_tokens is None:
+        return prompt
+
+    truncated_messages = list(messages)
+    system_msg_detected = bool(truncated_messages and truncated_messages[0].role == PromptRole.SYSTEM)
+
+    while len(tokenizer.encode(prompt, add_special_tokens=False)) > max_input_tokens:
+        start_idx = 1 if system_msg_detected else 0
+        assistant_idx = next(
+            (
+                idx
+                for idx, message in enumerate(truncated_messages[start_idx:], start=start_idx)
+                if message.role == PromptRole.ASSISTANT
+            ),
+            None,
         )
+        if assistant_idx is None:
+            break
+        delete_end = assistant_idx + 1
+        if delete_end < len(truncated_messages) and truncated_messages[delete_end].role == PromptRole.TOOL:
+            delete_end += 1
+        del truncated_messages[start_idx:delete_end]
+        prompt = _build_prompt(truncated_messages, tools)
+
     return prompt
 
+def extract_tool_calls(text: str) -> List[Dict[str, Any]]:
+    """Extracts tool calls from the generated text.
+
+    Three output formats are recognised and tried in order; the first one that
+    parses wins: a Mistral-style "[TOOL_CALLS] [...]" JSON array, a DeepSeek-style
+    "<｜tool▁call▁begin｜>" block (only the first call is read) and GPT-OSS-style
+    "functions.<name> json {...}" segments.
+
+    Args:
+        text (str): The text to extract the tool calls from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of tool calls, each holding "id", "type" and a
+            "function" dict whose "arguments" value is a JSON-encoded string. The list
+            is empty when nothing could be extracted.
+    """
+    decoder = json.JSONDecoder()
+
+    mistral_match = re.search(r"\[TOOL_CALLS\]\s*\[", text)
+    if mistral_match:
+        # Step back onto the "[" that opens the JSON array.
+        json_start = mistral_match.end() - 1
+        try:
+            tool_calls, _ = decoder.raw_decode(text, json_start)
+            results: List[Dict[str, Any]] = []
+            for tool_call in tool_calls:
+                name = tool_call.get("name")
+                arguments = tool_call.get("arguments", {})
+                call_id = tool_call.get("id") or f"call_{uuid.uuid4().hex[:9]}"
+                results.append({
+                    "id": call_id,
+                    "type": "function",
+                    "function": {
+                        "name": name,
+                        "arguments": json.dumps(arguments),
+                    },
+                })
+            return results
+        except Exception:
+            # Best effort: a malformed array falls through to the other formats.
+            pass
+
+    deepseek_match = re.search(r"<｜tool▁call▁begin｜>", text)
+    if deepseek_match:
+        sep_idx = text.find("<｜tool▁sep｜>", deepseek_match.end())
+        if sep_idx != -1:
+            name_start = sep_idx + len("<｜tool▁sep｜>")
+            name_end = text.find("\n", name_start)
+            if name_end != -1:
+                name = text[name_start:name_end].strip()
+                json_block_start = text.find("```json", name_end)
+                if json_block_start != -1:
+                    json_start = text.find("{", json_block_start)
+                    if json_start != -1:
+                        try:
+                            arguments, _ = decoder.raw_decode(text, json_start)
+                            return [{
+                                "id": f"call_{uuid.uuid4().hex[:9]}",
+                                "type": "function",
+                                "function": {
+                                    "name": name,
+                                    "arguments": json.dumps(arguments),
+                                },
+                            }]
+                        except Exception:
+                            # Best effort: unparsable arguments fall through to the next format.
+                            pass
+
+    gpt_oss_matches = list(re.finditer(r"functions\.(?P<name>[A-Za-z0-9_]+)\s+json", text))
+    if gpt_oss_matches:
+        results = []
+        for match in gpt_oss_matches:
+            name = match.group("name")
+            json_start = text.find("{", match.end())
+            if json_start == -1:
+                continue
+            try:
+                # raw_decode is string-aware, so a "{" or "}" inside an argument
+                # value does not cut the object short (naive brace counting would
+                # stop early and the call would be dropped).
+                args, _ = decoder.raw_decode(text, json_start)
+            except Exception:
+                continue
+            results.append({
+                "id": f"call_{uuid.uuid4().hex}",
+                "type": "function",
+                "function": {
+                    "name": name,
+                    "arguments": json.dumps(args),
+                },
+            })
+        return results
+
+    return []
+
+
+def extract_json_string(text: str) -> str:
+    """Extracts the first JSON object from the generated text as a compact string.
+
+    Args:
+        text (str): The text to extract the JSON object from.
+
+    Returns:
+        str: The compact JSON string, or the stripped text from the first "{" when no object parses.
+    """
+    if not text:
+        return text
+    start = text.find("{")
+    if start == -1:
+        return text.strip()
+    # Raw newlines and tabs are removed before parsing (json.loads rejects them inside strings).
+    cleaned = text[start:].translate(str.maketrans("", "", "\n\r\t"))
+    depth, in_string, escaped = 0, False, False
+    for idx, ch in enumerate(cleaned):
+        if in_string:
+            # Braces inside a string value must not affect the nesting depth.
+            in_string = escaped or ch != '"'
+            escaped = ch == "\\" and not escaped
+        elif ch == '"':
+            in_string = True
+        elif ch in "{}":
+            depth += 1 if ch == "{" else -1
+            if depth == 0:
+                try:
+                    return json.dumps(json.loads(cleaned[:idx + 1]), separators=(",", ":"))
+                except (ValueError, RecursionError):
+                    break
+    return text[start:].strip()
+
+
+def has_turing_generation_gpu() -> bool:
+    """Checks if the CUDA GPU has a compute capability below 8.0 (the check is not limited to Turing)."""
+    if not torch.cuda.is_available():
+        return False
+    capability = torch.cuda.get_device_capability()
+    # Tuple comparison: any (major, minor) ordered before (8, 0) qualifies.
+    return tuple(capability) < (8, 0)
+
+
+def resolve_safe_max_model_length(config: PretrainedConfig) -> int:
+    """
+    Resolves a safe max model length across config variants.
+
+    The first positive integer found among ``config.max_position_embeddings``,
+    ``config.text_config.max_position_embeddings`` and ``config.seq_length`` is used.
+
+    Args:
+        config (PretrainedConfig): The Hugging Face model config object.
+
+    Returns:
+        int: The resolved max model length, or 512 when no candidate qualifies.
+    """
+    nested_config = getattr(config, "text_config", None)
+    candidates = (
+        getattr(config, "max_position_embeddings", None),
+        getattr(nested_config, "max_position_embeddings", None) if nested_config is not None else None,
+        getattr(config, "seq_length", None),
+    )
+    for candidate in candidates:
+        if isinstance(candidate, int) and candidate > 0:
+            return candidate
+
+    return 512
+
+def parse_label_into_id_and_name(label: Optional[str], delimiter: str = "|") -> Tuple[Optional[str], Optional[str]]:
+    """
+    Parses a single label into a pair of label id and label name using the given delimiter.
+
+    Args:
+        label (Optional[str]): A single label string as the input.
+        delimiter (str): The delimiter separating the label id from the label name; only the first occurrence splits.
+
+    Returns:
+         Tuple[Optional[str], Optional[str]]: The label id and label name, or (None, None) when the label is None.
+    """
+    if label is None:
+        return None, None
+    label_id, separator, remainder = label.partition(delimiter)
+    if separator:
+        return label_id, remainder
+    # No delimiter present: the label is its own id and the name is a title-cased form of it.
+    humanised = label.replace("-", " ").replace("_", " ").title()
+    return label, humanised
+
+
+def freeze_hf_model_params_by_names(
+    model: PreTrainedModel,
+    params_names_csv: str,
+    include: bool = True,
+) -> Tuple[int, int]:
+    """
+    Freezes parameters of a Hugging Face model selected by names or regex patterns given as CSV.
+
+    Every CSV entry is searched for within each parameter name as a regex. An entry that does
+    not compile as a regex is matched as a plain substring instead.
+
+    Args:
+        model (PreTrainedModel): The Hugging Face model to freeze parameters in.
+        params_names_csv (str): A CSV string of parameter names or regex patterns.
+        include (bool): Freeze the matching parameters (True) or all the other parameters (False).
+            Defaults to True.
+
+    Returns:
+        Tuple[int, int]: The number of parameters frozen by this call and the total number of
+            named parameters in the model. Nothing is frozen when the CSV holds no entries.
+    """
+
+    total_params = sum(1 for _ in model.named_parameters())
+    selectors = [entry.strip() for entry in params_names_csv.split(",") if entry.strip()]
+    if not selectors:
+        return 0, total_params
+
+    # One predicate per CSV entry, kept in the order the entries were given.
+    matchers: List[Callable[[str], Any]] = []
+    for selector in selectors:
+        try:
+            matchers.append(re.compile(selector).search)
+        except re.error:
+            # Not a valid regex: fall back to a substring test.
+            matchers.append(lambda name, needle=selector: needle in name)
+
+    # include=True freezes the matches; include=False freezes everything else.
+    freeze_matching = bool(include)
+    frozen_params = 0
+    for name, param in model.named_parameters():
+        is_match = any(matches(name) for matches in matchers)
+        if is_match != freeze_matching:
+            continue
+        # Parameters that are already frozen are left alone and not counted.
+        if param.requires_grad:
+            param.requires_grad = False
+            frozen_params += 1
+
+    return frozen_params, total_params
+
+
+def save_model_to_clean_directory(
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizerBase,
+    model_directory: str,
+    safe_serialization: bool,
+) -> None:
+    """
+    Saves the Hugging Face model and tokenizer to a directory after emptying it.
+
+    Args:
+        model (PreTrainedModel): The Hugging Face model to save.
+        tokenizer (PreTrainedTokenizerBase): The Hugging Face tokenizer to save.
+        model_directory (str): The directory to save into; existing entries are deleted and it is created if missing.
+        safe_serialization (bool): Whether to use safe serialization when saving the model.
+    """
+    if os.path.isdir(model_directory):
+        for entry in os.listdir(model_directory):
+            entry_path = os.path.join(model_directory, entry)
+            # os.path.isdir follows symbolic links but shutil.rmtree refuses them,
+            # so a link to a directory is unlinked like a file and its target is
+            # left untouched.
+            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
+                shutil.rmtree(entry_path)
+            else:
+                os.remove(entry_path)
+    os.makedirs(model_directory, exist_ok=True)
+    model.save_pretrained(model_directory, safe_serialization=safe_serialization)
+    tokenizer.save_pretrained(model_directory)
+
 
 TYPE_ID_TO_NAME_PATCH = {
     "32816260": "physical object",
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 51689618..8f768545 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -32,15 +32,15 @@ ENV TZ=Etc/UTC
 ARG IMAGE_TYPE
 
 RUN if [ "$IMAGE_TYPE" = "gpu" ]; then \
-        apt-get update && apt-get install -y --no-install-recommends \
+        rm -rf /var/lib/apt/lists/* && apt-get update && apt-get install -y --no-install-recommends \
         nano telnet software-properties-common && \
         add-apt-repository ppa:deadsnakes/ppa -y && \
         apt-get update && \
-        apt-get install -y --no-install-recommends python3.11 python3.11-dev python3.11-venv python3-pip && \
+        apt-get install -y --no-install-recommends python3.11 python3.11-dev python3.11-venv python3-pip git curl && \
         rm -rf /var/lib/apt/lists/*; \
     else \
-        apt-get update && apt-get install -y --no-install-recommends \
-        nano telnet build-essential g++ && \
+        rm -rf /var/lib/apt/lists/* && apt-get update && apt-get install -y --no-install-recommends \
+        nano telnet build-essential g++ git && \
         rm -rf /var/lib/apt/lists/*; \
     fi
 
@@ -61,7 +61,10 @@ COPY uv-requirements.txt /uv-requirements.txt
 COPY docker/entrypoint/serve.sh /app/entrypoint.sh
 RUN mkdir -p /app/model/model && \
     mkdir -p /app/model/retrained && \
-    chown -R $CMS_UID:$CMS_GID /app
+    chown -R $CMS_UID:$CMS_GID /app && \
+    chown $CMS_UID:$CMS_GID /pyproject.toml && \
+    chown $CMS_UID:$CMS_GID /uv.lock && \
+    chown $CMS_UID:$CMS_GID /uv-requirements.txt
 
 RUN python3.11 -m pip install --require-hashes -r /uv-requirements.txt --no-cache-dir
 RUN if [ -z "$PIP_EXTRAS" ]; then \
diff --git a/docker/entrypoint/serve.sh b/docker/entrypoint/serve.sh
index 34d5bfff..53b02a76 100644
--- a/docker/entrypoint/serve.sh
+++ b/docker/entrypoint/serve.sh
@@ -22,10 +22,10 @@ else
     exit 1
 fi
 
-if [ "${CMS_STREAMABLE}" = "true" ]; then
-    streamable="--streamable"
+if [ -z "${CMS_SERVE_EXTRA_OPTIONS}" ]; then
+    extra_opts=""
 else
-    streamable=""
+    extra_opts="${CMS_SERVE_EXTRA_OPTIONS}"
 fi
 
 source /.venv/bin/activate
@@ -36,4 +36,4 @@ exec /.venv/bin/python cli/cli.py serve \
   --model-path "${CMS_MODEL_FILE}" \
   --host 0.0.0.0 \
   --port 8000 \
-  $streamable
\ No newline at end of file
+  $extra_opts
\ No newline at end of file
diff --git a/docker/huggingface-llm/.env b/docker/huggingface-llm/.env
index a2813001..3e05f1fb 100644
--- a/docker/huggingface-llm/.env
+++ b/docker/huggingface-llm/.env
@@ -1,4 +1,6 @@
 ENABLE_TRAINING_APIS=true
 ENABLE_EVALUATION_APIS=true
 ENABLE_PREVIEWS_APIS=true
-LOG_PER_CONCEPT_ACCURACIES=true
\ No newline at end of file
+LOG_PER_CONCEPT_ACCURACIES=true
+# If "true", attempt to use flash attention v2 for HuggingFace LLM loading
+ENABLE_FLASH_ATTN_2=false
diff --git a/docker/mlflow/deployments/requirements.txt b/docker/mlflow/deployments/requirements.txt
index 648fe1a0..5be55f29 100644
--- a/docker/mlflow/deployments/requirements.txt
+++ b/docker/mlflow/deployments/requirements.txt
@@ -1,3 +1,3 @@
-mlflow[genai]~=2.16.2
+mlflow[genai]~=2.22.1
 setuptools
 wheel
\ No newline at end of file
diff --git a/docker/mlflow/models/requirements.txt b/docker/mlflow/models/requirements.txt
index d36d973b..0afd85ec 100644
--- a/docker/mlflow/models/requirements.txt
+++ b/docker/mlflow/models/requirements.txt
@@ -1,4 +1,4 @@
-mlflow~=2.16.2
+mlflow~=2.22.1
 psycopg2-binary~=2.9.4
 boto3~=1.28.84
 setuptools
diff --git a/docker/mlflow/server/requirements.txt b/docker/mlflow/server/requirements.txt
index d36d973b..0afd85ec 100644
--- a/docker/mlflow/server/requirements.txt
+++ b/docker/mlflow/server/requirements.txt
@@ -1,4 +1,4 @@
-mlflow~=2.16.2
+mlflow~=2.22.1
 psycopg2-binary~=2.9.4
 boto3~=1.28.84
 setuptools
diff --git a/docker/monitoring/grafana/provisioning/dashboards/cms_vllm_serving.json b/docker/monitoring/grafana/provisioning/dashboards/cms_vllm_serving.json
new file mode 100644
index 00000000..594432f0
--- /dev/null
+++ b/docker/monitoring/grafana/provisioning/dashboards/cms_vllm_serving.json
@@ -0,0 +1,1557 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "target": {
+          "limit": 100,
+          "matchAny": false,
+          "tags": [],
+          "type": "dashboard"
+        },
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "Monitoring vLLM Inference Server",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": 1,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "End to end request latency measured in seconds.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "id": 9,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P99",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P95",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P90",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P50",
+          "range": true,
+          "refId": "D",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "Average",
+          "range": true,
+          "refId": "E"
+        }
+      ],
+      "title": "E2E Request Latency",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Number of prompt and generation tokens processed per second.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "id": 8,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])",
+          "fullMetaSearch": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "Prompt Tokens/Sec",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "Generation Tokens/Sec",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        }
+      ],
+      "title": "Token Throughput",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Inter token latency in seconds.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "id": 10,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P99",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P95",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P90",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P50",
+          "range": true,
+          "refId": "D",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "Average",
+          "range": true,
+          "refId": "E"
+        }
+      ],
+      "title": "Time Per Output Token Latency",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Number of requests in the RUNNING, WAITING, and SWAPPED states.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "id": 3,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "vllm:num_requests_running{model_name=\"$model_name\"}",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Num Running",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Num Swapped",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Num Waiting",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        }
+      ],
+      "title": "Scheduler State",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "P50, P90, P95, P99, and average time-to-first-token (TTFT) latency in seconds.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 16
+      },
+      "id": 5,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P99",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P95",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P90",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": false,
+          "instant": false,
+          "legendFormat": "P50",
+          "range": true,
+          "refId": "D",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "Average",
+          "range": true,
+          "refId": "E"
+        }
+      ],
+      "title": "Time To First Token Latency",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Percentage of cache blocks used by vLLM.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "percentunit"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
+      "id": 4,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}",
+          "instant": false,
+          "legendFormat": "GPU Cache Usage",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "CPU Cache Usage",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Cache Utilization",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Heatmap of request prompt length",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 24
+      },
+      "id": 12,
+      "options": {
+        "calculate": false,
+        "cellGap": 1,
+        "cellValues": {
+          "unit": "none"
+        },
+        "color": {
+          "exponent": 0.5,
+          "fill": "dark-orange",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 64
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": true
+        },
+        "rowsFrame": {
+          "layout": "auto",
+          "value": "Request count"
+        },
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": true
+        },
+        "yAxis": {
+          "axisLabel": "Prompt Length",
+          "axisPlacement": "left",
+          "reverse": false,
+          "unit": "none"
+        }
+      },
+      "pluginVersion": "11.2.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
+          "format": "heatmap",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "{{le}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Request Prompt Length",
+      "type": "heatmap"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Heatmap of request generation length",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 24
+      },
+      "id": 13,
+      "options": {
+        "calculate": false,
+        "cellGap": 1,
+        "cellValues": {
+          "unit": "none"
+        },
+        "color": {
+          "exponent": 0.5,
+          "fill": "dark-orange",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 64
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": true
+        },
+        "rowsFrame": {
+          "layout": "auto",
+          "value": "Request count"
+        },
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": true
+        },
+        "yAxis": {
+          "axisLabel": "Generation Length",
+          "axisPlacement": "left",
+          "reverse": false,
+          "unit": "none"
+        }
+      },
+      "pluginVersion": "11.2.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
+          "format": "heatmap",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "{{le}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Request Generation Length",
+      "type": "heatmap"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 32
+      },
+      "id": 11,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "interval": "",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Finish Reason",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "default": false,
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "seconds",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 32
+      },
+      "id": 14,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "rate(vllm:request_queue_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Queue Time",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "default": false,
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 40
+      },
+      "id": 15,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "rate(vllm:request_prefill_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Prefill",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "rate(vllm:request_decode_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "Decode",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Requests Prefill and Decode Time",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "default": false,
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 40
+      },
+      "id": 16,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "rate(vllm:request_max_num_generation_tokens_sum{model_name=\"$model_name\"}[$__rate_interval])",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Tokens",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Max Generation Token in Sequence Group",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "",
+  "schemaVersion": 39,
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": false,
+          "text": "prometheus",
+          "value": "edx8memhpd9tsa"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": "datasource",
+        "multi": false,
+        "name": "DS_PROMETHEUS",
+        "options": [],
+        "query": "prometheus",
+        "queryValue": "",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      },
+      {
+        "current": {
+          "selected": false,
+          "text": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct",
+          "value": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${DS_PROMETHEUS}"
+        },
+        "definition": "label_values(model_name)",
+        "hide": 0,
+        "includeAll": false,
+        "label": "model_name",
+        "multi": false,
+        "name": "model_name",
+        "options": [],
+        "query": {
+          "query": "label_values(model_name)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-5m",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "vLLM",
+  "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b",
+  "version": 8,
+  "weekStart": ""
+}
\ No newline at end of file
diff --git a/docker/nginx/etc/nginx/nginx.conf b/docker/nginx/etc/nginx/nginx.conf
index bed0f58c..92aaf59a 100644
--- a/docker/nginx/etc/nginx/nginx.conf
+++ b/docker/nginx/etc/nginx/nginx.conf
@@ -15,10 +15,17 @@ http {
     default_type application/octet-stream;
     client_max_body_size 500M;
 
+    map $http_x_forwarded_proto $forwarded_proto {
+        default https;
+        "~*https" https;
+    }
+
     server {
         listen 443 ssl http2;
         listen [::]:443 ssl http2;
         server_name localhost;
+        absolute_redirect off;
+        port_in_redirect on;
 
         add_header Strict-Transport-Security "max-age=31536000" always;
 
@@ -44,18 +51,18 @@ http {
             return 200 "OK\n";
         }
 
+        location ~ ^/cms/(?<service>[^/]+)$ {
+            return 307 /cms/$service/docs;
+        }
+
+        location ~ ^/cms/(?<service>[^/]+)/$ {
+            return 307 /cms/$service/docs;
+        }
+
         location ~ ^/cms/(?<service>[^/]+)(?<subpath>/.*)?$ {
             include cors.conf;
             resolver 127.0.0.11 valid=30s;
             set $upstream $service:8000;
-            set $path $subpath;
-            if ($subpath = "") {
-                set $path /;
-            }
-            set $query_string_part "";
-            if ($is_args) {
-                set $query_string_part $is_args$args;
-            }
 
             # FIXME: Access web interfaces (e.g. Grafana, MLflow) through subpaths on the proxy.
             # The following services only work when accessed directly through their respective APIs.
@@ -84,17 +91,25 @@ http {
                 set $upstream $service:9090;
             }
 
-            proxy_pass http://$upstream$path$query_string_part;
+            proxy_pass http://$upstream$subpath$is_args$args;  # URI built from variables is sent as is, so re-append the query string
 
-            proxy_redirect http://$upstream$subpath $scheme://$host/cms/$service$subpath;
-            proxy_redirect http://$upstream/ $scheme://$host/cms/$service/;
-            proxy_redirect http://$upstream $scheme://$host/cms/$service;
-            proxy_redirect / $scheme://$host/cms/$service/;
+            proxy_redirect http://$upstream$subpath https://$http_host/cms/$service$subpath;
+            proxy_redirect http://$upstream/ https://$http_host/cms/$service/;
+            proxy_redirect http://$upstream https://$http_host/cms/$service;
+            proxy_redirect / https://$http_host/cms/$service/;
 
-            proxy_set_header Host $host;
+            proxy_set_header Host $http_host;
             proxy_set_header X-Real-IP $remote_addr;
             proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_set_header X-Forwarded-Proto https;
+            proxy_set_header X-Forwarded-Prefix /cms/$service;
+
+            proxy_read_timeout 300s;
+            proxy_connect_timeout 60s;
+            proxy_send_timeout 300s;
+
+            proxy_buffering off;
+            proxy_cache off;
 
             error_page 502 503 504 = @fallback;
         }
diff --git a/docker/nginx/etc/nginx/sites-enabled/huggingface-llm b/docker/nginx/etc/nginx/sites-enabled/huggingface-llm
new file mode 100644
index 00000000..671302e3
--- /dev/null
+++ b/docker/nginx/etc/nginx/sites-enabled/huggingface-llm
@@ -0,0 +1,49 @@
+server {
+    listen                    28186 ssl http2 default_server;
+    listen                    [::]:28186 ssl http2 default_server;
+    server_name               localhost;
+
+    add_header                Strict-Transport-Security "max-age=31536000" always;
+
+    ssl_session_cache         shared:SSL:20m;
+    ssl_session_timeout       10m;
+
+    ssl_protocols             TLSv1.2;
+    ssl_prefer_server_ciphers on;
+    ssl_ciphers               "ECDH+AESGCM:ECDH+AES256:ECDH+AES128:!ADH:!AECDH:!MD5;";
+
+    ssl_stapling              on;
+    ssl_stapling_verify       on;
+    resolver                  8.8.8.8 8.8.4.4;
+
+    ssl_certificate           /etc/nginx/root-ca.pem;
+    ssl_certificate_key       /etc/nginx/root-ca.key;
+
+    access_log                /var/log/nginx/access_huggingface-llm.log;
+    error_log                 /var/log/nginx/error_huggingface-llm.log;
+
+    location / {
+        include            cors.conf;
+        resolver           127.0.0.11 valid=30s;
+        set                $backend "huggingface-llm:8000";
+        proxy_pass         http://$backend;
+        proxy_set_header   Host $host;
+        error_page         502 503 504 = @fallback;
+    }
+
+    location /stream/ws {
+        include            cors.conf;
+        resolver           127.0.0.11 valid=30s;
+        set                $backend "huggingface-llm:8000";
+        proxy_pass         http://$backend;
+        # WebSocket upgrade requires HTTP/1.1 towards the upstream; nginx proxies with HTTP/1.0 by default.
+        proxy_http_version 1.1;
+        proxy_set_header   Upgrade $http_upgrade;
+        proxy_set_header   Connection "upgrade";
+        proxy_set_header   Host $host;
+    }
+
+    location @fallback {
+        return 503 "Service is temporarily unavailable. Please try again later.";
+    }
+}
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 3c86c73b..cdcc0003 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ dependencies = [
     "python-multipart~=0.0.7",
     "ijson~=3.1.4",
     "python-dotenv~=1.0.0",
-    "mlflow~=2.16.2",
+    "mlflow~=2.22.1",
     "psycopg2-binary~=2.9.4",
     "boto3~=1.41.1",
     "click<8.2.0",
@@ -74,6 +74,7 @@ dev = [
     "types-PyYAML~=6.0.0",
     "types-requests>=2.31.0.6",
     "openai>=1.84.0",
+    "ollama>=0.6.0",
 ]
 docs = [
     "sphinx~=7.1.2",
@@ -86,12 +87,19 @@ llm = [
     "bitsandbytes==0.49.0",
     "triton~=3.5.0; sys_platform == 'linux'",
     "kernels~=0.11.7",
+    "langchain-openai~=1.1.8; python_version >= '3.11'",
+    "langchain-core~=1.2.9; python_version >= '3.11'",
+    "langchain-nvidia-ai-endpoints~=1.0.4; python_version >= '3.11'",
+    "lm-format-enforcer~=0.11.3",
 ]
 mcp = [
     "mcp[cli]==1.26.0",
     "cms-client==0.0.1",
     "loguru~=0.7.3",
 ]
+vllm = [
+    "vllm<0.15.0",
+]
 
 # For pip versions not supporting PEP 735
 [project.optional-dependencies]
@@ -125,12 +133,19 @@ llm = [
     "bitsandbytes>=0.45.5",
     "triton~=3.5.0; sys_platform == 'linux'",
     "kernels~=0.11.7",
+    "langchain-openai~=1.1.8; python_version >= '3.11'",
+    "langchain-core~=1.2.9; python_version >= '3.11'",
+    "langchain-nvidia-ai-endpoints~=1.0.4; python_version >= '3.11'",
+    "lm-format-enforcer~=0.11.3",
 ]
 mcp = [
     "mcp[cli]==1.26.0",
     "cms-client==0.0.1",
     "loguru~=0.7.3",
 ]
+vllm = [
+    "vllm<0.15.0",
+]
 
 [tool.setuptools]
 packages = ["app"]
diff --git a/tests/app/api/test_api.py b/tests/app/api/test_api.py
index 8e935858..309acf5b 100644
--- a/tests/app/api/test_api.py
+++ b/tests/app/api/test_api.py
@@ -72,7 +72,9 @@ def test_get_generative_server():
     config.AUTH_USER_ENABLED = "true"
 
     model_service_dep = ModelServiceDep("huggingface_llm_model", config)
-    app = get_generative_server(config, model_service_dep)
+    app = get_generative_server(
+        config, msd_overwritten=model_service_dep
+    )
     info = app.openapi()["info"]
     paths = [route.path for route in app.routes]
 
@@ -82,5 +84,16 @@ def test_get_generative_server():
     assert "/info" in paths
     assert "/generate" in paths
     assert "/stream/generate" in paths
+    assert "/openai/v1/chat/completions" in paths
+    assert "/openai/v1/completions" in paths
+    assert "/openai/v1/embeddings" in paths
+    assert "/openai/v1/models" in paths
+    assert "/ollama/" in paths
+    assert "/ollama/api/tags" in paths
+    assert "/ollama/api/chat" in paths
+    assert "/ollama/api/generate" in paths
+    assert "/ollama/api/show" in paths
+    assert "/ollama/api/version" in paths
+    assert "/ollama/api/embed" in paths
     assert "/healthz" in paths
     assert "/readyz" in paths
diff --git a/tests/app/api/test_dependencies.py b/tests/app/api/test_dependencies.py
index 33541f0b..168d8ab0 100644
--- a/tests/app/api/test_dependencies.py
+++ b/tests/app/api/test_dependencies.py
@@ -1,8 +1,9 @@
 import pytest
 from fastapi import HTTPException
 
-from app.api.dependencies import ModelServiceDep, validate_tracking_id
+from app.api.dependencies import ModelManagerDep, ModelServiceDep, validate_tracking_id
 from app.config import Settings
+from app.exception import ConfigurationException
 from app.model_services.medcat_model import MedCATModel
 from app.model_services.medcat_model_icd10 import MedCATModelIcd10
 from app.model_services.medcat_model_opcs4 import MedCATModelOpcs4
@@ -70,3 +71,30 @@ def test_validate_tracking_id_invalid(run_id):
         validate_tracking_id(run_id)
     assert exc_info.value.status_code == 400
     assert "Invalid tracking ID" in exc_info.value.detail
+
+
+def test_validate_tracking_id_none_returns_none():
+    assert validate_tracking_id(None) is None
+
+
+def test_model_service_dep_returns_cached_instance():
+    model_service_dep = ModelServiceDep("medcat_snomed", Settings())
+    first_instance = model_service_dep()
+    second_instance = model_service_dep()
+    assert first_instance is second_instance
+
+
+def test_model_service_dep_raises_configuration_exception_for_unknown_type():
+    model_service_dep = ModelServiceDep("unknown_model_type", Settings())
+    with pytest.raises(ConfigurationException) as exc_info:
+        model_service_dep()
+    assert "Unknown model type" in str(exc_info.value)
+
+
+def test_model_manager_dep_assigns_model_service():
+    model_service_dep = ModelServiceDep("medcat_snomed", Settings())
+    model_service = model_service_dep()
+    manager_dep = ModelManagerDep(model_service)
+    model_manager = manager_dep()
+    assert model_manager.model_service is model_service
+    assert model_manager._model_service_type == model_service.__class__
diff --git a/tests/app/api/test_serving_common.py b/tests/app/api/test_serving_common.py
index 6818a48a..16247e76 100644
--- a/tests/app/api/test_serving_common.py
+++ b/tests/app/api/test_serving_common.py
@@ -12,6 +12,7 @@
 from app.utils import get_settings, load_pydantic_object_from_dict
 from app.model_services.medcat_model import MedCATModel
 from app.management.model_manager import ModelManager
+from tests.app.helper import disable_rate_limits
 
 config = get_settings()
 config.ENABLE_TRAINING_APIS = "true"
@@ -19,6 +20,7 @@
 config.ENABLE_EVALUATION_APIS = "true"
 config.ENABLE_PREVIEWS_APIS = "true"
 config.AUTH_USER_ENABLED = "true"
+disable_rate_limits(config)
 
 TRACKING_ID = "123e4567-e89b-12d3-a456-426614174000"
 TRAINER_EXPORT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources", "fixture", "trainer_export.json")
diff --git a/tests/app/api/test_serving_hf_llm.py b/tests/app/api/test_serving_hf_llm.py
index 41aefd95..a8e924b1 100644
--- a/tests/app/api/test_serving_hf_llm.py
+++ b/tests/app/api/test_serving_hf_llm.py
@@ -7,7 +7,11 @@
 from fastapi.testclient import TestClient
 from app.api.api import get_generative_server
 from app.model_services.huggingface_llm_model import HuggingFaceLlmModel
-from app.utils import get_settings
+from app.domain import GenerationResult
+from app.exception import GenerationException
+from app.utils import get_settings, dump_pydantic_object_to_dict
+from tests.app.helper import disable_rate_limits
+
 
 config = get_settings()
 config.ENABLE_TRAINING_APIS = "true"
@@ -15,25 +19,45 @@
 config.ENABLE_EVALUATION_APIS = "true"
 config.ENABLE_PREVIEWS_APIS = "true"
 config.AUTH_USER_ENABLED = "false"
+disable_rate_limits(config)
 
 
 @pytest.fixture(scope="function")
 def llm_model_service():
-    yield create_autospec(HuggingFaceLlmModel)
+    model_service = create_autospec(HuggingFaceLlmModel)
+    model_service.model = Mock()
+    model_service.model.config = Mock()
+    model_service.model.config.max_position_embeddings = 2048
+    model_service.tokenizer = Mock()
+    model_service.tokenizer.chat_template = None
+    model_service.tokenizer.default_chat_template = None
+    model_service.tokenizer.encode.return_value = []
+    yield model_service
 
 @pytest.fixture(scope="function")
 def llm_app(llm_model_service):
-    app = get_generative_server(config, msd_overwritten=lambda: llm_model_service)
+    app = get_generative_server(
+        config, msd_overwritten=lambda: llm_model_service
+    )
     app.dependency_overrides[cms_globals.props.current_active_user] = lambda: None
     yield app
     app.dependency_overrides.clear()
 
+
 @pytest.fixture(scope="function")
 def client(llm_model_service):
     llm_model_service.model_name = "HuggingFace LLM model"
-    llm_model_service.generate.return_value = "Yeah."
+    llm_model_service.api_version = "0.0.0"
+    llm_model_service.digest = "sha256:9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08"
+    llm_model_service.generate.return_value = GenerationResult(
+        text="Yeah.",
+        prompt_token_num=1,
+        completion_token_num=1,
+    )
     llm_model_service.create_embeddings.return_value = [[1.0, 2.0, 3.0]]
-    app = get_generative_server(config, msd_overwritten=lambda: llm_model_service)
+    app = get_generative_server(
+        config, msd_overwritten=lambda: llm_model_service
+    )
     app.dependency_overrides[cms_globals.props.current_active_user] = lambda: None
     client = TestClient(app)
     yield client
@@ -55,7 +79,10 @@ def test_generate(client):
 
 @pytest.mark.asyncio
 async def test_stream_generate(llm_model_service, llm_app):
-    llm_model_service.generate_async.return_value = "Fine."
+    async def _gen():
+        yield "Fine."
+        yield GenerationResult(text="Fine.", prompt_token_num=1, completion_token_num=1)
+    llm_model_service.generate_async.return_value = _gen()
     async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
         response = await ac.post(
             "/stream/generate?max_tokens=32&temperature=0.7&top_p=0.9&stop_sequences=end",
@@ -66,13 +93,17 @@ async def test_stream_generate(llm_model_service, llm_app):
     assert response.status_code == 200
     assert response.headers["x-cms-tracking-id"], "x-cms-tracking-id header is missing"
     assert response.headers["content-type"] == "text/event-stream; charset=utf-8"
-    assert response.text == "Fine."
+    assert "Fine." in response.text
 
 
 @pytest.mark.asyncio
-async def test_generate_chat_completions(llm_model_service, llm_app):
+async def test_openai_generate_chat_completions(llm_model_service, llm_app):
     llm_model_service.model_name = "HuggingFace LLM model"
-    llm_model_service.generate.return_value = "I'm a chat bot."
+    llm_model_service.generate.return_value = GenerationResult(
+        text="I'm a chat bot.",
+        prompt_token_num=1,
+        completion_token_num=1,
+    )
     request_data = {
       "messages": [
         {
@@ -86,13 +117,14 @@ async def test_generate_chat_completions(llm_model_service, llm_app):
       ],
       "model": "HuggingFace LLM model",
       "stream": False,
+      "stream_options": {"include_usage": True},
       "max_tokens": 128,
       "temperature": 0.7
     }
 
     async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
         response = await ac.post(
-            "/v1/chat/completions?max_tokens=128&temperature=0.7",
+            "/openai/v1/chat/completions?max_tokens=128&temperature=0.7",
             data=json.dumps(request_data),
             headers={"Content-Type": "application/json"},
         )
@@ -103,10 +135,131 @@ async def test_generate_chat_completions(llm_model_service, llm_app):
     assert response_json["object"] == "chat.completion"
     assert response_json["model"] == "HuggingFace LLM model"
     assert response_json["choices"][0]["message"]["content"] == "I'm a chat bot."
+    assert "usage" in response_json
+
+
+@pytest.mark.asyncio
+async def test_openai_generate_chat_completions_with_tools(llm_model_service, llm_app):
+    llm_model_service.model_name = "HuggingFace LLM model"
+    llm_model_service.generate.return_value = GenerationResult(
+        text="I'm a chat bot.",
+        prompt_token_num=1,
+        completion_token_num=1,
+    )
+    llm_model_service.tokenizer.chat_template = "chat template"
+    llm_model_service.tokenizer.apply_chat_template.return_value = "prompt"
+    request_data = {
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a chat bot."
+        },
+        {
+          "role": "user",
+          "content": "How is the weather in London?"
+        }
+      ],
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get weather for a city",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "city": {"type": "string"}
+              },
+              "required": ["city"]
+            }
+          }
+        }
+      ],
+      "model": "HuggingFace LLM model",
+      "stream": False,
+      "stream_options": {"include_usage": True},
+      "max_tokens": 128,
+      "temperature": 0.7
+    }
+
+    async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
+        response = await ac.post(
+            "/openai/v1/chat/completions?max_tokens=128&temperature=0.7",
+            data=json.dumps(request_data),
+            headers={"Content-Type": "application/json"},
+        )
+
+    assert response.status_code == 200
+    llm_model_service.generate.assert_called_once()
+    assert llm_model_service.tokenizer.apply_chat_template.called
+    tools_arg = llm_model_service.tokenizer.apply_chat_template.call_args.kwargs["tools"]
+    assert tools_arg == [request_data["tools"][0]]
+
+
+@pytest.mark.asyncio
+async def test_openai_generate_chat_completions_with_response_format(llm_model_service, llm_app):
+    llm_model_service.model_name = "HuggingFace LLM model"
+    llm_model_service.generate.return_value = GenerationResult(
+        text="{\"age\": 28}",
+        prompt_token_num=1,
+        completion_token_num=1,
+    )
+    captured_parser = {}
+
+    def _generate(*_args, **kwargs):
+        json_schema_parser = kwargs.get("json_schema_parser")
+        if json_schema_parser is not None:
+            captured_parser["parser"] = json_schema_parser
+        return GenerationResult(
+            text="{\"age\": 28}",
+            prompt_token_num=1,
+            completion_token_num=1,
+        )
+
+    llm_model_service.generate.side_effect = _generate
+    schema = {
+        "type": "object",
+        "properties": {"age": {"type": "integer"}},
+        "required": ["age"],
+        "additionalProperties": False
+    }
+    request_data = {
+      "messages": [
+        {
+          "role": "user",
+          "content": "Extract age from the text: A 28-year-old patient."
+        }
+      ],
+      "response_format": {
+        "type": "json_schema",
+        "json_schema": {
+          "name": "person",
+          "schema": schema
+        }
+      },
+      "model": "HuggingFace LLM model",
+      "stream": False,
+      "stream_options": {"include_usage": True},
+      "max_tokens": 128,
+      "temperature": 0.7
+    }
+
+    async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
+        response = await ac.post(
+            "/openai/v1/chat/completions?max_tokens=128",
+            data=json.dumps(request_data),
+            headers={"Content-Type": "application/json"},
+        )
+
+    assert response.status_code == 200
+    llm_model_service.generate.assert_called_once()
+    assert set(schema.keys()).issubset(
+        dump_pydantic_object_to_dict(captured_parser["parser"].context.model_class).keys()
+    )
 
 
 @pytest.mark.asyncio
-async def test_generate_chat_completions_stream(llm_model_service, llm_app):
+async def test_openai_generate_chat_completions_stream(llm_model_service, llm_app):
     llm_model_service.generate.return_value = "I'm a chat bot."
     request_data = {
       "messages": [
@@ -121,13 +274,14 @@ async def test_generate_chat_completions_stream(llm_model_service, llm_app):
       ],
       "model": "HuggingFace LLM model",
       "stream": True,
+      "stream_options": {"include_usage": True},
       "max_tokens": 128,
       "temperature": 0.7
     }
 
     async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
         response = await ac.post(
-            "/v1/chat/completions?max_tokens=128&temperature=0.7",
+            "/openai/v1/chat/completions?max_tokens=128&temperature=0.7",
             data=json.dumps(request_data),
             headers={"Content-Type": "application/json"},
         )
@@ -140,12 +294,90 @@ async def test_generate_chat_completions_stream(llm_model_service, llm_app):
 
 
 @pytest.mark.asyncio
-async def test_generate_completions(llm_model_service, llm_app):
-    llm_model_service.model_name = "HuggingFace LLM model"
+async def test_openai_generate_chat_completions_stream_without_usage(llm_model_service, llm_app):
     llm_model_service.generate.return_value = "I'm a chat bot."
+    request_data = {
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a chat bot."
+        },
+        {
+          "role": "user",
+          "content": "Who are you?"
+        }
+      ],
+      "model": "HuggingFace LLM model",
+      "stream": True,
+      "stream_options": {"include_usage": False},
+      "max_tokens": 128,
+      "temperature": 0.7
+    }
+
+    async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
+        response = await ac.post(
+            "/openai/v1/chat/completions?max_tokens=128&temperature=0.7",
+            data=json.dumps(request_data),
+            headers={"Content-Type": "application/json"},
+        )
+
+    assert response.status_code == 200
+    assert response.headers["content-type"] == "text/event-stream; charset=utf-8"
+    assert '"usage"' not in response.text
+
+
+@pytest.mark.asyncio
+async def test_openai_generate_chat_completions_stream_with_generation_exception(llm_model_service, llm_app):
+    llm_model_service.model_name = "HuggingFace LLM model"
+
+    async def _failing_async_gen():
+        raise GenerationException("stream failed")
+        yield ""
+
+    llm_model_service.generate_async = Mock(return_value=_failing_async_gen())
+    request_data = {
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a chat bot."
+        },
+        {
+          "role": "user",
+          "content": "Who are you?"
+        }
+      ],
+      "model": "HuggingFace LLM model",
+      "stream": True,
+      "stream_options": {"include_usage": True},
+      "max_tokens": 128,
+      "temperature": 0.7
+    }
+
+    async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
+        response = await ac.post(
+            "/openai/v1/chat/completions?max_tokens=128&temperature=0.7",
+            data=json.dumps(request_data),
+            headers={"Content-Type": "application/json"},
+        )
+
+    assert response.status_code == 200
+    assert response.headers["content-type"] == "text/event-stream; charset=utf-8"
+    assert '"type": "generation_error"' in response.text
+    assert "[DONE]" in response.text
+
+
+@pytest.mark.asyncio
+async def test_openai_generate_completions(llm_model_service, llm_app):
+    llm_model_service.model_name = "HuggingFace LLM model"
+    llm_model_service.generate.return_value = GenerationResult(
+        text="I'm a chat bot.",
+        prompt_token_num=1,
+        completion_token_num=1,
+    )
     request_data = {
         "model": "HuggingFace LLM model",
         "prompt": "Who are you?",
+        "stream_options": {"include_usage": True},
         "max_tokens": 128,
         "temperature": 0.7,
         "stream": False,
@@ -153,7 +385,7 @@ async def test_generate_completions(llm_model_service, llm_app):
 
     async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
         response = await ac.post(
-            "/v1/completions",
+            "/openai/v1/completions",
             data=json.dumps(request_data),
             headers={"Content-Type": "application/json"},
         )
@@ -167,7 +399,7 @@ async def test_generate_completions(llm_model_service, llm_app):
 
 
 @pytest.mark.asyncio
-async def test_generate_completions_stream(llm_model_service, llm_app):
+async def test_openai_generate_completions_stream(llm_model_service, llm_app):
     llm_model_service.model_name = "HuggingFace LLM model"
 
     async def async_gen():
@@ -177,13 +409,14 @@ async def async_gen():
     request_data = {
         "model": "HuggingFace LLM model",
         "prompt": "Who are you?",
+        "stream_options": {"include_usage": True},
         "max_tokens": 128,
         "temperature": 0.7,
         "stream": True,
     }
     async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
         response = await ac.post(
-            "/v1/completions",
+            "/openai/v1/completions",
             data=json.dumps(request_data),
             headers={"Content-Type": "application/json"},
         )
@@ -196,13 +429,41 @@ async def async_gen():
     assert "[DONE]" in response.text
 
 
-def test_create_embeddings(client):
+@pytest.mark.asyncio
+async def test_openai_generate_completions_stream_with_no_usage(llm_model_service, llm_app):
+    llm_model_service.model_name = "HuggingFace LLM model"
+
+    async def async_gen():
+        yield "I'm a chat bot."
+
+    llm_model_service.generate_async.return_value = async_gen()
+    request_data = {
+        "model": "HuggingFace LLM model",
+        "prompt": "Who are you?",
+        "stream_options": {"include_usage": False},
+        "max_tokens": 128,
+        "temperature": 0.7,
+        "stream": True,
+    }
+    async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
+        response = await ac.post(
+            "/openai/v1/completions",
+            data=json.dumps(request_data),
+            headers={"Content-Type": "application/json"},
+        )
+
+    assert response.status_code == 200
+    assert response.headers["content-type"] == "text/event-stream; charset=utf-8"
+    assert '"usage"' not in response.text
+
+
+def test_openai_create_embeddings(client):
     request_data = {
         "input": ["Alright"],
         "model": "HuggingFace LLM model",
     }
     response = client.post(
-        "/v1/embeddings",
+        "/openai/v1/embeddings",
         data=json.dumps(request_data),
         headers={"Content-Type": "application/json"},
     )
@@ -215,8 +476,8 @@ def test_create_embeddings(client):
     }
 
 
-def test_list_models(client):
-    response = client.get("/v1/models")
+def test_openai_list_models(client):
+    response = client.get("/openai/v1/models")
 
     assert response.status_code == 200
     assert response.headers["content-type"] == "application/json"
@@ -229,8 +490,8 @@ def test_list_models(client):
     assert response_json["data"][0]["owned_by"] == "cms"
 
 
-def test_get_model(client):
-    response = client.get("/v1/models/HuggingFace_LLM_model")
+def test_openai_get_model(client):
+    response = client.get("/openai/v1/models/HuggingFace_LLM_model")
 
     assert response.status_code == 200
     assert response.headers["content-type"] == "application/json"
@@ -242,3 +503,231 @@ def test_get_model(client):
     assert response_json["permission"] == []
     assert response_json["root"] == "HuggingFace_LLM_model"
     assert response_json["parent"] is None
+
+
+def test_ollama_health(client):
+    get_response = client.get("/ollama")
+    head_response = client.head("/ollama")
+
+    assert get_response.status_code == 200
+    assert get_response.json() == {"status": "ok"}
+    assert head_response.status_code == 200
+
+
+def test_ollama_version(client):
+    response = client.get("/ollama/api/version")
+    assert response.status_code == 200
+    assert "version" in response.json()
+
+
+def test_ollama_tags(client, llm_model_service):
+    llm_model_service.model = Mock()
+    llm_model_service.model.config = Mock()
+    llm_model_service.model.config.model_type = "model_type"
+    llm_model_service.tokenizer = Mock(chat_template="chat template")
+    llm_model_service.info.return_value = Mock(model_card={"model_type": "model_type"})
+
+    tags_response = client.get("/ollama/api/tags")
+
+    assert tags_response.status_code == 200
+    assert tags_response.json()["models"][0]["name"] == "HuggingFace_LLM_model"
+
+
+def test_ollama_show(client, llm_model_service):
+    llm_model_service.model_name = "HuggingFace LLM model"
+    llm_model_service.model = Mock()
+    llm_model_service.model.config = Mock()
+    llm_model_service.model.config.model_type = "model_type"
+    llm_model_service.tokenizer = Mock(chat_template="chat template")
+    llm_model_service.info.return_value = Mock(model_card={"model_type": "model_type"})
+
+    response = client.post(
+        "/ollama/api/show",
+        data=json.dumps({"model": "HuggingFace_LLM_model"}),
+        headers={"Content-Type": "application/json"},
+    )
+
+    response_json = response.json()
+    assert response.status_code == 200
+    assert response_json["model_info"]["model_type"] == "model_type"
+    assert response_json["modelfile"] == "HuggingFace LLM model"
+    assert response_json["template"] == "chat template"
+    assert response_json["details"]["family"] == "model_type"
+    assert response_json["capabilities"] == ["completion", "chat", "create_embeddings"]
+
+
+def test_ollama_generate(client):
+    response = client.post(
+        "/ollama/api/generate",
+        data=json.dumps({
+            "model": "HuggingFace_LLM_model",
+            "prompt": "Hello",
+            "stream": False,
+            "options": {"num_predict": 16, "temperature": 0.2, "top_p": 0.8},
+        }),
+        headers={"Content-Type": "application/json"},
+    )
+
+    assert response.status_code == 200
+    response_json = response.json()
+    assert response_json["model"] == "HuggingFace_LLM_model"
+    assert response_json["response"] == "Yeah."
+    assert response_json["done"] is True
+
+
+def test_ollama_generate_with_format(client, llm_model_service):
+    captured_parser = {}
+
+    def _generate(*_args, **kwargs):
+        json_schema_parser = kwargs.get("json_schema_parser")
+        if json_schema_parser is not None:
+            captured_parser["parser"] = json_schema_parser
+        return GenerationResult(
+            text="{\"age\": 28}",
+            prompt_token_num=1,
+            completion_token_num=1,
+        )
+
+    llm_model_service.generate.side_effect = _generate
+    format = {
+        "type": "object",
+        "properties": {"age": {"type": "integer"}},
+        "required": ["age"],
+        "additionalProperties": False,
+    }
+
+    response = client.post(
+        "/ollama/api/generate",
+        data=json.dumps({
+            "model": "HuggingFace_LLM_model",
+            "prompt": "Extract age",
+            "stream": False,
+            "format": format,
+        }),
+        headers={"Content-Type": "application/json"},
+    )
+
+    assert response.status_code == 200
+    assert set(format.keys()).issubset(
+        dump_pydantic_object_to_dict(captured_parser["parser"].context.model_class).keys()
+    )
+
+
+@pytest.mark.asyncio
+async def test_ollama_generate_stream(llm_model_service, llm_app):
+    llm_model_service.model_name = "HuggingFace LLM model"
+
+    async def async_gen():
+        yield "hi "
+        yield "there"
+    llm_model_service.generate_async = Mock(return_value=async_gen())
+
+    async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
+        response = await ac.post(
+            "/ollama/api/generate",
+            data=json.dumps({
+                "model": "HuggingFace_LLM_model",
+                "prompt": "Hello",
+                "stream": True,
+            }),
+            headers={"Content-Type": "application/json"},
+        )
+
+    assert response.status_code == 200
+    assert response.headers["content-type"] == "application/x-ndjson"
+    assert '"done": false' in response.text.lower()
+    assert '"done": true' in response.text.lower()
+
+
+@pytest.mark.asyncio
+async def test_ollama_generate_stream_with_generation_exception(llm_model_service, llm_app):
+    llm_model_service.model_name = "HuggingFace LLM model"
+
+    async def _failing_async_gen():
+        raise GenerationException("stream failed")
+        yield ""
+
+    llm_model_service.generate_async = Mock(return_value=_failing_async_gen())
+
+    async with httpx.AsyncClient(app=llm_app, base_url="http://test") as ac:
+        response = await ac.post(
+            "/ollama/api/generate",
+            data=json.dumps({
+                "model": "HuggingFace_LLM_model",
+                "prompt": "Hello",
+                "stream": True,
+            }),
+            headers={"Content-Type": "application/json"},
+        )
+
+    assert response.status_code == 200
+    assert response.headers["content-type"] == "application/x-ndjson"
+    assert '"done_reason": "error"' in response.text
+    assert '"error": "stream failed"' in response.text
+
+
+def test_ollama_chat(client):
+    chat_response = client.post(
+        "/ollama/api/chat",
+        data=json.dumps({
+            "model": "HuggingFace_LLM_model",
+            "messages": [{"role": "user", "content": "Hi"}],
+            "stream": False,
+        }),
+        headers={"Content-Type": "application/json"},
+    )
+
+    assert chat_response.status_code == 200
+    assert chat_response.json()["message"]["role"] == "assistant"
+
+
+def test_ollama_chat_with_format(client, llm_model_service):
+    captured_parser = {}
+
+    def _generate(*_args, **kwargs):
+        json_schema_parser = kwargs.get("json_schema_parser")
+        if json_schema_parser is not None:
+            captured_parser["parser"] = json_schema_parser
+        return GenerationResult(
+            text="{\"age\": 28}",
+            prompt_token_num=1,
+            completion_token_num=1,
+        )
+
+    llm_model_service.generate.side_effect = _generate
+    format = {
+        "type": "object",
+        "properties": {"age": {"type": "integer"}},
+        "required": ["age"],
+        "additionalProperties": False,
+    }
+
+    chat_response = client.post(
+        "/ollama/api/chat",
+        data=json.dumps({
+            "model": "HuggingFace_LLM_model",
+            "messages": [{"role": "user", "content": "Extract age"}],
+            "stream": False,
+            "format": format,
+        }),
+        headers={"Content-Type": "application/json"},
+    )
+
+    assert chat_response.status_code == 200
+    assert set(format.keys()).issubset(
+        dump_pydantic_object_to_dict(captured_parser["parser"].context.model_class).keys()
+    )
+
+
+def test_ollama_embed(client):
+    response = client.post(
+        "/ollama/api/embed",
+        data=json.dumps({
+            "model": "HuggingFace_LLM_model",
+            "input": ["test", "blah"],
+        }),
+        headers={"Content-Type": "application/json"},
+    )
+
+    assert response.status_code == 200
+    assert "embeddings" in response.json()
diff --git a/tests/app/api/test_serving_hf_ner.py b/tests/app/api/test_serving_hf_ner.py
index 91a76da6..73e6c421 100644
--- a/tests/app/api/test_serving_hf_ner.py
+++ b/tests/app/api/test_serving_hf_ner.py
@@ -7,6 +7,7 @@
 from app.utils import get_settings, load_pydantic_object_from_dict
 from app.model_services.huggingface_ner_model import HuggingFaceNerModel
 from app.domain import ModelCard, ModelType
+from tests.app.helper import disable_rate_limits
 
 config = get_settings()
 config.ENABLE_TRAINING_APIS = "true"
@@ -14,6 +15,7 @@
 config.ENABLE_EVALUATION_APIS = "true"
 config.ENABLE_PREVIEWS_APIS = "true"
 config.AUTH_USER_ENABLED = "true"
+disable_rate_limits(config)
 
 TRAINER_EXPORT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources", "fixture", "trainer_export.json")
 NOTE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources", "fixture", "note.txt")
diff --git a/tests/app/api/test_serving_stream.py b/tests/app/api/test_serving_stream.py
index a8c2c5ac..10d16e7e 100644
--- a/tests/app/api/test_serving_stream.py
+++ b/tests/app/api/test_serving_stream.py
@@ -1,3 +1,4 @@
+import asyncio
 import httpx
 import json
 import pytest
@@ -6,11 +7,13 @@
 
 from fastapi.testclient import TestClient
 from starlette.websockets import WebSocketDisconnect
-from unittest.mock import create_autospec
+from unittest.mock import create_autospec, AsyncMock
 from app.api.api import get_stream_server
 from app.utils import get_settings
 from app.model_services.medcat_model import MedCATModel
 from app.management.model_manager import ModelManager
+from app.domain import Annotation
+from tests.app.helper import disable_rate_limits
 
 
 @pytest.fixture(scope="function")
@@ -27,6 +30,7 @@ def ner_app(ner_model_service):
     config.ENABLE_PREVIEWS_APIS = "true"
     config.AUTH_USER_ENABLED = "false"
     config.AUTH_USER_ENABLED = "false"
+    disable_rate_limits(config)
     app = get_stream_server(config, msd_overwritten=lambda: ner_model_service)
     app.dependency_overrides[cms_globals.props.current_active_user] = lambda: None
     yield app
@@ -89,6 +93,17 @@ async def test_stream_process_unknown_jsonl_property(ner_app):
     assert "Invalid JSON properties found" in json.loads(jsonlines.decode("utf-8").splitlines()[-1])["error"]
 
 
+def test_websocket_info(ner_app):
+    with TestClient(ner_app) as client:
+        response = client.get("/stream/ws", headers={"x-forwarded-prefix": "/cms/medcat"})
+
+    assert response.status_code == 200
+    response_json = response.json()
+    assert response_json["protocol"] == "WebSocket"
+    assert "message" in response_json
+    assert "example" in response_json
+
+
 def test_websocket_process_on_annotation_error(ner_model_service, ner_app):
     ner_model_service.annotate_async.side_effect = Exception("something went wrong")
     model_manager = ModelManager(None, None)
@@ -100,3 +115,80 @@ def test_websocket_process_on_annotation_error(ner_model_service, ner_app):
             websocket.send_text("Spinal stenosis")
             response = websocket.receive_text()
             assert response == "ERROR: something went wrong"
+
+
+@pytest.mark.skip()
+@pytest.mark.asyncio
+async def test_sse_process(ner_app, ner_model_service):
+    fake_annotation = Annotation(
+        label_id="C123456",
+        label_name="test",
+        start=0,
+        end=4,
+        confidence=0.9,
+        doc_name="0",
+    )
+    ner_model_service.annotate_async = AsyncMock(return_value=[fake_annotation])
+
+    async with (
+        httpx.AsyncClient(app=ner_app, base_url="http://test", timeout=30.0) as sse_client,
+        httpx.AsyncClient(app=ner_app, base_url="http://test", timeout=30.0) as post_client,
+    ):
+        sse_data = []
+        connection_established = asyncio.Event()
+
+        async def consume_sse():
+            async with sse_client.stream(
+                "GET", "/stream/sse/events?client_id=abc"
+            ) as response:
+                assert response.status_code == 200
+                assert "text/event-stream" in response.headers["content-type"]
+
+                async for line in response.aiter_lines():
+                    if not line:
+                        continue
+
+                    if line.startswith(": connected"):
+                        connection_established.set()
+                        continue
+
+                    if line.startswith(":"):
+                        continue
+
+                    if line.startswith("data: "):
+                        payload = json.loads(line[6:])
+                        sse_data.append(payload)
+
+                        if payload.get("status") == "all_completed":
+                            break
+
+        async def send_data():
+            await asyncio.wait_for(connection_established.wait(), timeout=5)
+
+            response = await post_client.post(
+                "/stream/sse/process?client_id=abc",
+                content='{"text": "This is a test"}\n',
+                headers={"Content-Type": "application/x-ndjson"},
+            )
+
+            assert response.status_code == 202
+
+        await asyncio.wait_for(
+            asyncio.gather(consume_sse(), send_data()),
+            timeout=5.0,
+        )
+
+        assert len(sse_data) > 0, "No SSE events received"
+
+        statuses = [event.get("status") for event in sse_data if "status" in event]
+        assert "started" in statuses
+        assert "completed" in statuses
+        assert "all_completed" in statuses
+
+        annotations = [
+            event["data"] for event in sse_data if event.get("type") == "annotation"
+        ]
+
+        assert len(annotations) == 1
+        assert annotations[0]["label_id"] == "C123456"
+        assert annotations[0]["label_name"] == "test"
diff --git a/tests/app/api/test_serving_trf.py b/tests/app/api/test_serving_trf.py
index a5244178..b7bac6d2 100644
--- a/tests/app/api/test_serving_trf.py
+++ b/tests/app/api/test_serving_trf.py
@@ -7,10 +7,12 @@
 from app.utils import get_settings, load_pydantic_object_from_dict
 from app.model_services.trf_model_deid import TransformersModelDeIdentification
 from app.domain import ModelCard, ModelType, Annotation
+from tests.app.helper import disable_rate_limits
 
 
 config = get_settings()
 config.AUTH_USER_ENABLED = "true"
+disable_rate_limits(config)
 
 
 @pytest.fixture(scope="function")
diff --git a/tests/app/api/test_utils.py b/tests/app/api/test_utils.py
index 264b9ca8..f635bc22 100644
--- a/tests/app/api/test_utils.py
+++ b/tests/app/api/test_utils.py
@@ -1,4 +1,9 @@
-from fastapi import FastAPI
+import pytest
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+from fastapi.testclient import TestClient
+from app import __version__ as app_version
+from app.domain import ModelType
 from app.utils import get_settings
 from app.api.utils import (
     add_exception_handlers,
@@ -6,7 +11,13 @@
     get_rate_limiter,
     encrypt,
     decrypt,
+    ForwardedPrefixMiddleware,
+    init_vllm_engine,
+    init_sglang_engine,
 )
+import sys
+import types
+import contextlib
 
 
 def test_add_exception_handlers():
@@ -91,3 +102,282 @@ def test_decrypt():
     encrypted = "TLlMBh4GDf3BSsO/RKlqG5H7Sxv7OXGbl8qE/6YLQPm3coBbnrRRReX7pLamnjLPUU0PtIRIg2H/hWBWE/3cRtXDPT7jMtmGHMIPO/95A0DkrndIkOeQ29J6TBPBBG6YqBNRb2dyhDBwDIEDjPTiRe68sYz4KkxzSOkcz31314kSkZvdIDtQOgeRDa0/7U0VrJePL2N7SJvEiHf4Xa3vW3/20S3O8s/Yp0Azb/kS9dFa54VO1fNNhJ46OtPpdekiFDR5yvQfHwFVeSDdY+eAuYLTWa6bz/LrQkRAdRi9EW5Iz/q8WgKhZXQJfcXtiKfVuFar2N2KodY7C/45vMOfvw=="
     decrypted = decrypt(encrypted, fake_private_key_pem)
     assert decrypted == "test"
+
+
+def test_forwarded_prefix_middleware():
+    app = FastAPI()
+    app.add_middleware(ForwardedPrefixMiddleware)
+
+    @app.get("/ping")
+    async def ping(request: Request) -> dict:
+        return {"root_path": request.scope.get("root_path")}
+
+    with TestClient(app) as client:
+        response = client.get("/ping", headers={"x-forwarded-prefix": "/cms/huggingface-llm"})
+
+    assert response.status_code == 200
+    assert response.json() == {"root_path": "/cms/huggingface-llm"}
+
+
+@pytest.mark.asyncio
+async def test_init_vllm_engine(monkeypatch):
+    fake_vllm = types.ModuleType("vllm")
+    fake_utils = types.ModuleType("vllm.utils")
+    fake_argparse_utils = types.ModuleType("vllm.utils.argparse_utils")
+    fake_engine = types.ModuleType("vllm.engine")
+    fake_engine_arg_utils = types.ModuleType("vllm.engine.arg_utils")
+    fake_entrypoints = types.ModuleType("vllm.entrypoints")
+    fake_openai = types.ModuleType("vllm.entrypoints.openai")
+    fake_cli_args = types.ModuleType("vllm.entrypoints.openai.cli_args")
+    fake_chat_utils = types.ModuleType("vllm.entrypoints.chat_utils")
+    fake_api_server = types.ModuleType("vllm.entrypoints.openai.api_server")
+
+    class FlexibleArgumentParser:
+        def parse_args(self, _):
+            return types.SimpleNamespace()
+
+    def make_arg_parser(parser):
+        return parser
+
+    def validate_parsed_serve_args(args):
+        return args
+
+    class AsyncEngineArgs:
+        @classmethod
+        def from_cli_args(cls, _args):
+            return cls()
+
+    async def parse_chat_messages(messages, model_config, content_format="string"):
+        return messages, None, None
+
+    def apply_hf_chat_template(*_args, **_kwargs):
+        return "prompt"
+
+    async def show_available_models(_request):
+        return JSONResponse(content={"data": []})
+
+    async def show_version():
+        return JSONResponse(content={"version": "test"})
+
+    async def create_chat_completion(_request, _raw_request):
+        return JSONResponse(content={"choices": []})
+
+    async def create_completion(_request, _raw_request):
+        return JSONResponse(content={"choices": []})
+
+    async def create_responses(_request, _raw_request):
+        return JSONResponse(content={"output": []})
+
+    async def retrieve_responses(_response_id, _raw_request):
+        return JSONResponse(content={"output": []})
+
+    async def create_messages(_request, _raw_request):
+        return JSONResponse(content={"messages": []})
+
+    async def create_transcriptions(_request, _raw_request):
+        return JSONResponse(content={"text": ""})
+
+    async def create_translations(_request, _raw_request):
+        return JSONResponse(content={"text": ""})
+
+    @contextlib.asynccontextmanager
+    async def build_async_engine_client_from_engine_args(*_args, **_kwargs):
+        class _Engine:
+            vllm_config = object()
+            model_config = types.SimpleNamespace(to_dict=lambda: {"name_or_path": "model_path"})
+
+            async def get_tokenizer(self):
+                class _Tokenizer:
+                    chat_template = None
+                    default_chat_template = None
+
+                    def __call__(self, text, add_special_tokens=False):
+                        return types.SimpleNamespace(input_ids=[1, 2, 3])
+
+                return _Tokenizer()
+
+            async def get_model_config(self):
+                return self.model_config
+
+        yield _Engine()
+
+    async def init_app_state(_engine, _state, _args):
+        return None
+
+    class SamplingParams:
+        def __init__(self, max_tokens=0):
+            self.max_tokens = max_tokens
+
+    class TokensPrompt(dict):
+        def __init__(self, prompt_token_ids=None):
+            super().__init__(prompt_token_ids=prompt_token_ids)
+
+    fake_argparse_utils.FlexibleArgumentParser = FlexibleArgumentParser
+    fake_cli_args.make_arg_parser = make_arg_parser
+    fake_cli_args.validate_parsed_serve_args = validate_parsed_serve_args
+    fake_engine_arg_utils.AsyncEngineArgs = AsyncEngineArgs
+    fake_chat_utils.parse_chat_messages = parse_chat_messages
+    fake_chat_utils.apply_hf_chat_template = apply_hf_chat_template
+    fake_api_server.create_chat_completion = create_chat_completion
+    fake_api_server.create_completion = create_completion
+    fake_api_server.create_responses = create_responses
+    fake_api_server.retrieve_responses = retrieve_responses
+    fake_api_server.create_messages = create_messages
+    fake_api_server.create_transcriptions = create_transcriptions
+    fake_api_server.create_translations = create_translations
+    fake_api_server.show_available_models = show_available_models
+    fake_api_server.show_version = show_version
+    fake_api_server.build_async_engine_client_from_engine_args = build_async_engine_client_from_engine_args
+    fake_api_server.init_app_state = init_app_state
+    fake_vllm.SamplingParams = SamplingParams
+    fake_vllm.TokensPrompt = TokensPrompt
+
+    monkeypatch.setitem(sys.modules, "vllm", fake_vllm)
+    monkeypatch.setitem(sys.modules, "vllm.utils", fake_utils)
+    monkeypatch.setitem(sys.modules, "vllm.utils.argparse_utils", fake_argparse_utils)
+    monkeypatch.setitem(sys.modules, "vllm.engine", fake_engine)
+    monkeypatch.setitem(sys.modules, "vllm.engine.arg_utils", fake_engine_arg_utils)
+    monkeypatch.setitem(sys.modules, "vllm.entrypoints", fake_entrypoints)
+    monkeypatch.setitem(sys.modules, "vllm.entrypoints.openai", fake_openai)
+    monkeypatch.setitem(sys.modules, "vllm.entrypoints.openai.cli_args", fake_cli_args)
+    monkeypatch.setitem(sys.modules, "vllm.entrypoints.chat_utils", fake_chat_utils)
+    monkeypatch.setitem(sys.modules, "vllm.entrypoints.openai.api_server", fake_api_server)
+
+    app = FastAPI()
+    app = await init_vllm_engine(app, get_settings(), "model_path", "model_name", "info")
+    paths = {route.path for route in app.router.routes}
+    assert "/info" in paths
+    assert "/generate" in paths
+    assert "/v1/chat/completions" in paths
+    assert "/v1/completions" in paths
+    assert "/v1/models" in paths
+    assert "/v1/version" in paths
+    with TestClient(app) as client:
+        response = client.get("/info")
+    assert response.status_code == 200
+    data = response.json()
+    assert data["api_version"] == app_version
+    assert data["model_description"] == "model_name"
+    assert data["model_type"] == ModelType.HUGGINGFACE_LLM.value
+    assert data["model_card"] == {"name_or_path": "model_path"}
+
+
+@pytest.mark.asyncio
+async def test_init_sglang_engine(monkeypatch):
+    fake_sglang = types.ModuleType("sglang")
+    fake_srt = types.ModuleType("sglang.srt")
+    fake_entrypoints = types.ModuleType("sglang.srt.entrypoints")
+    fake_engine_mod = types.ModuleType("sglang.srt.entrypoints.engine")
+    fake_http_server = types.ModuleType("sglang.srt.entrypoints.http_server")
+    fake_server_args_mod = types.ModuleType("sglang.srt.server_args")
+    fake_openai_mod = types.ModuleType("sglang.srt.entrypoints.openai")
+    fake_serving_chat = types.ModuleType("sglang.srt.entrypoints.openai.serving_chat")
+    fake_serving_completions = types.ModuleType("sglang.srt.entrypoints.openai.serving_completions")
+    fake_serving_embedding = types.ModuleType("sglang.srt.entrypoints.openai.serving_embedding")
+    fake_serving_score = types.ModuleType("sglang.srt.entrypoints.openai.serving_score")
+    fake_serving_rerank = types.ModuleType("sglang.srt.entrypoints.openai.serving_rerank")
+    fake_metrics = types.ModuleType("sglang.srt.metrics")
+    fake_func_timer = types.ModuleType("sglang.srt.metrics.func_timer")
+    fake_version = types.ModuleType("sglang.version")
+    fake_utils_mod = types.ModuleType("sglang.srt.utils")
+
+    class _ServerArgs:
+        model_path = "model_path"
+        served_model_name = "model_name"
+        log_level = "info"
+        log_level_http = "info"
+        tokenizer_worker_num = 1
+        skip_server_warmup = False
+        quantization = None
+        model_impl = "transformers"
+        mem_fraction_static = 0.9
+        api_key = None
+        enable_metrics = False
+
+    def prepare_server_args(_argv):
+        return _ServerArgs()
+
+    class _TokenizerManager:
+        _subprocess_watchdog = None
+        model_config = types.SimpleNamespace(to_dict=lambda: {"name_or_path": "model_path"})
+
+    class _TemplateManager:
+        pass
+
+    def _launch_subprocesses(server_args, init_tokenizer_manager_func, run_scheduler_process_func, run_detokenizer_process_func):
+        return _TokenizerManager(), _TemplateManager(), [object()], object()
+
+    def init_tokenizer_manager(*_a, **_kw): pass
+    def run_detokenizer_process(*_a, **_kw): pass
+    def run_scheduler_process(*_a, **_kw): pass
+
+    class _GlobalState:
+        def __init__(self, tokenizer_manager, template_manager, scheduler_info): pass
+
+    def set_global_state(_gs): pass
+
+    async def generate_request(_request): return JSONResponse(content={})
+    async def openai_v1_chat_completions(_request): return JSONResponse(content={})
+    async def openai_v1_completions(_request): return JSONResponse(content={})
+    async def available_models(_request): return JSONResponse(content={"data": []})
+    async def validate_json_request(_request): pass
+
+    fake_engine_mod._launch_subprocesses = _launch_subprocesses
+    fake_engine_mod.init_tokenizer_manager = init_tokenizer_manager
+    fake_engine_mod.run_detokenizer_process = run_detokenizer_process
+    fake_engine_mod.run_scheduler_process = run_scheduler_process
+    fake_server_args_mod.prepare_server_args = prepare_server_args
+    fake_http_server._GlobalState = _GlobalState
+    fake_http_server.set_global_state = set_global_state
+    fake_http_server.generate_request = generate_request
+    fake_http_server.openai_v1_chat_completions = openai_v1_chat_completions
+    fake_http_server.openai_v1_completions = openai_v1_completions
+    fake_http_server.available_models = available_models
+    fake_http_server.validate_json_request = validate_json_request
+    fake_serving_chat.OpenAIServingChat = lambda *a, **kw: object()
+    fake_serving_completions.OpenAIServingCompletion = lambda *a, **kw: object()
+    fake_serving_embedding.OpenAIServingEmbedding = lambda *a, **kw: object()
+    fake_serving_score.OpenAIServingScore = lambda *a, **kw: object()
+    fake_serving_rerank.OpenAIServingRerank = lambda *a, **kw: object()
+    fake_func_timer.enable_func_timer = lambda: None
+    fake_version.__version__ = "0.0.0"
+    fake_utils_mod.add_api_key_middleware = None
+    fake_utils_mod.add_prometheus_middleware = None
+
+    for name, mod in [
+        ("sglang", fake_sglang),
+        ("sglang.srt", fake_srt),
+        ("sglang.srt.entrypoints", fake_entrypoints),
+        ("sglang.srt.entrypoints.engine", fake_engine_mod),
+        ("sglang.srt.entrypoints.http_server", fake_http_server),
+        ("sglang.srt.server_args", fake_server_args_mod),
+        ("sglang.srt.entrypoints.openai", fake_openai_mod),
+        ("sglang.srt.entrypoints.openai.serving_chat", fake_serving_chat),
+        ("sglang.srt.entrypoints.openai.serving_completions", fake_serving_completions),
+        ("sglang.srt.entrypoints.openai.serving_embedding", fake_serving_embedding),
+        ("sglang.srt.entrypoints.openai.serving_score", fake_serving_score),
+        ("sglang.srt.entrypoints.openai.serving_rerank", fake_serving_rerank),
+        ("sglang.srt.metrics", fake_metrics),
+        ("sglang.srt.metrics.func_timer", fake_func_timer),
+        ("sglang.version", fake_version),
+        ("sglang.srt.utils", fake_utils_mod),
+    ]:
+        monkeypatch.setitem(sys.modules, name, mod)
+
+    app = FastAPI()
+    app = await init_sglang_engine(app, get_settings(), "model_path", "model_name", "info")
+    paths = {route.path for route in app.router.routes}
+    assert "/info" in paths
+    assert "/generate" in paths
+    assert "/v1/chat/completions" in paths
+    assert "/v1/completions" in paths
+    assert "/v1/models" in paths
+    assert "/v1/version" in paths
+    with TestClient(app) as client:
+        response = client.get("/info")
+    assert response.status_code == 200
+    data = response.json()
+    assert data["api_version"] == app_version
+    assert data["model_description"] == "model_name"
+    assert data["model_type"] == ModelType.HUGGINGFACE_LLM.value
+    assert data["model_card"] == {"name_or_path": "model_path"}
diff --git a/tests/app/cli/test_cli.py b/tests/app/cli/test_cli.py
index e7605903..3715668a 100644
--- a/tests/app/cli/test_cli.py
+++ b/tests/app/cli/test_cli.py
@@ -1,4 +1,6 @@
 import os
+import unittest
+
 import pytest
 from typer.testing import CliRunner
 from unittest.mock import patch
@@ -101,7 +103,6 @@ def test_generate_api_doc_help():
     assert result.exit_code == 0
     assert "This generates an API document for all endpoints defined in CMS" in result.output
 
-
 def test_generate_api_doc():
     result = runner.invoke(cmd_app, ["export-openapi-spec", "--api-title", "TestAPIs"])
     assert result.exit_code == 0
diff --git a/tests/app/conftest.py b/tests/app/conftest.py
index ff3ba3a7..2f40f12a 100644
--- a/tests/app/conftest.py
+++ b/tests/app/conftest.py
@@ -50,7 +50,9 @@ def medcat_snomed_model():
     config = Settings()
     config.BASE_MODEL_FILE = "snomed_model.zip"
     config.TYPE_UNIQUE_ID_WHITELIST = "91776366,81102976,28321150,67667581,9090192,27603525"
-    return MedCATModelSnomed(config, MODEL_PARENT_DIR, True)
+    model_service = MedCATModelSnomed(config, MODEL_PARENT_DIR, True)
+    yield model_service
+    model_service.shutdown()
 
 
 @pytest.fixture(scope="function")
@@ -58,7 +60,9 @@ def medcat_icd10_model():
     config = Settings()
     config.BASE_MODEL_FILE = "icd10_model.zip"
     config.TYPE_UNIQUE_ID_WHITELIST = "91776366,81102976,28321150,67667581,9090192,27603525"
-    return MedCATModelIcd10(config, MODEL_PARENT_DIR, True)
+    model_service = MedCATModelIcd10(config, MODEL_PARENT_DIR, True)
+    yield model_service
+    model_service.shutdown()
 
 
 @pytest.fixture(scope="function")
@@ -66,14 +70,18 @@ def medcat_opcs4_model():
     config = Settings()
     config.BASE_MODEL_FILE = "opcs4_model.zip"
     config.TYPE_UNIQUE_ID_WHITELIST = "T-9,T-11,T-18,T-39,T-40,T-45"
-    return MedCATModelOpcs4(config, MODEL_PARENT_DIR, True)
+    model_service = MedCATModelOpcs4(config, MODEL_PARENT_DIR, True)
+    yield model_service
+    model_service.shutdown()
 
 
 @pytest.fixture(scope="function")
 def medcat_umls_model():
     config = Settings()
     config.BASE_MODEL_FILE = "umls_model.zip"
-    return MedCATModelUmls(config, MODEL_PARENT_DIR, True)
+    model_service = MedCATModelUmls(config, MODEL_PARENT_DIR, True)
+    yield model_service  # hand the service to the test; the line below runs at fixture teardown
+    model_service.shutdown()  # release the service once the test that used it has finished
 
 
 @pytest.fixture(scope="function")
@@ -81,14 +89,18 @@ def medcat_deid_model():
     config = Settings()
     config.BASE_MODEL_FILE = "deid_model.zip"
     config.INCLUDE_SPAN_TEXT = "true"
-    return MedCATModelDeIdentification(config, MODEL_PARENT_DIR, True)
+    model_service = MedCATModelDeIdentification(config, MODEL_PARENT_DIR, True)
+    yield model_service
+    model_service.shutdown()
 
 
 @pytest.fixture(scope="function")
 def trf_model():
     config = Settings()
     config.BASE_MODEL_FILE = "trf_deid_model.zip"
-    return TransformersModelDeIdentification(config, MODEL_PARENT_DIR)
+    model_service = TransformersModelDeIdentification(config, MODEL_PARENT_DIR)
+    yield model_service  # hand the service to the test; the line below runs at fixture teardown
+    model_service.shutdown()  # release the service once the test that used it has finished
 
 
 @pytest.fixture(scope="function")
@@ -98,15 +110,16 @@ def huggingface_ner_model():
     config.INCLUDE_SPAN_TEXT = "true"
     model_service = HuggingFaceNerModel(config, MODEL_PARENT_DIR)
     model_service.init_model()
-    return model_service
+    yield model_service
+    model_service.shutdown()
 
 
 @pytest.fixture(scope="function")
 def huggingface_llm_model():
     config = Settings()
     config.BASE_MODEL_FILE = "huggingface_llm_model.tar.gz"
-    config.TRAINING_HF_TAGGING_SCHEME = "flat"
+    config.TRAINING_HF_NER_TAGGING_SCHEME = "flat"
+    config.OVERRIDE_CHAT_TEMPLATE = "{{ custom_template }}"  # default override; init_model() is no longer called here
     model_service = HuggingFaceLlmModel(config, MODEL_PARENT_DIR)
-    model_service.init_model()
     yield model_service
-    model_service.close()
+    model_service.shutdown()  # teardown step, runs after the test that used the fixture
diff --git a/tests/app/helper.py b/tests/app/helper.py
index e419f07b..ae55df1b 100644
--- a/tests/app/helper.py
+++ b/tests/app/helper.py
@@ -1,5 +1,6 @@
 import time
 import mlflow
+from app.config import Settings
 
 
 def ensure_no_active_run(timeout_seconds: int = 600) -> None:
@@ -17,6 +18,12 @@ def ensure_no_active_run(timeout_seconds: int = 600) -> None:
             time.sleep(1)
 
 
+def disable_rate_limits(config: Settings):  # test helper: mutates the given Settings in place, returns nothing
+    config.PROCESS_RATE_LIMIT = ""  # each limit is set to an empty string; presumably read as "no limit" - TODO confirm
+    config.PROCESS_BULK_RATE_LIMIT = ""
+    config.GENERATION_RATE_LIMIT = ""
+
+
 class StringContains(str):
     def __eq__(self, other):
         return self in other
diff --git a/tests/app/mcp/oauth/test_middleware.py b/tests/app/mcp/oauth/test_middleware.py
index dde58604..36f62f8a 100644
--- a/tests/app/mcp/oauth/test_middleware.py
+++ b/tests/app/mcp/oauth/test_middleware.py
@@ -18,6 +18,7 @@ def oauth_token():
         expires_in=3600,
         refresh_token="test_refresh_token",
     )
+
 @pytest.fixture
 def mock_app():
     async def mock_scope(scope, receive, send):
@@ -79,9 +80,9 @@ async def test_dispatch_sse_bypasses_auth(self, mock_app, mock_oauth_manager):
         request.cookies = {}
         request.headers = {}
         call_next = AsyncMock(return_value=Mock())
-        
+
         await middleware.dispatch(request, call_next)
-        
+
         call_next.assert_called_once_with(request)
 
     @pytest.mark.asyncio
@@ -96,9 +97,9 @@ async def test_dispatch_public_path_bypasses_auth(self, mock_app, mock_oauth_man
         request.cookies = {}
         request.headers = {}
         call_next = AsyncMock(return_value=Mock())
-        
+
         await middleware.dispatch(request, call_next)
-        
+
         call_next.assert_called_once_with(request)
 
     @pytest.mark.asyncio
@@ -114,9 +115,9 @@ async def test_dispatch_no_session_returns_401(self, mock_app, mock_oauth_manage
         request.headers = {}
         request.query_params = {}
         call_next = AsyncMock(return_value=Mock())
-        
+
         response = await middleware.dispatch(request, call_next)
-        
+
         assert isinstance(response, JSONResponse)
         assert response.status_code == 401
         call_next.assert_not_called()
@@ -124,7 +125,7 @@ async def test_dispatch_no_session_returns_401(self, mock_app, mock_oauth_manage
     @pytest.mark.asyncio
     async def test_dispatch_session_cookie_authenticated(self, mock_app, mock_oauth_manager, oauth_token):
         mock_oauth_manager.get_valid_token = AsyncMock(return_value=oauth_token)
-        
+
         middleware = OAuthMiddleware(
             app=mock_app,
             oauth_manager=mock_oauth_manager,
@@ -137,9 +138,9 @@ async def test_dispatch_session_cookie_authenticated(self, mock_app, mock_oauth_
         request.query_params = {}
         request.state = Mock()
         call_next = AsyncMock(return_value=Mock())
-        
+
         await middleware.dispatch(request, call_next)
-        
+
         mock_oauth_manager.get_valid_token.assert_called_once_with("valid_session_id")
         call_next.assert_called_once_with(request)
         assert request.state.oauth_token == oauth_token
@@ -148,7 +149,7 @@ async def test_dispatch_session_cookie_authenticated(self, mock_app, mock_oauth_
     @pytest.mark.asyncio
     async def test_dispatch_bearer_token_authenticated(self, mock_app, mock_oauth_manager, oauth_token):
         mock_oauth_manager.get_valid_token = AsyncMock(return_value=oauth_token)
-        
+
         middleware = OAuthMiddleware(
             app=mock_app,
             oauth_manager=mock_oauth_manager,
@@ -161,16 +162,16 @@ async def test_dispatch_bearer_token_authenticated(self, mock_app, mock_oauth_ma
         request.query_params = {}
         request.state = Mock()
         call_next = AsyncMock(return_value=Mock())
-        
+
         await middleware.dispatch(request, call_next)
-        
+
         mock_oauth_manager.get_valid_token.assert_called_once_with("valid_token")
         call_next.assert_called_once_with(request)
 
     @pytest.mark.asyncio
     async def test_dispatch_query_param_session_authenticated(self, mock_app, mock_oauth_manager, oauth_token):
         mock_oauth_manager.get_valid_token = AsyncMock(return_value=oauth_token)
-        
+
         middleware = OAuthMiddleware(
             app=mock_app,
             oauth_manager=mock_oauth_manager,
@@ -183,9 +184,9 @@ async def test_dispatch_query_param_session_authenticated(self, mock_app, mock_o
         request.query_params = {"session_id": "valid_session_id"}
         request.state = Mock()
         call_next = AsyncMock(return_value=Mock())
-        
+
         await middleware.dispatch(request, call_next)
-        
+
         mock_oauth_manager.get_valid_token.assert_called_once_with("valid_session_id")
         call_next.assert_called_once_with(request)
 
@@ -193,7 +194,7 @@ async def test_dispatch_query_param_session_authenticated(self, mock_app, mock_o
     async def test_dispatch_invalid_session_returns_401(self, mock_app, mock_oauth_manager):
         """Test that invalid/expired session returns 401"""
         mock_oauth_manager.get_valid_token = AsyncMock(return_value=None)
-        
+
         middleware = OAuthMiddleware(
             app=mock_app,
             oauth_manager=mock_oauth_manager,
@@ -206,9 +207,9 @@ async def test_dispatch_invalid_session_returns_401(self, mock_app, mock_oauth_m
         request.query_params = {}
         request.state = Mock()
         call_next = AsyncMock(return_value=Mock())
-        
+
         response = await middleware.dispatch(request, call_next)
-        
+
         assert isinstance(response, JSONResponse)
         assert response.status_code == 401
         call_next.assert_not_called()
@@ -216,7 +217,7 @@ async def test_dispatch_invalid_session_returns_401(self, mock_app, mock_oauth_m
     @pytest.mark.asyncio
     async def test_dispatch_exception_returns_500(self, mock_app, mock_oauth_manager):
         mock_oauth_manager.get_valid_token = AsyncMock(side_effect=Exception("Database error"))
-        
+
         middleware = OAuthMiddleware(
             app=mock_app,
             oauth_manager=mock_oauth_manager,
@@ -229,9 +230,9 @@ async def test_dispatch_exception_returns_500(self, mock_app, mock_oauth_manager
         request.query_params = {}
         request.state = Mock()
         call_next = AsyncMock(return_value=Mock())
-        
+
         response = await middleware.dispatch(request, call_next)
-        
+
         assert isinstance(response, JSONResponse)
         assert response.status_code == 500
 
@@ -240,17 +241,17 @@ class TestGetOAuthTokenFromRequest:
     def test_with_oauth_token(self, oauth_token):
         request = Mock(spec=Request)
         request.state.oauth_token = oauth_token
-        
+
         token = get_oauth_token_from_request(request)
-        
+
         assert token == "test_access_token"
 
     def test_without_oauth_token(self):
         request = Mock(spec=Request)
         del request.state.oauth_token
-        
+
         token = get_oauth_token_from_request(request)
-        
+
         assert token is None
 
 class TestRequireAuth:
@@ -258,18 +259,18 @@ class TestRequireAuth:
     def test_with_valid_token(self, oauth_token):
         request = Mock(spec=Request)
         request.state.oauth_token = oauth_token
-        
+
         token = require_auth(request)
-        
+
         assert token == "test_access_token"
 
     def test_without_token_raises_401(self):
-        
+
         request = Mock(spec=Request)
         del request.state.oauth_token
-        
+
         with pytest.raises(HTTPException) as exc_info:
             require_auth(request)
-        
+
         assert exc_info.value.status_code == 401
         assert exc_info.value.detail == "Authentication required"
diff --git a/tests/app/mcp/oauth/test_oauth.py b/tests/app/mcp/oauth/test_oauth.py
index c1e9ec4d..097a777f 100644
--- a/tests/app/mcp/oauth/test_oauth.py
+++ b/tests/app/mcp/oauth/test_oauth.py
@@ -1,3 +1,4 @@
+import os
 import pytest
 from unittest.mock import AsyncMock, Mock, patch
 from datetime import datetime, timedelta
@@ -48,7 +49,7 @@ class TestOAuthProvider:
 
     def test_generate_authorization_url_without_state(self, oauth_provider):
         auth_url, state = oauth_provider.generate_authorization_url()
-        
+
         assert auth_url.startswith("https://example.com/auth?")
         assert "client_id=test_client_id" in auth_url
         assert "redirect_uri=http://localhost:8080/oauth/callback" in auth_url
@@ -57,34 +58,34 @@ def test_generate_authorization_url_without_state(self, oauth_provider):
         assert "state=" in auth_url
         assert len(state) == 43
         assert state in oauth_provider._state_store
- 
+
     def test_generate_authorization_url_with_state(self, oauth_provider):
         custom_state = "custom_state_123"
         _, state = oauth_provider.generate_authorization_url(state=custom_state)
-        
+
         assert state == custom_state
         assert custom_state in oauth_provider._state_store
 
     def test_verify_state_valid(self, oauth_provider):
         _, state = oauth_provider.generate_authorization_url()
-        
+
         result = oauth_provider.verify_state(state)
-        
+
         assert result is True
         assert state not in oauth_provider._state_store
 
     def test_verify_state_invalid_not_found(self, oauth_provider):
         result = oauth_provider.verify_state("nonexistent_state")
-        
+
         assert result is False
 
     def test_verify_state_expired(self, oauth_provider):
         _, state = oauth_provider.generate_authorization_url()
-        
+
         oauth_provider._state_store[state] = datetime.utcnow() - timedelta(minutes=10)
-        
+
         result = oauth_provider.verify_state(state)
-        
+
         assert result is False
         assert state not in oauth_provider._state_store
 
@@ -99,19 +100,19 @@ async def test_exchange_code_for_token(self, oauth_provider):
                 "refresh_token": "new_refresh_token",
                 "scope": "openid email"
             }
-            
+
             mock_client_instance = AsyncMock()
             mock_client_instance.post.return_value = mock_response
             mock_client.return_value.__aenter__.return_value = mock_client_instance
-            
+
             token = await oauth_provider.exchange_code_for_token("authorization_code")
-            
+
             assert token.access_token == "new_access_token"
             assert token.token_type == "Bearer"
             assert token.expires_in == 3600
             assert token.refresh_token == "new_refresh_token"
             assert token.scope == "openid email"
-    
+
     @pytest.mark.asyncio
     async def test_refresh_access_token(self, oauth_provider):
         with patch("app.mcp.oauth.oauth.httpx.AsyncClient") as mock_client:
@@ -122,16 +123,16 @@ async def test_refresh_access_token(self, oauth_provider):
                 "expires_in": 7200,
                 "refresh_token": "new_refresh_token",
             }
-            
+
             mock_client_instance = AsyncMock()
             mock_client_instance.post.return_value = mock_response
             mock_client.return_value.__aenter__.return_value = mock_client_instance
-            
+
             token = await oauth_provider.refresh_access_token("old_refresh_token")
-            
+
             assert token.access_token == "refreshed_access_token"
             assert token.expires_in == 7200
-    
+
     @pytest.mark.asyncio
     async def test_get_user_info(self, oauth_provider):
         with patch("app.mcp.oauth.oauth.httpx.AsyncClient") as mock_client:
@@ -141,13 +142,13 @@ async def test_get_user_info(self, oauth_provider):
                 "name": "Test User",
                 "email": "test@example.com"
             }
-            
+
             mock_client_instance = AsyncMock()
             mock_client_instance.get.return_value = mock_response
             mock_client.return_value.__aenter__.return_value = mock_client_instance
-            
+
             user_info = await oauth_provider.get_user_info("test_access_token")
-            
+
             assert user_info["sub"] == "1234567890"
             assert user_info["name"] == "Test User"
             assert user_info["email"] == "test@example.com"
@@ -161,7 +162,7 @@ def test_github_oauth_provider_initialization(self):
             "GITHUB_CLIENT_SECRET": "test_github_client_secret"
         }):
             provider = GitHubOAuthProvider(redirect_uri="http://localhost:8080/oauth/callback")
-            
+
             assert provider.config.client_id == "test_github_client_id"
             assert provider.config.client_secret == "test_github_client_secret"
             assert provider.config.authorization_url == "https://github.com/login/oauth/authorize"
@@ -177,14 +178,14 @@ async def test_get_user_info_with_email(self, oauth_token):
                 "name": "Test User",
                 "email": "test@example.com"
             }
-            
+
             mock_client_instance = AsyncMock()
             mock_client_instance.get.return_value = mock_user_response
             mock_client.return_value.__aenter__.return_value = mock_client_instance
-            
+
             provider = GitHubOAuthProvider(redirect_uri="http://localhost:8080/oauth/callback")
             user_info = await provider.get_user_info("test_access_token")
-            
+
             assert user_info["email"] == "test@example.com"
 
 
@@ -197,7 +198,7 @@ def test_google_oauth_provider_initialization(self):
             "GOOGLE_CLIENT_SECRET": "test_google_client_secret"
         }):
             provider = GoogleOAuthProvider(redirect_uri="http://localhost:8080/oauth/callback")
-            
+
             assert provider.config.client_id == "test_google_client_id"
             assert provider.config.client_secret == "test_google_client_secret"
             assert provider.config.authorization_url == "https://accounts.google.com/o/oauth2/v2/auth"
@@ -206,13 +207,11 @@ def test_google_oauth_provider_initialization(self):
 
     def test_google_oauth_provider_missing_credentials(self):
         with patch.dict("os.environ", {}, clear=True):
-            import os
-            # Ensure env vars are not set
             os.environ.pop("GOOGLE_CLIENT_ID", None)
             os.environ.pop("GOOGLE_CLIENT_SECRET", None)
-            
+
             provider = GoogleOAuthProvider(redirect_uri="http://localhost:8080/oauth/callback")
-            
+
             assert provider.config.client_id == ""
             assert provider.config.client_secret == ""
 
@@ -227,35 +226,35 @@ def test_get_provider_github(self, oauth_manager):
 
     def test_get_provider_google(self, oauth_manager):
         provider = oauth_manager.get_provider("google")
-        
+
         assert provider is not None
         assert isinstance(provider, GoogleOAuthProvider)
 
     def test_get_provider_unknown(self, oauth_manager):
         provider = oauth_manager.get_provider("unknown")
-        
+
         assert provider is None
 
     def test_store_and_get_token(self, oauth_manager, oauth_token):
         session_id = "test_session_123"
-        
+
         oauth_manager.store_token(session_id, oauth_token)
-        
+
         retrieved_token = oauth_manager.get_token(session_id)
         assert retrieved_token is not None
         assert retrieved_token.access_token == "test_access_token"
 
     def test_get_token_nonexistent(self, oauth_manager):
         token = oauth_manager.get_token("nonexistent_session")
-        
+
         assert token is None
 
     def test_remove_token(self, oauth_manager, oauth_token):
         session_id = "test_session_123"
-        
+
         oauth_manager.store_token(session_id, oauth_token)
         oauth_manager.remove_token(session_id)
-        
+
         token = oauth_manager.get_token(session_id)
         assert token is None
 
@@ -263,9 +262,9 @@ def test_remove_token(self, oauth_manager, oauth_token):
     async def test_get_valid_token(self, oauth_manager, oauth_token):
         session_id = "test_session_123"
         oauth_manager.store_token(session_id, oauth_token)
-        
+
         token = await oauth_manager.get_valid_token(session_id)
-        
+
         assert token is not None
         assert token.access_token == "test_access_token"
 
@@ -277,14 +276,14 @@ async def test_get_valid_token_expired_no_refresh(self, oauth_manager):
             expires_in=1,
             refresh_token=None
         )
-        
+
         object.__setattr__(expired_token, "created_at", datetime.utcnow() - timedelta(hours=2))
-        
+
         session_id = "test_session_123"
         oauth_manager.store_token(session_id, expired_token)
-        
+
         token = await oauth_manager.get_valid_token(session_id)
-        
+
         assert token is None
 
     @pytest.mark.asyncio
@@ -296,25 +295,39 @@ async def test_get_valid_token_expired_with_refresh(self, oauth_manager, oauth_t
             refresh_token="refresh_token"
         )
         object.__setattr__(expired_token, "created_at", datetime.utcnow() - timedelta(hours=2))
-        
+
         session_id = "test_session_123"
         oauth_manager.store_token(session_id, expired_token)
-        
+
         for provider in oauth_manager.providers.values():
             provider.refresh_access_token = AsyncMock(return_value=oauth_token)
-        
+
         token = await oauth_manager.get_valid_token(session_id)
-        
+
         assert token is not None
         assert token.access_token == "test_access_token"
 
     def test_create_oauth_routes(self, oauth_manager):
         routes = oauth_manager.create_oauth_routes()
-        
-        assert len(routes) == 5
+
+        assert len(routes) == 21  # NOTE(review): 21 routes vs 15 distinct paths below; presumably some paths register several routes - confirm
         route_paths = [route.path for route in routes]
-        assert "/oauth/login" in route_paths
-        assert "/oauth/authorize/{provider}" in route_paths
-        assert "/oauth/callback/{provider}" in route_paths
-        assert "/oauth/status" in route_paths
-        assert "/oauth/logout" in route_paths
+        expected_paths = {  # every path that must be present among the created routes
+            "/.well-known",
+            "/.well-known/",
+            "/.well-known/oauth-authorization-server",
+            "/.well-known/oauth-protected-resource",
+            "/.well-known/oauth-protected-resource/sse",
+            "/.well-known/openid-configuration",
+            "/register",
+            "/oauth/register",
+            "/authorize",
+            "/oauth/login",
+            "/oauth/authorize",
+            "/oauth/authorize/{provider}",
+            "/oauth/callback/{provider}",
+            "/oauth/status",
+            "/oauth/logout",
+        }
+        for path in expected_paths:  # set iteration order is arbitrary; the first missing path fails the test
+            assert path in route_paths
diff --git a/tests/app/model_services/test_huggingface_llm_model.py b/tests/app/model_services/test_huggingface_llm_model.py
index 3e61d1f5..ce8ed2ac 100644
--- a/tests/app/model_services/test_huggingface_llm_model.py
+++ b/tests/app/model_services/test_huggingface_llm_model.py
@@ -1,11 +1,23 @@
 import os
 import pytest
+import torch
+from concurrent.futures import Future
 from unittest.mock import MagicMock, patch
 from tests.app.conftest import MODEL_PARENT_DIR
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
 from app import __version__
-from app.domain import ModelType
+from app.domain import ModelType, GenerationResult
 from app.model_services.huggingface_llm_model import HuggingFaceLlmModel, TimeoutCriteria
+from app.exception import GenerationException
+
+
+class _TokenBatch:  # minimal test double exposing input_ids / attention_mask / to(), built from real tensors
+    def __init__(self, length: int = 2):
+        self.input_ids = torch.tensor([[0] * length])  # a single row holding `length` zero ids
+        self.attention_mask = torch.ones((1, length), dtype=torch.long)  # all-ones mask of shape (1, length)
+
+    def to(self, _device):  # the device argument is ignored; the same object is returned
+        return self
 
 
 def test_model_name(huggingface_llm_model):
@@ -29,6 +41,18 @@ def test_init_model(huggingface_llm_model):
     assert huggingface_llm_model.tokenizer is not None
 
 
+def test_init_model_sets_override_chat_template(huggingface_llm_model):  # a non-empty override must reach the tokenizer
+    huggingface_llm_model._config.OVERRIDE_CHAT_TEMPLATE = "{{ custom_template }}"  # set before init_model() runs
+    huggingface_llm_model.init_model()
+    assert huggingface_llm_model.tokenizer.chat_template == "{{ custom_template }}"  # tokenizer carries the override verbatim
+
+
+def test_init_model_skips_override_chat_template(huggingface_llm_model):  # an empty override must not blank the template
+    huggingface_llm_model._config.OVERRIDE_CHAT_TEMPLATE = ""  # empty string stands for "no override" in this test
+    huggingface_llm_model.init_model()
+    assert huggingface_llm_model.tokenizer.chat_template != ""  # NOTE(review): also passes when chat_template is None
+
+
 def test_load_model(huggingface_llm_model):
     model, tokenizer = HuggingFaceLlmModel.load_model(os.path.join(MODEL_PARENT_DIR, "huggingface_llm_model.tar.gz"))
     assert isinstance(model, PreTrainedModel)
@@ -52,13 +76,14 @@ def test_generate(huggingface_llm_model, ensure_full_sentences, expected_output)
     huggingface_llm_model._micro_batch_scheduler._batch_wait_milliseconds = 1
     huggingface_llm_model.model = MagicMock()
     huggingface_llm_model.tokenizer = MagicMock()
+    huggingface_llm_model._assistant_model = MagicMock()
+    huggingface_llm_model._assistant_tokenizer = MagicMock()
     mock_send_metrics = MagicMock()
-    inputs = MagicMock()
-    inputs.input_ids = MagicMock(shape=[1, 2])
-    inputs.attention_mask = MagicMock()
-    inputs.attention_mask.sum.return_value.tolist.return_value = [2]
+    inputs = _TokenBatch(length=2)
     huggingface_llm_model.tokenizer.return_value = inputs
     huggingface_llm_model.tokenizer.pad_token_id = 2
+    huggingface_llm_model.tokenizer.vocab_size = 2
+    huggingface_llm_model._assistant_tokenizer.vocab_size = 2
     outputs = [MagicMock(shape=[2])]
     huggingface_llm_model.model.generate.return_value = outputs
     completion_ids = MagicMock()
@@ -80,15 +105,15 @@ def test_generate(huggingface_llm_model, ensure_full_sentences, expected_output)
     )
 
     huggingface_llm_model.tokenizer.assert_any_call(
-        ["chat template text"],
+        ["Alright?"],
         add_special_tokens=False,
         return_tensors="pt",
         padding=True,
     )
     huggingface_llm_model.model.generate.assert_called_once()
     call_kwargs = huggingface_llm_model.model.generate.call_args.kwargs
-    assert call_kwargs["inputs"] == inputs.input_ids
-    assert call_kwargs["attention_mask"] == inputs.attention_mask
+    assert torch.equal(call_kwargs["inputs"], inputs.input_ids)
+    assert torch.equal(call_kwargs["attention_mask"], inputs.attention_mask)
     assert call_kwargs["min_new_tokens"] == 50
     assert call_kwargs["max_new_tokens"] == 128
     assert call_kwargs["use_cache"] is True
@@ -99,17 +124,64 @@ def test_generate(huggingface_llm_model, ensure_full_sentences, expected_output)
     assert call_kwargs["repetition_penalty"] == 1.2
     assert call_kwargs["no_repeat_ngram_size"] == 3
     assert call_kwargs["pad_token_id"] == 2
+    assert call_kwargs["assistant_model"] == huggingface_llm_model._assistant_model
+    assert call_kwargs["assistant_confidence_threshold"] == 0.4
+    assert call_kwargs["num_assistant_tokens"] == 5
     assert "stopping_criteria" in call_kwargs
     huggingface_llm_model.tokenizer.decode.assert_called_once_with(
         outputs[0][2:],
         skip_special_tokens=True,
     )
-    mock_send_metrics.assert_called_once_with(
-        prompt_token_num=2,
-        completion_token_num=2,
+    mock_send_metrics.assert_called_once()
+    metric_kwargs = mock_send_metrics.call_args.kwargs
+    assert metric_kwargs["prompt_token_num"] == 2
+    assert metric_kwargs["completion_token_num"] == 2
+    assert metric_kwargs["ttft_milliseconds"] >= -1
+    assert metric_kwargs["tpot_milliseconds"] >= -1
+    assert result.text == expected_output
+    assert "[STOP]" not in result.text
+
+
+def test_generate_with_structured_output(huggingface_llm_model):  # generate() with a json_schema_parser and a stubbed scheduler
+    huggingface_llm_model.init_model()
+    model = MagicMock()
+    model.generate.return_value = GenerationResult(
+        text="Yeah.",
+        prompt_token_num=1,
+        completion_token_num=1,
     )
-    assert result == expected_output
-    assert "[STOP]" not in result
+    huggingface_llm_model.model = model
+    captured = {}  # filled by _submit with a copy of the request passed to the scheduler
+    json_schema_parser = MagicMock()
+    huggingface_llm_model._get_schema_hash = MagicMock(return_value="schema_hash")
+    prefix_fn = MagicMock()
+
+    def _submit(request):  # stands in for the scheduler's submit(); returns an already-resolved Future
+        captured.update(request)
+        future = Future()
+        request["future"] = future
+        with patch.object(
+            huggingface_llm_model,
+            "_build_transformers_prefix_allowed_tokens_fn",
+            return_value=prefix_fn,
+        ):
+            model.generate(prefix_allowed_tokens_fn=prefix_fn)  # NOTE(review): this call is made by the stub itself
+        future.set_result(model.generate.return_value)
+        return future
+
+    huggingface_llm_model._micro_batch_scheduler.submit = _submit
+
+    result = huggingface_llm_model.generate(
+        prompt="This is a test prompt",
+        min_tokens=1,
+        max_tokens=2,
+        json_schema_parser=json_schema_parser,
+    )
+
+    assert result.text == "Yeah."
+    assert captured["json_schema_parser"] == json_schema_parser
+    assert captured["batch_key"][-2] == "schema_hash"  # expects the stubbed schema hash at batch_key[-2]
+    assert model.generate.call_args.kwargs["prefix_allowed_tokens_fn"] == prefix_fn  # NOTE(review): only verifies the call made inside _submit
 
 
 @pytest.mark.parametrize("ensure_full_sentences, stream_chunks, stop_sequences, expected_output, report_called", [
@@ -128,19 +200,19 @@ async def test_generate_async(
     huggingface_llm_model.init_model()
     huggingface_llm_model.model = MagicMock()
     huggingface_llm_model.tokenizer = MagicMock()
+    huggingface_llm_model._assistant_model = MagicMock()
+    huggingface_llm_model._assistant_tokenizer = MagicMock()
     mock_send_metrics = MagicMock()
-    inputs = MagicMock()
-    inputs.input_ids = MagicMock(shape=[1, 2])
-    inputs.attention_mask = MagicMock()
+    inputs = _TokenBatch(length=2)
     
     def mock_tokenizer_call(*args, **kwargs):
         if args and args[0] == "Alright?":
             return inputs
-        mock_result = MagicMock()
-        mock_result.input_ids = MagicMock(shape=[1, 2])
-        return mock_result
+        return _TokenBatch(length=2)
     
     huggingface_llm_model.tokenizer.side_effect = mock_tokenizer_call
+    huggingface_llm_model.tokenizer.vocab_size = 2
+    huggingface_llm_model._assistant_tokenizer.vocab_size = 2
     streamer = FakeAsyncTextIteratorStreamer(stream_chunks)
     
     with patch("app.model_services.huggingface_llm_model.AsyncTextIteratorStreamer", return_value=streamer):
@@ -160,16 +232,22 @@ def mock_tokenizer_call(*args, **kwargs):
             report_tokens=mock_send_metrics,
             ensure_full_sentences=ensure_full_sentences,
         ):
-            results.append(chunk)
+            if isinstance(chunk, str):
+                results.append(chunk)
         result = "".join(results)
         submit_kwargs = huggingface_llm_model._text_generator.submit.call_args.kwargs
         assert "stopping_criteria" in submit_kwargs
+        assert submit_kwargs["assistant_model"] == huggingface_llm_model._assistant_model
+        assert submit_kwargs["assistant_confidence_threshold"] == 0.4
+        assert submit_kwargs["num_assistant_tokens"] == 5
 
     if report_called:
-        mock_send_metrics.assert_called_once_with(
-            prompt_token_num=2,
-            completion_token_num=2,
-        )
+        mock_send_metrics.assert_called_once()
+        metric_kwargs = mock_send_metrics.call_args.kwargs
+        assert metric_kwargs["prompt_token_num"] == 2
+        assert metric_kwargs["completion_token_num"] == 2
+        assert metric_kwargs["ttft_milliseconds"] >= -1
+        assert metric_kwargs["tpot_milliseconds"] >= -1
     else:
         mock_send_metrics.assert_not_called()
     assert result == expected_output
@@ -183,16 +261,12 @@ async def test_generate_async_with_timeout(huggingface_llm_model):
     huggingface_llm_model._generation_timeout_secs = 2
     huggingface_llm_model.model = MagicMock()
     huggingface_llm_model.tokenizer = MagicMock()
-    inputs = MagicMock()
-    inputs.input_ids = MagicMock(shape=[1, 2])
-    inputs.attention_mask = MagicMock()
+    inputs = _TokenBatch(length=2)
 
     def _mock_tokenizer_call(*args, **kwargs):
         if args and args[0] == "Alright?":
             return inputs
-        mock_result = MagicMock()
-        mock_result.input_ids = MagicMock(shape=[1, 2])
-        return mock_result
+        return _TokenBatch(length=2)
 
     huggingface_llm_model.tokenizer.side_effect = _mock_tokenizer_call
     streamer = FakeAsyncTextIteratorStreamer(["OK"])
@@ -203,7 +277,8 @@ def _mock_tokenizer_call(*args, **kwargs):
         huggingface_llm_model._text_generator.submit = MagicMock(return_value=MagicMock())
         results = []
         async for chunk in huggingface_llm_model.generate_async(prompt="Alright?"):
-            results.append(chunk)
+            if isinstance(chunk, str):
+                results.append(chunk)
 
         submit_kwargs = huggingface_llm_model._text_generator.submit.call_args.kwargs
         mock_streamer.assert_called_once()
@@ -214,6 +289,26 @@ def _mock_tokenizer_call(*args, **kwargs):
         assert isinstance(submit_kwargs["stopping_criteria"][0], TimeoutCriteria)
 
 
+@pytest.mark.asyncio
+async def test_generate_async_with_generation_exception(huggingface_llm_model):
+    huggingface_llm_model.init_model()
+    huggingface_llm_model.model = MagicMock()
+    huggingface_llm_model.tokenizer = MagicMock()
+    inputs = _TokenBatch(length=2)
+
+    def _mock_tokenizer_call(*args, **kwargs):
+        if args and args[0] == "Alright?":
+            return inputs
+        return _TokenBatch(length=2)
+
+    huggingface_llm_model.tokenizer.side_effect = _mock_tokenizer_call
+    huggingface_llm_model._text_generator.submit = MagicMock(side_effect=RuntimeError("submit failed"))
+
+    with pytest.raises(GenerationException):
+        async for _ in huggingface_llm_model.generate_async(prompt="Alright?"):
+            pass
+
+
 @patch("torch.nn.functional.normalize")
 @patch("torch.mean")
 @patch("torch.cat")
diff --git a/tests/app/model_services/test_huggingface_ner_model.py b/tests/app/model_services/test_huggingface_ner_model.py
index 44779da0..ec3cbd1f 100644
--- a/tests/app/model_services/test_huggingface_ner_model.py
+++ b/tests/app/model_services/test_huggingface_ner_model.py
@@ -1,8 +1,6 @@
 import os
 import tempfile
 from unittest.mock import Mock
-import pandas as pd
-import pytest
 from tests.app.conftest import MODEL_PARENT_DIR
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
 from app import __version__
@@ -46,6 +44,7 @@ def test_info(huggingface_ner_model):
 
 
 def test_annotate(huggingface_ner_model):
+    huggingface_ner_model._config.TRAINING_HF_NER_TAGGING_SCHEME = "flat"
     huggingface_ner_model._confidence_threshold = 0.01
     annotations = huggingface_ner_model.annotate(
         """The patient is a 60-year-old female, who complained of coughing during meals. """
@@ -81,6 +80,36 @@ def test_annotate(huggingface_ner_model):
     assert len(annotations[0].text) > 0
 
 
+def test_annotate_with_confidence_threshold(huggingface_ner_model):
+    huggingface_ner_model._config.CONFIDENCE_SCORE_THRESHOLD = 1.1
+    annotations = huggingface_ner_model.annotate("This is a test.")
+    assert len(annotations) == 0
+
+
+def test_batch_annotate(huggingface_ner_model):
+    huggingface_ner_model._config.TRAINING_HF_NER_TAGGING_SCHEME = "iob"
+    huggingface_ner_model._config.HF_NER_APPLY_VITERBI_DECODING = "false"
+    huggingface_ner_model._viterbi_decoder = None
+    huggingface_ner_model._config.INCLUDE_SPAN_TEXT = "false"
+    huggingface_ner_model._confidence_threshold = 0.5
+    huggingface_ner_model._ner_pipeline = Mock(return_value=[
+        [
+            {"entity": "B-LABEL", "score": 0.9, "index": 0, "start": 0, "end": 5},
+        ],
+        [
+            {"entity": "I-LABEL", "score": 0.8, "index": 1, "start": 4, "end": 7},
+        ],
+    ])
+    texts = ["hello world", "foo bar"]
+    annotations_batch = huggingface_ner_model.batch_annotate(texts)
+    assert len(annotations_batch) == 2
+    assert len(annotations_batch[0]) == 1
+    assert annotations_batch[0][0].label_name == "Label"
+    assert annotations_batch[0][0].start == 0
+    assert annotations_batch[0][0].end == 5
+    assert len(annotations_batch[1]) == 1
+
+
 def test_train_unsupervised(huggingface_ner_model):
     huggingface_ner_model.init_model()
     huggingface_ner_model._config.REDEPLOY_TRAINED_MODEL = "false"
diff --git a/tests/app/model_services/test_medcat_model_deid.py b/tests/app/model_services/test_medcat_model_deid.py
index 8c16d90a..321b2ffc 100644
--- a/tests/app/model_services/test_medcat_model_deid.py
+++ b/tests/app/model_services/test_medcat_model_deid.py
@@ -53,6 +53,22 @@ def test_info(medcat_deid_model):
     assert model_card.model_type == ModelType.ANONCAT
 
 
+@pytest.mark.skipif(
+    not os.path.exists(os.path.join(MODEL_PARENT_DIR, "deid_model.zip")),
+    reason="requires the model file to be present in the resources folder",
+)
+def test_annotate_with_confidence_threshold(medcat_deid_model):
+    medcat_deid_model.init_model()
+    original_threshold = medcat_deid_model._config.CONFIDENCE_SCORE_THRESHOLD
+    medcat_deid_model._config.CONFIDENCE_SCORE_THRESHOLD = 1.1
+    try:
+        annotations = medcat_deid_model.annotate("NW1 2DA")
+        assert len(annotations) == 0
+    finally:
+        # Restore the threshold even if the assertion fails, in case the fixture is reused by later tests.
+        medcat_deid_model._config.CONFIDENCE_SCORE_THRESHOLD = original_threshold
+
+
 @pytest.mark.skipif(
     not os.path.exists(os.path.join(MODEL_PARENT_DIR, "deid_model.zip")),
     reason="requires the model file to be present in the resources folder",
diff --git a/tests/app/model_services/test_medcat_model_icd10.py b/tests/app/model_services/test_medcat_model_icd10.py
index c019c455..12d07c5b 100644
--- a/tests/app/model_services/test_medcat_model_icd10.py
+++ b/tests/app/model_services/test_medcat_model_icd10.py
@@ -153,4 +153,4 @@ def test_create_embeddings(medcat_umls_model):
     for emb in embeddings:
         assert isinstance(emb, list)
         assert len(emb) > 0
-        assert all(isinstance(x, float) for x in emb)
+        assert all(isinstance(x, float) for x in emb)
\ No newline at end of file
diff --git a/tests/app/model_services/test_medcat_model_snomed.py b/tests/app/model_services/test_medcat_model_snomed.py
index b4a4ae7f..38012d14 100644
--- a/tests/app/model_services/test_medcat_model_snomed.py
+++ b/tests/app/model_services/test_medcat_model_snomed.py
@@ -90,6 +90,22 @@ def test_info(medcat_snomed_model):
     assert model_card.model_type == ModelType.MEDCAT_SNOMED
 
 
+@pytest.mark.skipif(
+    not os.path.exists(os.path.join(MODEL_PARENT_DIR, "snomed_model.zip")),
+    reason="requires the model file to be present in the resources folder",
+)
+def test_annotate_with_confidence_threshold(medcat_snomed_model):
+    medcat_snomed_model.init_model()
+    original_threshold = medcat_snomed_model._config.CONFIDENCE_SCORE_THRESHOLD
+    medcat_snomed_model._config.CONFIDENCE_SCORE_THRESHOLD = 1.1
+    try:
+        annotations = medcat_snomed_model.annotate("Spinal stenosis")
+        assert len(annotations) == 0
+    finally:
+        # Restore the threshold even if the assertion fails, in case the fixture is reused by later tests.
+        medcat_snomed_model._config.CONFIDENCE_SCORE_THRESHOLD = original_threshold
+
+
 @pytest.mark.skipif(
     not os.path.exists(os.path.join(MODEL_PARENT_DIR, "snomed_model.zip")),
     reason="requires the model file to be present in the resources folder",
diff --git a/tests/app/monitoring/test_tracker_client.py b/tests/app/monitoring/test_tracker_client.py
index 1029a0ff..d4a164a6 100644
--- a/tests/app/monitoring/test_tracker_client.py
+++ b/tests/app/monitoring/test_tracker_client.py
@@ -25,13 +25,12 @@ def test_start_new(mlflow_fixture):
 
     mlflow.get_experiment_by_name.assert_called_once_with("model_name_training_type")
     mlflow.create_experiment.assert_called_once_with(name="model_name_training_type")
-    mlflow.start_run.assert_called_once_with(experiment_id="experiment_id", tags=ANY)
+    mlflow.start_run.assert_called_once_with(experiment_id="experiment_id", run_name="run_name", tags=ANY)
     mlflow.log_params.assert_called_once_with({"param": "param"})
     _, kwargs = mlflow.start_run.call_args
     assert experiment_id == "experiment_id"
     assert run_id == "run_id"
     assert "mlflow.source.name" in kwargs["tags"]
-    assert "mlflow.runName" in kwargs["tags"]
     assert "mlflow.note.content" in kwargs["tags"]
     assert "training.input_data.filename" in kwargs["tags"]
     assert "training.base_model.origin" in kwargs["tags"]
@@ -304,6 +303,13 @@ def test_log_document_size(mlflow_fixture):
 
     mlflow.set_tag.assert_called_once_with("training.document.size", "10")
 
+def test_log_training_token_count(mlflow_fixture):
+    tracker_client = TrackerClient("")
+
+    tracker_client.log_training_token_count(1000)
+
+    mlflow.set_tag.assert_called_once_with("training.token.count", "1000")
+
 
 def test_log_model_config(mlflow_fixture):
     tracker_client = TrackerClient("")
diff --git a/tests/app/processors/test_lora_adaptor.py b/tests/app/processors/test_lora_adaptor.py
new file mode 100644
index 00000000..2fa0af2d
--- /dev/null
+++ b/tests/app/processors/test_lora_adaptor.py
@@ -0,0 +1,168 @@
+import pytest
+from unittest.mock import Mock, patch
+from app.exception import ManagedModelException
+from app.processors.lora_adaptor import LoraAdaptor
+
+
+@patch("app.processors.lora_adaptor.get_peft_model")
+@patch("app.processors.lora_adaptor.LoraConfig")
+def test_apply_uses_explicit_target_modules(mock_lora_config, mock_get_peft_model):
+    model = Mock()
+    peft_model = Mock()
+    lora_config = Mock()
+    mock_lora_config.return_value = lora_config
+    mock_get_peft_model.return_value = peft_model
+
+    result_model, result_config = LoraAdaptor.apply(
+        model=model,
+        task_type="TOKEN_CLS",
+        target_modules=["q_proj", "k_proj"],
+        r=16,
+        lora_alpha=64,
+        lora_dropout=0.2,
+    )
+
+    assert result_model is peft_model
+    assert result_config is lora_config
+    mock_lora_config.assert_called_once_with(
+        task_type="TOKEN_CLS",
+        r=16,
+        lora_alpha=64,
+        lora_dropout=0.2,
+        target_modules=["q_proj", "k_proj"],
+    )
+    mock_get_peft_model.assert_called_once_with(model, lora_config)
+
+
+@patch(
+    "app.processors.lora_adaptor.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING",
+    {"bert": ["query", "key", "value"]},
+)
+@patch("app.processors.lora_adaptor.get_peft_model")
+@patch("app.processors.lora_adaptor.LoraConfig")
+def test_apply_uses_peft_mapping_when_target_modules_omitted(mock_lora_config, mock_get_peft_model):
+    model = Mock()
+    model.config.model_type = "bert"
+    peft_model = Mock()
+    lora_config = Mock()
+    mock_lora_config.return_value = lora_config
+    mock_get_peft_model.return_value = peft_model
+
+    result_model, result_config = LoraAdaptor.apply(
+        model=model,
+        task_type="TOKEN_CLS",
+    )
+
+    assert result_model is peft_model
+    assert result_config is lora_config
+    mock_lora_config.assert_called_once_with(
+        task_type="TOKEN_CLS",
+        r=8,
+        lora_alpha=32,
+        lora_dropout=0.1,
+        target_modules=["query", "key", "value"],
+    )
+
+
+@patch(
+    "app.processors.lora_adaptor.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING",
+    {"bert": ["query", "key", "value"]},
+)
+@patch("app.processors.lora_adaptor.get_peft_model")
+@patch("app.processors.lora_adaptor.LoraConfig")
+def test_apply_falls_back_to_detected_target_modules_when_mapping_missing(mock_lora_config, mock_get_peft_model):
+    class _LeafModule:
+        def children(self):
+            return iter(())
+
+    class _DummyModel:
+        config = Mock(model_type="unknown")
+
+        def named_modules(self):
+            return iter([
+                ("", Mock()),
+                ("encoder.layer.0.attention.q_proj", _LeafModule()),
+                ("encoder.layer.0.attention.k_proj", _LeafModule()),
+                ("encoder.layer.0.attention.v_proj", _LeafModule()),
+            ])
+
+    model = _DummyModel()
+    peft_model = Mock()
+    lora_config = Mock()
+    mock_lora_config.return_value = lora_config
+    mock_get_peft_model.return_value = peft_model
+
+    result_model, result_config = LoraAdaptor.apply(
+        model=model,  # type: ignore[arg-type]
+        task_type="TOKEN_CLS",
+    )
+
+    assert result_model is peft_model
+    assert result_config is lora_config
+    mock_lora_config.assert_called_once_with(
+        task_type="TOKEN_CLS",
+        r=8,
+        lora_alpha=32,
+        lora_dropout=0.1,
+        target_modules=["q_proj", "k_proj", "v_proj"],
+    )
+
+
+@patch(
+    "app.processors.lora_adaptor.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING",
+    {"bert": ["query", "key", "value"]},
+)
+@patch("app.processors.lora_adaptor.get_peft_model")
+@patch("app.processors.lora_adaptor.LoraConfig")
+def test_apply_with_detected_target_modules_when_peft_rejects_configured_modules(mock_lora_config, mock_get_peft_model):
+    class _LeafModule:
+        def children(self):
+            return iter(())
+
+    class _DummyModel:
+        config = Mock(model_type="bert")
+
+        def named_modules(self):
+            return iter([
+                ("", Mock()),
+                ("encoder.layer.0.attention.q_proj", _LeafModule()),
+                ("encoder.layer.0.attention.k_proj", _LeafModule()),
+                ("encoder.layer.0.attention.v_proj", _LeafModule()),
+            ])
+
+    model = _DummyModel()
+    peft_model = Mock()
+    initial_lora_config = Mock()
+    fallback_lora_config = Mock()
+    mock_lora_config.side_effect = [initial_lora_config, fallback_lora_config]
+    mock_get_peft_model.side_effect = [
+        ValueError("Target modules {'value', 'key', 'query'} not found in the base model."),
+        peft_model,
+    ]
+
+    result_model, result_config = LoraAdaptor.apply(
+        model=model,  # type: ignore[arg-type]
+        task_type="TOKEN_CLS",
+    )
+
+    assert result_model is peft_model
+    assert result_config is fallback_lora_config
+    assert mock_lora_config.call_count == 2
+    assert mock_get_peft_model.call_count == 2
+    fallback_call_kwargs = mock_lora_config.call_args_list[1].kwargs
+    assert fallback_call_kwargs["target_modules"] == ["q_proj", "k_proj", "v_proj"]
+
+
+@patch("app.processors.lora_adaptor.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING", {})
+def test_apply_raises_when_no_target_modules_can_be_resolved():
+    model = Mock()
+    model.config.model_type = "unknown"
+    model.named_modules.return_value = iter([])
+
+    with pytest.raises(ManagedModelException) as exc_info:
+        LoraAdaptor.apply(
+            model=model,
+            task_type="TOKEN_CLS",
+        )
+
+    assert "Could not determine LoRA target modules" in str(exc_info.value)
diff --git a/tests/app/processors/test_metrics_collector.py b/tests/app/processors/test_metrics_collector.py
index 6e532758..cbcb75a0 100644
--- a/tests/app/processors/test_metrics_collector.py
+++ b/tests/app/processors/test_metrics_collector.py
@@ -17,6 +17,12 @@
 from app.domain import Annotation
 from app.utils import load_pydantic_object_from_dict
 
+EXPECTED_SANITY_CHECK_CONCEPTS = {
+    "C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860",
+    "C0017168", "C0020473", "C0020538", "C0027051", "C0037284", "C0038454",
+    "C0042029", "C0155626", "C0878544",
+}
+
 
 @pytest.fixture
 def model_service():
@@ -51,10 +57,10 @@ def test_sanity_check_model_with_trainer_export_path(model_service):
     assert precision == 0.5
     assert recall == 0.07142857142857142
     assert f1 == 0.125
-    assert set(per_cui_prec.keys()) == {"C0017168", "C0020538"}
-    assert set(per_cui_rec.keys()) == {"C0017168", "C0020538"}
-    assert set(per_cui_f1.keys()) == {"C0017168", "C0020538"}
-    assert set(per_cui_name.keys()) == {"C0017168", "C0020538"}
+    assert set(per_cui_prec.keys()) == EXPECTED_SANITY_CHECK_CONCEPTS
+    assert set(per_cui_rec.keys()) == EXPECTED_SANITY_CHECK_CONCEPTS
+    assert set(per_cui_f1.keys()) == EXPECTED_SANITY_CHECK_CONCEPTS
+    assert set(per_cui_name.keys()) == EXPECTED_SANITY_CHECK_CONCEPTS
     assert per_cui_anchors is None
 
 
@@ -82,12 +88,17 @@ def test_evaluate_model_and_return_dataframe(model_service):
     path = os.path.join(os.path.join(os.path.dirname(__file__), "..", "..", "resources"), "fixture", "trainer_export.json")
 
     result = sanity_check_model_with_trainer_export(path, model_service, return_df=True)
-
-    assert set(result["concept"].to_list()) == {"C0020538", "C0017168"}
-    assert set(result["name"].to_list()) == {"gastroesophageal reflux", "hypertension"}
-    assert set(result["precision"].to_list()) == {0.5, 0.5}
-    assert set(result["recall"].to_list()) == {0.25, 1.0}
-    assert set(result["f1"].to_list()) == {0.3333333333333333, 0.6666666666666666}
+    result_by_concept = result.set_index("concept")
+
+    assert set(result["concept"].to_list()) == EXPECTED_SANITY_CHECK_CONCEPTS
+    assert result_by_concept.loc["C0017168", "name"] == "gastroesophageal reflux"
+    assert result_by_concept.loc["C0020538", "name"] == "hypertension"
+    assert result_by_concept.loc["C0017168", "precision"] == 0.5
+    assert result_by_concept.loc["C0017168", "recall"] == 1.0
+    assert result_by_concept.loc["C0017168", "f1"] == 0.6666666666666666
+    assert result_by_concept.loc["C0020538", "precision"] == 0.5
+    assert result_by_concept.loc["C0020538", "recall"] == 0.25
+    assert result_by_concept.loc["C0020538", "f1"] == 0.3333333333333333
     assert "anchors" not in result
 
 
@@ -117,12 +128,17 @@ def test_sanity_check_model_with_trainer_export_file(model_service):
 
     with open(path, "r") as file:
         result = sanity_check_model_with_trainer_export(file, model_service, return_df=True)
-
-        assert set(result["concept"].to_list()) == {"C0020538", "C0017168"}
-        assert set(result["name"].to_list()) == {"gastroesophageal reflux", "hypertension"}
-        assert set(result["precision"].to_list()) == {0.5, 0.5}
-        assert set(result["recall"].to_list()) == {0.25, 1.0}
-        assert set(result["f1"].to_list()) == {0.3333333333333333, 0.6666666666666666}
+        result_by_concept = result.set_index("concept")
+
+        assert set(result["concept"].to_list()) == EXPECTED_SANITY_CHECK_CONCEPTS
+        assert result_by_concept.loc["C0017168", "name"] == "gastroesophageal reflux"
+        assert result_by_concept.loc["C0020538", "name"] == "hypertension"
+        assert result_by_concept.loc["C0017168", "precision"] == 0.5
+        assert result_by_concept.loc["C0017168", "recall"] == 1.0
+        assert result_by_concept.loc["C0017168", "f1"] == 0.6666666666666666
+        assert result_by_concept.loc["C0020538", "precision"] == 0.5
+        assert result_by_concept.loc["C0020538", "recall"] == 0.25
+        assert result_by_concept.loc["C0020538", "f1"] == 0.3333333333333333
         assert "anchors" not in result
 
 
@@ -152,12 +168,17 @@ def test_sanity_check_model_with_trainer_export_dict(model_service):
 
     with open(path, "r") as file:
         result = sanity_check_model_with_trainer_export(json.load(file), model_service, return_df=True)
-
-        assert set(result["concept"].to_list()) == {"C0020538", "C0017168"}
-        assert set(result["name"].to_list()) == {"gastroesophageal reflux", "hypertension"}
-        assert set(result["precision"].to_list()) == {0.5, 0.5}
-        assert set(result["recall"].to_list()) == {0.25, 1.0}
-        assert set(result["f1"].to_list()) == {0.3333333333333333, 0.6666666666666666}
+        result_by_concept = result.set_index("concept")
+
+        assert set(result["concept"].to_list()) == EXPECTED_SANITY_CHECK_CONCEPTS
+        assert result_by_concept.loc["C0017168", "name"] == "gastroesophageal reflux"
+        assert result_by_concept.loc["C0020538", "name"] == "hypertension"
+        assert result_by_concept.loc["C0017168", "precision"] == 0.5
+        assert result_by_concept.loc["C0017168", "recall"] == 1.0
+        assert result_by_concept.loc["C0017168", "f1"] == 0.6666666666666666
+        assert result_by_concept.loc["C0020538", "precision"] == 0.5
+        assert result_by_concept.loc["C0020538", "recall"] == 0.25
+        assert result_by_concept.loc["C0020538", "f1"] == 0.3333333333333333
         assert "anchors" not in result
 
 
@@ -186,13 +207,19 @@ def test_evaluate_model_and_include_anchors(model_service):
     path = os.path.join(os.path.join(os.path.dirname(__file__), "..", "..", "resources"), "fixture", "trainer_export.json")
 
     result = sanity_check_model_with_trainer_export(path, model_service, return_df=True, include_anchors=True)
+    result_by_concept = result.set_index("concept")
 
-    assert set(result["concept"].to_list()) == {"C0020538", "C0017168"}
-    assert set(result["name"].to_list()) == {"gastroesophageal reflux", "hypertension"}
-    assert set(result["precision"].to_list()) == {0.5, 0.5}
-    assert set(result["recall"].to_list()) == {0.25, 1.0}
-    assert set(result["f1"].to_list()) == {0.3333333333333333, 0.6666666666666666}
-    assert set(result["anchors"].to_list()) == {"P14/D3204/S255/E267;P14/D3205/S255/E267", "P14/D3204/S332/E355;P14/D3205/S332/E355"}
+    assert set(result["concept"].to_list()) == EXPECTED_SANITY_CHECK_CONCEPTS
+    assert result_by_concept.loc["C0017168", "name"] == "gastroesophageal reflux"
+    assert result_by_concept.loc["C0020538", "name"] == "hypertension"
+    assert result_by_concept.loc["C0017168", "precision"] == 0.5
+    assert result_by_concept.loc["C0017168", "recall"] == 1.0
+    assert result_by_concept.loc["C0017168", "f1"] == 0.6666666666666666
+    assert result_by_concept.loc["C0020538", "precision"] == 0.5
+    assert result_by_concept.loc["C0020538", "recall"] == 0.25
+    assert result_by_concept.loc["C0020538", "f1"] == 0.3333333333333333
+    assert result_by_concept.loc["C0020538", "anchors"] == "P14/D3204/S255/E267;P14/D3205/S255/E267"
+    assert result_by_concept.loc["C0017168", "anchors"] == "P14/D3204/S332/E355;P14/D3205/S332/E355"
 
 
 def test_concat_trainer_exports():
@@ -276,10 +303,26 @@ def test_get_stats_from_trainer_export_as_dataframe():
 def test_get_iaa_scores_per_concept():
     path = os.path.join(os.path.join(os.path.dirname(__file__), "..", "..", "resources"), "fixture", "trainer_export_multi_projs.json")
     per_cui_anno_iia_pct, per_cui_anno_cohens_kappa, per_cui_metaanno_iia_pct, per_cui_metaanno_cohens_kappa = get_iaa_scores_per_concept(path, 1, 2)
-    assert set(per_cui_anno_iia_pct.keys()) == {"C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860", "C0012634", "C0017168", "C0020473", "C0020538", "C0027051", "C0037284", "C0038454", "C0042029", "C0155626", "C0338614", "C0878544"}
-    assert set(per_cui_anno_cohens_kappa.keys()) == {"C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860", "C0012634", "C0017168", "C0020473", "C0020538", "C0027051", "C0037284", "C0038454", "C0042029", "C0155626", "C0338614", "C0878544"}
-    assert set(per_cui_metaanno_iia_pct.keys()) == {"C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860", "C0012634", "C0017168", "C0020473", "C0020538", "C0027051", "C0037284", "C0038454", "C0042029", "C0155626", "C0338614", "C0878544"}
-    assert set(per_cui_metaanno_cohens_kappa.keys()) == {"C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860", "C0012634", "C0017168", "C0020473", "C0020538", "C0027051", "C0037284", "C0038454", "C0042029", "C0155626", "C0338614", "C0878544"}
+    assert set(per_cui_anno_iia_pct.keys()) == {
+        "C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860",
+        "C0012634", "C0017168", "C0020473", "C0020538", "C0027051", "C0037284",
+        "C0038454", "C0042029", "C0155626", "C0338614", "C0878544",
+    }
+    assert set(per_cui_anno_cohens_kappa.keys()) == {
+        "C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860",
+        "C0012634", "C0017168", "C0020473", "C0020538", "C0027051", "C0037284",
+        "C0038454", "C0042029", "C0155626", "C0338614", "C0878544",
+    }
+    assert set(per_cui_metaanno_iia_pct.keys()) == {
+        "C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860",
+        "C0012634", "C0017168", "C0020473", "C0020538", "C0027051", "C0037284",
+        "C0038454", "C0042029", "C0155626", "C0338614", "C0878544",
+    }
+    assert set(per_cui_metaanno_cohens_kappa.keys()) == {
+        "C0003864", "C0007222", "C0007787", "C0010068", "C0011849", "C0011860",
+        "C0012634", "C0017168", "C0020473", "C0020538", "C0027051", "C0037284",
+        "C0038454", "C0042029", "C0155626", "C0338614", "C0878544",
+    }
 
 
 def test_get_iaa_scores_per_concept_and_return_dataframe():
diff --git a/tests/app/processors/test_prefix_cache.py b/tests/app/processors/test_prefix_cache.py
new file mode 100644
index 00000000..9f5a2415
--- /dev/null
+++ b/tests/app/processors/test_prefix_cache.py
@@ -0,0 +1,110 @@
+import torch
+import pytest
+from app.processors.prefix_cache import PrefixCache
+
+
+class _DummyOutputs:
+    def __init__(self, past_key_values):
+        self.past_key_values = past_key_values
+
+
+def _model(past_key_values):
+    class _Model:
+        device = torch.device("cpu")
+
+        def __call__(self, **_kwargs):
+            return _DummyOutputs(past_key_values)
+
+    return _Model()
+
+
+@pytest.fixture
+def tokenizer():
+    class _TokenBatch:
+        def __init__(self):
+            self.input_ids = torch.tensor([[1, 2, 3]])
+            self.attention_mask = torch.tensor([[1, 1, 1]])
+
+        def to(self, _device):
+            return self
+
+    class _Tokenizer:
+        def __call__(self, _text, add_special_tokens=False, return_tensors="pt", padding=False):
+            return _TokenBatch()
+
+    return _Tokenizer()
+
+
+def test_prefix_cache_get_prefix_entry(tokenizer):
+    cache = PrefixCache(max_entries=2)
+    past = ((torch.zeros(1, 2, 3), torch.ones(1, 2, 3)),)
+    model = _model(past)
+
+    first = cache.get_prefix_entry("system prompt", model, tokenizer)
+    second = cache.get_prefix_entry("system prompt", model, tokenizer)
+
+    assert first is not None
+    assert second is not None
+    assert first.past_key_values is second.past_key_values
+
+
+def test_prefix_cache_returns_none_with_no_hit(tokenizer):
+    cache = PrefixCache()
+    model = _model(None)
+
+    entry = cache.get_prefix_entry("system prompt", model, tokenizer)
+
+    assert entry is None
+
+
+def test_expand_past_key_values():
+    past = (
+        (
+            torch.arange(6, dtype=torch.float32).reshape(1, 2, 3),
+            torch.ones(1, 2, 3),
+        ),
+    )
+
+    expanded = PrefixCache.expand_past_key_values(past, batch_size=3)
+
+    assert expanded[0][0].shape[0] == 3
+    assert expanded[0][1].shape[0] == 3
+
+    for b in range(3):
+        assert torch.equal(expanded[0][0][b], past[0][0][0])
+        assert torch.equal(expanded[0][1][b], past[0][1][0])
+
+    assert expanded[0][0].data_ptr() != past[0][0].data_ptr()
+    assert expanded[0][1].data_ptr() != past[0][1].data_ptr()
+
+    t0 = expanded[0][0][0]
+    t1 = expanded[0][0][1]
+    t2 = expanded[0][0][2]
+
+    assert t0.data_ptr() != t1.data_ptr()
+    assert t0.data_ptr() != t2.data_ptr()
+    assert t1.data_ptr() != t2.data_ptr()
+
+    t0_clone = t0.clone()
+    expanded[0][0][0, 0, 0] += 999
+
+    assert torch.equal(past[0][0][0], torch.arange(6, dtype=torch.float32).reshape(2, 3))
+    assert torch.equal(expanded[0][0][1], t1)
+    assert torch.equal(expanded[0][0][2], t2)
+    assert not torch.equal(expanded[0][0][0], t0_clone)
+
+
+def test_expand_past_key_values_batch_size_one():
+    past = (
+        (
+            torch.arange(6, dtype=torch.float32).reshape(1, 2, 3),
+            torch.ones(1, 2, 3),
+        ),
+    )
+
+    expanded = PrefixCache.expand_past_key_values(past, batch_size=1)
+
+    assert torch.equal(expanded[0][0], past[0][0])
+    assert torch.equal(expanded[0][1], past[0][1])
+    assert expanded[0][0].data_ptr() != past[0][0].data_ptr()
+    assert expanded[0][1].data_ptr() != past[0][1].data_ptr()
diff --git a/tests/app/processors/test_tagging.py b/tests/app/processors/test_tagging.py
index 83ab9078..2ff6907f 100644
--- a/tests/app/processors/test_tagging.py
+++ b/tests/app/processors/test_tagging.py
@@ -40,14 +40,14 @@ def test_aggregate_bioes_predictions_single_token_entities(self):
 
         assert len(result) == 2
         assert result[0]["entity_group"] == "DISEASE"
-        assert result[0]["label_name"] == "DISEASE"
+        assert result[0]["label_name"] == "Disease"
         assert result[0]["start"] == 0
         assert result[0]["end"] == 7
         assert result[0]["text"] == "Disease"
         assert result[0]["score"] == 0.9
         assert result[0]["accuracy"] == 0.9
         assert result[1]["entity_group"] == "MEDICATION"
-        assert result[1]["label_name"] == "MEDICATION"
+        assert result[1]["label_name"] == "Medication"
         assert result[1]["start"] == 12
         assert result[1]["end"] == 20
         assert result[1]["text"] == "medicine"
@@ -67,7 +67,7 @@ def test_aggregate_bioes_predictions_multi_token_entities(self):
 
         assert len(result) == 1
         assert result[0]["entity_group"] == "DISEASE"
-        assert result[0]["label_name"] == "DISEASE"
+        assert result[0]["label_name"] == "Disease"
         assert result[0]["start"] == 0
         assert result[0]["end"] == 18
         assert result[0]["text"] == "Heart disease and "
@@ -342,6 +342,7 @@ def test_generate_chuncks_iob_scheme(self, mock_model):
         assert new_tokenized["labels"][6] == 5
         assert new_tokenized["labels"][7] == 6
 
+
     def test_generate_chuncks_iobes_scheme(self, mock_model):
         annotations = [{"start": 5, "end": 15, "cui": "DISEASE"}]
         tokenized = {
diff --git a/tests/app/processors/test_viterbi_decoder.py b/tests/app/processors/test_viterbi_decoder.py
new file mode 100644
index 00000000..c351c767
--- /dev/null
+++ b/tests/app/processors/test_viterbi_decoder.py
@@ -0,0 +1,85 @@
+from app.processors.viterbi_decoder import ViterbiDecoder
+
+
+def test_from_id2label_iob() -> None:
+    """Build a decoder from a three-tag O/B/I map and check the ids and bias it exposes."""
+    # Plain IOB: the map carries no E- or S- tags.
+    id2label = {
+        0: "O",
+        1: "B-LABEL",
+        2: "I-LABEL",
+    }
+
+    biases = {"transition_bias_background_to_start": 1.5}
+    decoder = ViterbiDecoder.from_id2label(id2label, viterbi_biases=biases)
+
+    assert isinstance(decoder, ViterbiDecoder)
+    assert decoder.label_info.background_token_label == 0
+    assert decoder.label_info.boundary_label_lookup["LABEL"]["B"] == 1
+    assert decoder.transition_bias_background_to_start == 1.5
+
+
+def test_from_id2label_iobes() -> None:
+    """Build a decoder from a five-tag O/B/I/E/S map and check the ids and bias it exposes."""
+    # IOBES: the map additionally carries the E- (end) and S- (single) tags.
+    id2label = {
+        0: "O",
+        1: "B-LABEL",
+        2: "I-LABEL",
+        3: "E-LABEL",
+        4: "S-LABEL",
+    }
+
+    biases = {"transition_bias_background_to_start": 1.5}
+    decoder = ViterbiDecoder.from_id2label(id2label, viterbi_biases=biases)
+
+    assert isinstance(decoder, ViterbiDecoder)
+    assert decoder.label_info.background_token_label == 0
+    assert decoder.label_info.boundary_label_lookup["LABEL"]["B"] == 1
+    assert decoder.transition_bias_background_to_start == 1.5
+
+
+def test_apply_viterbi_to_hf_pipeline_output_iob() -> None:
+    """Rewrite pipeline entities from a stubbed IOB decode path and check the offsets."""
+    id2label = dict(enumerate(("O", "B-LABEL", "I-LABEL")))
+    decoder = ViterbiDecoder.from_id2label(id2label)
+    # Stub the decoding step so the tag ids are fixed at 1 then 0 (B-LABEL, O).
+    decoder.decode = lambda _: [1, 0]  # type: ignore[method-assign]
+
+    pipeline_output = [
+        {"entity": "LABEL", "score": 0.9, "index": 0, "start": 0, "end": 5},
+        {"entity": "B-LABEL", "score": 0.8, "index": 1, "start": 6, "end": 10},
+    ]
+
+    corrected = decoder.apply_viterbi_to_hf_pipeline_output(pipeline_output, id2label)
+    first, second = corrected[0], corrected[1]
+
+    # Entities come from the stubbed ids; start/end equal the input values.
+    assert first["entity"] == "B-LABEL"
+    assert second["entity"] == "O"
+    assert first["start"] == 0
+    assert second["end"] == 10
+
+
+def test_apply_viterbi_to_hf_pipeline_output_iobes() -> None:
+    """Rewrite pipeline entities from a stubbed IOBES decode path and check the offsets."""
+    tags = ("O", "B-LABEL", "I-LABEL", "E-LABEL", "S-LABEL")
+    id2label = dict(enumerate(tags))
+    decoder = ViterbiDecoder.from_id2label(id2label)
+    # Stub the decoding step so the tag ids are fixed at 4 then 0 (S-LABEL, O).
+    decoder.decode = lambda _: [4, 0]  # type: ignore[method-assign]
+
+    pipeline_output = [
+        {"entity": "LABEL", "score": 0.9, "index": 0, "start": 0, "end": 5},
+        {"entity": "B-LABEL", "score": 0.8, "index": 1, "start": 6, "end": 10},
+    ]
+
+    corrected = decoder.apply_viterbi_to_hf_pipeline_output(pipeline_output, id2label)
+    first, second = corrected[0], corrected[1]
+
+    # Entities come from the stubbed ids, so the first token is a single-token span;
+    # start/end equal the input values.
+    assert first["entity"] == "S-LABEL"
+    assert second["entity"] == "O"
+    assert first["start"] == 0
+    assert second["end"] == 10
diff --git a/tests/app/test_utils.py b/tests/app/test_utils.py
index 5ec81dea..7e46e59d 100644
--- a/tests/app/test_utils.py
+++ b/tests/app/test_utils.py
@@ -7,6 +7,7 @@
 import tarfile
 import pytest
 import unittest
+from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
 from safetensors.torch import save_file
 from transformers import PreTrainedModel, PreTrainedTokenizer
@@ -38,6 +39,14 @@
     get_prompt_from_messages,
     utilise_local_chat_template,
     ensure_pad_token,
+    extract_tool_calls,
+    extract_json_string,
+    has_turing_generation_gpu,
+    resolve_safe_max_model_length,
+    quantize_and_save_model,
+    parse_label_into_id_and_name,
+    freeze_hf_model_params_by_names,
+    save_model_to_clean_directory,
 )
 from app.exception import ManagedModelException
 from app.domain import Annotation, Entity, PromptMessage, PromptRole
@@ -92,17 +101,73 @@ def test_send_gelf_message(mocker):
 def test_get_func_params_as_dict():
     def func(arg1, arg2=None, arg3="arg3"):
         pass
+
     params = get_func_params_as_dict(func)
     assert params == {"arg2": None, "arg3": "arg3"}
 
 
+def test_extract_json_string():
+    """An object embedded in surrounding text is returned compact, without inner whitespace."""
+    text = '<any> Before extraction |any| {  "temperature": 15 , "city" :  "London"} ! # <any>'
+    assert extract_json_string(text) == '{"temperature":15,"city":"London"}'
+
+
+def test_extract_json_string_malformed():
+    """An unterminated object is returned from its brace onward; brace-free text is stripped."""
+    partial = '<any> Before extraction |any| {\n  "temperature": 15 ,\t "city" :\r ! # <any>'
+    assert extract_json_string(partial) == '{\n  "temperature": 15 ,\t "city" :\r ! # <any>'
+
+    # No brace at all: only the surrounding spaces are expected to go.
+    no_json_string = " No JSON string included "
+    assert extract_json_string(no_json_string) == "No JSON string included"
+
+
+def test_parse_label_into_id_and_name():
+    """Split labels on the default or a custom delimiter, and derive a name when none is found."""
+    label_id, label_name = parse_label_into_id_and_name("C1234|Pretty Name")
+    assert (label_id, label_name) == ("C1234", "Pretty Name")
+
+    # Same label with an explicit ":" delimiter.
+    label_id, label_name = parse_label_into_id_and_name("C1234:Pretty Name", delimiter=":")
+    assert (label_id, label_name) == ("C1234", "Pretty Name")
+
+    # Without a delimiter the whole label is the id and the name is a title-cased variant.
+    label_id, label_name = parse_label_into_id_and_name("no_delimiter-detected")
+    assert (label_id, label_name) == ("no_delimiter-detected", "No Delimiter Detected")
+
+
 def test_json_normalize_medcat_entities():
     medcat_entities_path = os.path.join(os.path.dirname(__file__), "..", "resources", "fixture", "medcat_entities.json")
     with open(medcat_entities_path, "r") as f:
         medcat_entities = json.load(f)
     df = json_normalize_medcat_entities(medcat_entities)
     assert len(df) == 25
-    assert df.columns.tolist() == ["pretty_name", "cui", "type_ids", "types", "source_value", "detected_name", "acc", "context_similarity", "start", "end", "icd10", "opcs4", "ontologies", "snomed", "id", "meta_anns.Presence.value", "meta_anns.Presence.confidence", "meta_anns.Presence.name", "meta_anns.Subject.value", "meta_anns.Subject.confidence", "meta_anns.Subject.name", "meta_anns.Time.value", "meta_anns.Time.confidence", "meta_anns.Time.name"]
+    assert df.columns.tolist() == [
+        "pretty_name",
+        "cui",
+        "type_ids",
+        "types",
+        "source_value",
+        "detected_name",
+        "acc",
+        "context_similarity",
+        "start",
+        "end",
+        "icd10",
+        "opcs4",
+        "ontologies",
+        "snomed",
+        "id",
+        "meta_anns.Presence.value",
+        "meta_anns.Presence.confidence",
+        "meta_anns.Presence.name",
+        "meta_anns.Subject.value",
+        "meta_anns.Subject.confidence",
+        "meta_anns.Subject.name",
+        "meta_anns.Time.value",
+        "meta_anns.Time.confidence",
+        "meta_anns.Time.name",
+    ]
 
 
 def test_json_normalize_trainer_export():
@@ -111,7 +176,34 @@ def test_json_normalize_trainer_export():
         trainer_export = json.load(f)
     df = json_normalize_trainer_export(trainer_export)
     assert len(df) == 30
-    assert df.columns.tolist() == ["id", "user", "cui", "value", "start", "end", "validated", "correct", "deleted", "alternative", "killed", "last_modified", "manually_created", "acc", "meta_anns.Status.name", "meta_anns.Status.value", "meta_anns.Status.acc", "meta_anns.Status.validated", "projects.name", "projects.id", "projects.cuis", "projects.tuis", "projects.documents.id", "projects.documents.name", "projects.documents.text", "projects.documents.last_modified"]
+    assert df.columns.tolist() == [
+        "id",
+        "user",
+        "cui",
+        "value",
+        "start",
+        "end",
+        "validated",
+        "correct",
+        "deleted",
+        "alternative",
+        "killed",
+        "last_modified",
+        "manually_created",
+        "acc",
+        "meta_anns.Status.name",
+        "meta_anns.Status.value",
+        "meta_anns.Status.acc",
+        "meta_anns.Status.validated",
+        "projects.name",
+        "projects.id",
+        "projects.cuis",
+        "projects.tuis",
+        "projects.documents.id",
+        "projects.documents.name",
+        "projects.documents.text",
+        "projects.documents.last_modified",
+    ]
 
 
 def test_json_denormalize():
@@ -141,11 +233,16 @@ def test_filter_by_concept_ids():
 def test_replace_spans_of_concept():
     def transform(source: str) -> str:
         return source.upper()[:-7]
+
     trainer_export_path = os.path.join(os.path.dirname(__file__), "..", "resources", "fixture", "trainer_export.json")
     with open(trainer_export_path, "r") as f:
         trainer_export = json.load(f)
     result = replace_spans_of_concept(trainer_export, "C0017168", transform)
-    updated = [(anno["value"], anno["start"], anno["end"]) for anno in result["projects"][0]["documents"][0]["annotations"] if anno["cui"] == "C0017168"]
+    updated = [
+        (anno["value"], anno["start"], anno["end"])
+        for anno in result["projects"][0]["documents"][0]["annotations"]
+        if anno["cui"] == "C0017168"
+    ]
     assert updated[0][0] == "GASTROESOPHAGEAL"
     assert updated[0][1] == 332
     assert updated[0][2] == 348
@@ -197,27 +294,53 @@ def test_augment_annotations_case_insensitive():
     trainer_export_path = os.path.join(os.path.dirname(__file__), "..", "resources", "fixture", "trainer_export.json")
     with open(trainer_export_path, "r") as f:
         trainer_export = json.load(f)
-    result = augment_annotations(trainer_export, {
-        "00001": [["HiSToRy"]],
-        "00002": [
-            [r"^\d{1,2}\s*$", r"-", r"^\s*\d{1,2}\s*$", r"-", r"^\s*\d{2,4}$"],
-            [r"^\d{1,2}\s*[.\/]\s*\d{1,2}\s*[.\/]\s*\d{2,4}$"],
-            [r"^\d{2,4}\s*$", r"-", r"^\s*\d{1,2}\s*$", r"-", r"^\s*\d{1,2}$"],
-            [r"^\d{2,4}\s*[.\/]\s*\d{1,2}\s*[.\/]\s*\d{1,2}$"],
-            [r"^\d{1,2}$", r"^[-.\/]$", r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)\s*[-.\/]\s*\d{2,4}$"],
-            [r"^\d{2,4}$", r"^[-.\/]$", r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)\s*[-.\/]\s*\d{1,2}$"],
-            [r"^\d{1,2}\s*$", r"-", r"^\s*\d{4}$"],
-            [r"^\d{1,2}\s*[\/]\s*\d{4}$"],
-            [r"^\d{4}\s*$", r"-", r"^\s*\d{1,2}$"],
-            [r"^\d{4}\s*[\/]\s*\d{1,2}$"],
-            [r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)\s*[-.\/]\s*\d{4}$"],
-            [r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)(\s+\d{1,2})*$", r",", r"^\d{4}$"],
-            [r"^\d{4}\s*[-.\/]\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)$"],
-            [r"^\d{4}$", r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)$"],
-            [r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)$", r"^\d{4}$"],
-            [r"^(?:19\d\d|20\d\d)$"],
-        ]
-    }, case_sensitive=False)
+    result = augment_annotations(
+        trainer_export,
+        {
+            "00001": [["HiSToRy"]],
+            "00002": [
+                [r"^\d{1,2}\s*$", r"-", r"^\s*\d{1,2}\s*$", r"-", r"^\s*\d{2,4}$"],
+                [r"^\d{1,2}\s*[.\/]\s*\d{1,2}\s*[.\/]\s*\d{2,4}$"],
+                [r"^\d{2,4}\s*$", r"-", r"^\s*\d{1,2}\s*$", r"-", r"^\s*\d{1,2}$"],
+                [r"^\d{2,4}\s*[.\/]\s*\d{1,2}\s*[.\/]\s*\d{1,2}$"],
+                [
+                    r"^\d{1,2}$",
+                    r"^[-.\/]$",
+                    r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)\s*[-.\/]\s*\d{2,4}$",
+                ],
+                [
+                    r"^\d{2,4}$",
+                    r"^[-.\/]$",
+                    r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)\s*[-.\/]\s*\d{1,2}$",
+                ],
+                [r"^\d{1,2}\s*$", r"-", r"^\s*\d{4}$"],
+                [r"^\d{1,2}\s*[\/]\s*\d{4}$"],
+                [r"^\d{4}\s*$", r"-", r"^\s*\d{1,2}$"],
+                [r"^\d{4}\s*[\/]\s*\d{1,2}$"],
+                [
+                    r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)\s*[-.\/]\s*\d{4}$"
+                ],
+                [
+                    r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)(\s+\d{1,2})*$",
+                    r",",
+                    r"^\d{4}$",
+                ],
+                [
+                    r"^\d{4}\s*[-.\/]\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)$"
+                ],
+                [
+                    r"^\d{4}$",
+                    r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)$",
+                ],
+                [
+                    r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|June|July|August|September|October|November|December)$",
+                    r"^\d{4}$",
+                ],
+                [r"^(?:19\d\d|20\d\d)$"],
+            ],
+        },
+        case_sensitive=False,
+    )
 
     match_count_00001 = 0
     match_count_00002 = 0
@@ -450,10 +573,62 @@ def test_get_prompt_with_chat_template():
             PromptMessage(content="Alright?", role=PromptRole.USER.value),
             PromptMessage(content="Yeah.", role=PromptRole.ASSISTANT.value),
         ]
+        tools = [
+            {
+                "name": "get_weather",
+                "description": "Get weather for a city",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"city": {"type": "string"}},
+                    "required": ["city"],
+                },
+            }
+        ]
 
-        prompt = get_prompt_from_messages(mock_tokenizer, messages)
+        prompt = get_prompt_from_messages(mock_tokenizer, messages, tools=tools)
 
         assert prompt == "Mock chat template applied"
+        mock_tokenizer.apply_chat_template.assert_called_once()
+        call_kwargs = mock_tokenizer.apply_chat_template.call_args.kwargs
+        assert call_kwargs["tools"] == tools
+        assert call_kwargs["tokenize"] is False
+        assert call_kwargs["add_generation_prompt"] is True
+        assert call_kwargs["enable_thinking"] is False
+
+
+def test_extract_tool_calls_gpt_oss():
+    """Two ``functions.<name> json{...}`` calls embedded in plain text are both extracted."""
+    text = (
+        "some text"
+        'functions.get_weather json\n{"city":"London"} \n'
+        'functions.get_date json{"city":"London"} \n'
+        "some other text"
+    )
+    tool_calls = extract_tool_calls(text)
+
+    assert tool_calls
+    assert tool_calls[0]["type"] == "function"
+    functions = [tool_calls[0]["function"], tool_calls[1]["function"]]
+    assert [fn["name"] for fn in functions] == ["get_weather", "get_date"]
+    assert [json.loads(fn["arguments"]) for fn in functions] == [{"city": "London"}] * 2
+
+
+def test_extract_tool_calls_mistral_instruct():
+    """A ``[TOOL_CALLS][...]`` JSON array yields one call per element, keeping each id."""
+    text = (
+        "[TOOL_CALLS]["
+        '{"name":"get_weather","arguments":{"city":"London"},"id":"call_1234"},'
+        '{"name":"get_date","arguments":{"city":"London"},"id":"call_5678"}'
+        "]</s>"
+    )
+    tool_calls = extract_tool_calls(text)
+
+    assert tool_calls[0]["type"] == "function"
+    first, second = tool_calls[0], tool_calls[1]
+    assert [first["id"], second["id"]] == ["call_1234", "call_5678"]
+    assert [first["function"]["name"], second["function"]["name"]] == ["get_weather", "get_date"]
+    for call in (first, second):
+        assert json.loads(call["function"]["arguments"]) == {"city": "London"}
 
 
 def test_get_prompt_with_default_chat_template():
@@ -515,3 +690,248 @@ def test_get_prompt_with_no_messages():
         prompt = get_prompt_from_messages(mock_tokenizer, messages)
 
         assert prompt == "\n<|assistant|>\n"
+
+
+def test_get_prompt_truncates_messages_by_token_limit():
+    """With ``max_input_tokens=5`` the system message and the four latest messages survive."""
+    with patch("transformers.PreTrainedTokenizer") as tok:
+        mock_tokenizer = tok.return_value
+        # No chat template of either kind is configured on the tokenizer mock.
+        mock_tokenizer.chat_template = None
+        mock_tokenizer.default_chat_template = None
+
+        markers = ("S", "U1", "A1", "T1", "A2", "U2", "A3", "U3")
+
+        def _fake_encode(text: str, add_special_tokens: bool = False):
+            # One fake token id per occurrence of a message marker in the text.
+            return list(range(sum(text.count(marker) for marker in markers)))
+
+        mock_tokenizer.encode.side_effect = _fake_encode
+        messages = [
+            PromptMessage(content="S", role=PromptRole.SYSTEM.value),
+            PromptMessage(content="U1", role=PromptRole.USER.value),
+            PromptMessage(content="A1", role=PromptRole.ASSISTANT.value),
+            PromptMessage(content="T1", role=PromptRole.TOOL.value),
+            PromptMessage(content="A2", role=PromptRole.ASSISTANT.value),
+            PromptMessage(content="U2", role=PromptRole.USER.value),
+            PromptMessage(content="A3", role=PromptRole.ASSISTANT.value),
+            PromptMessage(content="U3", role=PromptRole.USER.value),
+        ]
+
+        prompt = get_prompt_from_messages(mock_tokenizer, messages, max_input_tokens=5)
+
+        # Kept: the system message plus the newest four messages.
+        for kept in ("S", "A2", "U2", "A3", "U3"):
+            assert kept in prompt
+        # Dropped: the three oldest non-system messages.
+        for dropped in ("U1", "A1", "T1"):
+            assert dropped not in prompt
+
+
+def test_has_turing_generation_gpu():
+    """False without CUDA or at mocked capability (8, 0); True at mocked capability (7, 5)."""
+    with patch("torch.cuda.is_available", return_value=False):
+        assert has_turing_generation_gpu() is False
+
+    capability_cases = (
+        ((8, 0), False),
+        ((7, 5), True),
+    )
+    for capability, expected in capability_cases:
+        with (
+            patch("torch.cuda.is_available", return_value=True),
+            patch("torch.cuda.get_device_capability", return_value=capability),
+        ):
+            assert has_turing_generation_gpu() is expected
+
+
+def test_resolve_safe_max_model_length():
+    """Check the top-level, nested text_config, seq_length and 512-fallback resolutions."""
+    nested_4k = SimpleNamespace(max_position_embeddings=4096)
+    nested_16k = SimpleNamespace(max_position_embeddings=16384)
+    # Each config lacks (or nulls) one more source than the one before it.
+    cases = [
+        (SimpleNamespace(max_position_embeddings=8192, text_config=nested_4k), 8192),
+        (SimpleNamespace(max_position_embeddings=None, text_config=nested_16k), 16384),
+        (SimpleNamespace(max_position_embeddings=None, text_config=None, seq_length=2048), 2048),
+        (SimpleNamespace(max_position_embeddings=None, text_config=None, seq_length=None), 512),
+    ]
+
+    for config, expected in cases:
+        assert resolve_safe_max_model_length(config) == expected
+
+
+def test_quantize_and_save_model_4bit():
+    """4-bit run: the bnb config gets the 4-bit flags and both artefacts go to the output dir."""
+    with (
+        patch("app.utils.AutoModel") as mock_auto_model,
+        patch("app.utils.AutoTokenizer") as mock_auto_tokenizer,
+        patch("app.utils.BitsAndBytesConfig") as mock_bnb_config,
+        patch("app.utils.has_turing_generation_gpu", return_value=True),
+    ):
+        mock_model, mock_tokenizer = MagicMock(), MagicMock()
+        mock_auto_model.from_pretrained.return_value = mock_model
+        mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
+        source_dir, target_dir = "/path/to/input_model", "/path/to/output_model"
+
+        result = quantize_and_save_model(source_dir, target_dir, load_in_4bit=True, load_in_8bit=False)
+
+        mock_bnb_config.assert_called_once()
+        bnb_kwargs = mock_bnb_config.call_args.kwargs
+        assert bnb_kwargs["load_in_4bit"] is True
+        assert bnb_kwargs["bnb_4bit_use_double_quant"] is True
+        # Model and tokenizer are both saved to the explicit output directory.
+        mock_model.save_pretrained.assert_called_once_with(target_dir)
+        mock_tokenizer.save_pretrained.assert_called_once_with(target_dir)
+        assert result == target_dir
+
+
+def test_quantize_and_save_model_8bit():
+    """8-bit run with no output path: both artefacts are saved back to the input directory."""
+    with (
+        patch("app.utils.AutoModel") as mock_auto_model,
+        patch("app.utils.AutoTokenizer") as mock_auto_tokenizer,
+        patch("app.utils.BitsAndBytesConfig") as mock_bnb_config,
+        patch("app.utils.has_turing_generation_gpu", return_value=False),
+    ):
+        mock_model, mock_tokenizer = MagicMock(), MagicMock()
+        mock_auto_model.from_pretrained.return_value = mock_model
+        mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
+        source_dir = "/path/to/input_model"
+
+        result = quantize_and_save_model(source_dir, None, load_in_4bit=False, load_in_8bit=True)
+
+        mock_bnb_config.assert_called_once()
+        mock_auto_model.from_pretrained.assert_called_once()
+        mock_auto_tokenizer.from_pretrained.assert_called_once()
+        # With the output path given as None, the input directory is the save target.
+        mock_model.save_pretrained.assert_called_once_with(source_dir)
+        mock_tokenizer.save_pretrained.assert_called_once_with(source_dir)
+        assert result == source_dir
+
+
+def test_quantize_and_save_model_exception():
+    """A failure while loading the model surfaces as ``ManagedModelException``."""
+    with patch("app.utils.AutoModel") as mock_auto_model, patch("app.utils.BitsAndBytesConfig"):
+        # Make the model load itself raise.
+        mock_auto_model.from_pretrained.side_effect = Exception("Error")
+
+        with pytest.raises(ManagedModelException) as exc_info:
+            quantize_and_save_model("/path/to/input_model", "/path/to/output_model")
+
+        assert "Error during quantisation and saving of the model" in str(exc_info.value)
+
+
+def test_freeze_hf_model_params_by_names_inclusive():
+    """Inclusive mode freezes the two ``encoder.layer.0`` parameters and leaves the classifier trainable."""
+    class _DummyModel:
+        def __init__(self) -> None:
+            names = ("encoder.layer.0.weight", "encoder.layer.0.bias", "classifier.weight")
+            self._params = {name: torch.nn.Parameter(torch.ones(1)) for name in names}
+
+        def named_parameters(self):
+            yield from self._params.items()
+
+    model = _DummyModel()
+    frozen_params, total_params = freeze_hf_model_params_by_names(model, "encoder.layer.0,  unknown.layer.0", True)
+
+    assert (frozen_params, total_params) == (2, 3)
+    trainable = {name: param.requires_grad for name, param in model._params.items()}
+    assert trainable == {
+        "encoder.layer.0.weight": False,
+        "encoder.layer.0.bias": False,
+        "classifier.weight": True,
+    }
+
+
+def test_freeze_hf_model_params_by_names_exclusive():
+    """Exclusive mode freezes only ``classifier.weight``, the parameter outside the listed names."""
+    class _DummyModel:
+        def __init__(self) -> None:
+            names = ("encoder.layer.0.weight", "encoder.layer.0.bias", "classifier.weight")
+            self._params = {name: torch.nn.Parameter(torch.ones(1)) for name in names}
+
+        def named_parameters(self):
+            yield from self._params.items()
+
+    model = _DummyModel()
+    frozen_params, total_params = freeze_hf_model_params_by_names(model, "encoder.layer.0,  unknown.layer.0", False)
+
+    assert (frozen_params, total_params) == (1, 3)
+    trainable = {name: param.requires_grad for name, param in model._params.items()}
+    assert trainable == {
+        "encoder.layer.0.weight": True,
+        "encoder.layer.0.bias": True,
+        "classifier.weight": False,
+    }
+
+
+def test_freeze_hf_model_params_by_name_regex_inclusive():
+    """A regex name freezes all three ``encoder.layer.N`` parameters; the classifier stays trainable."""
+    class _DummyModel:
+        def __init__(self) -> None:
+            names = ("encoder.layer.0.weight", "encoder.layer.0.bias", "encoder.layer.1.weight", "classifier.weight")
+            self._params = {name: torch.nn.Parameter(torch.ones(1)) for name in names}
+
+        def named_parameters(self):
+            yield from self._params.items()
+
+    model = _DummyModel()
+    # The name is passed as a pattern that covers every numbered encoder layer.
+    frozen_params, total_params = freeze_hf_model_params_by_names(model, "encoder\\.layer\\.[0-9]+", True)
+
+    assert (frozen_params, total_params) == (3, 4)
+    trainable = {name: param.requires_grad for name, param in model._params.items()}
+    assert trainable == {
+        "encoder.layer.0.weight": False,
+        "encoder.layer.0.bias": False,
+        "encoder.layer.1.weight": False,
+        "classifier.weight": True,
+    }
+
+
+def test_freeze_hf_model_params_by_name_regex_exclusive():
+    """With a regex name in exclusive mode only ``classifier.weight`` ends up frozen."""
+    class _DummyModel:
+        def __init__(self) -> None:
+            names = ("encoder.layer.0.weight", "encoder.layer.0.bias", "encoder.layer.1.weight", "classifier.weight")
+            self._params = {name: torch.nn.Parameter(torch.ones(1)) for name in names}
+
+        def named_parameters(self):
+            yield from self._params.items()
+
+    model = _DummyModel()
+    # The name is passed as a pattern that covers every numbered encoder layer.
+    frozen_params, total_params = freeze_hf_model_params_by_names(model, "encoder\\.layer\\.[0-9]+", False)
+
+    assert (frozen_params, total_params) == (1, 4)
+    trainable = {name: param.requires_grad for name, param in model._params.items()}
+    assert trainable == {
+        "encoder.layer.0.weight": True,
+        "encoder.layer.0.bias": True,
+        "encoder.layer.1.weight": True,
+        "classifier.weight": False,
+    }
+
+
+def test_save_model_to_clean_directory():
+    """Files already in the target directory are gone after saving; model and tokenizer are saved there."""
+    model, tokenizer = MagicMock(), MagicMock()
+    stale_files = {"model.safetensors": "old weights", "adapter_config.json": "old adapter"}
+
+    with tempfile.TemporaryDirectory() as model_dir:
+        # Seed the directory with leftovers that the save is expected to clear out.
+        stale_paths = [os.path.join(model_dir, name) for name in stale_files]
+        for path, content in zip(stale_paths, stale_files.values()):
+            with open(path, "w") as f:
+                f.write(content)
+
+        save_model_to_clean_directory(model, tokenizer, model_dir, safe_serialization=True)
+
+        for path in stale_paths:
+            assert not os.path.exists(path)
+        model.save_pretrained.assert_called_once_with(
+            model_dir,
+            safe_serialization=True,
+        )
+        tokenizer.save_pretrained.assert_called_once_with(model_dir)
diff --git a/tests/app/trainers/test_hf_transformer_trainer.py b/tests/app/trainers/test_hf_transformer_trainer.py
index f78b88be..93e7284e 100644
--- a/tests/app/trainers/test_hf_transformer_trainer.py
+++ b/tests/app/trainers/test_hf_transformer_trainer.py
@@ -1,4 +1,5 @@
 import os
+import torch
 from unittest.mock import create_autospec, patch, Mock
 from app.config import Settings
 from app.model_services.huggingface_ner_model import HuggingFaceNerModel
@@ -59,3 +60,131 @@ def test_huggingface_ner_unsupervised_run(mlflow_fixture):
 def test_huggingface_ner_supervised_run(mlflow_fixture):
     with open(os.path.join(data_dir, "trainer_export.json"), "r") as data_file:
         HuggingFaceNerSupervisedTrainer.run(supervised_trainer, {"nepochs": 1, "print_stats": 1}, data_file, 1, "run_id")
+
+
+def test_freeze_all_except_classification_head():
+    """The ``any,except_classifier`` spec leaves only the ``classifier`` and ``score`` weights trainable."""
+    class _DummyModule:
+        # Minimal stand-in exposing just ``parameters()``.
+        def __init__(self, params):
+            self._params = params
+
+        def parameters(self):
+            return self._params
+
+    class _DummyModel:
+        def __init__(self) -> None:
+            self.classifier_w, self.score_w, self.encoder_w = (torch.nn.Parameter(torch.ones(1)) for _ in range(3))
+            self.classifier = _DummyModule([self.classifier_w])
+            self.score = _DummyModule([self.score_w])
+            self._named = [
+                ("encoder.layer.0.weight", self.encoder_w),
+                ("classifier.weight", self.classifier_w),
+                ("score.weight", self.score_w),
+            ]
+
+        def named_parameters(self):
+            yield from self._named
+
+    model = _DummyModel()
+    HuggingFaceNerSupervisedTrainer._freeze_params_or_classifier(model, "any,except_classifier")
+
+    # Only the encoder weight should have been frozen.
+    assert model.encoder_w.requires_grad is False
+    assert model.classifier_w.requires_grad is True
+    assert model.score_w.requires_grad is True
+
+
+@patch("app.trainers.huggingface_ner_trainer.torch.set_num_threads")
+@patch("app.trainers.huggingface_ner_trainer.os.cpu_count", return_value=8)
+def test_get_training_args_uses_cpu_thread_based_batching(mock_cpu_count, mock_set_num_threads):
+    """On ``cpu`` with 8 reported cores: batch size 4, accumulation steps 4, 2 torch threads."""
+    original_device = supervised_trainer._config.DEVICE
+    supervised_trainer._config.DEVICE = "cpu"
+    try:
+        # Echo the keyword arguments back so the computed values can be asserted directly.
+        with patch.object(
+            HuggingFaceNerSupervisedTrainer,
+            "_create_training_arguments",
+            side_effect=lambda **kwargs: kwargs,
+        ):
+            training_args = supervised_trainer._get_training_args(
+                "results", "logs", {"nepochs": 1, "scaling_factor": 3}, 1
+            )
+    finally:
+        # The trainer object is shared across this module's tests; restore it even on failure.
+        supervised_trainer._config.DEVICE = original_device
+
+    assert training_args["per_device_train_batch_size"] == 4
+    assert training_args["per_device_eval_batch_size"] == 4
+    assert training_args["gradient_accumulation_steps"] == 4
+    assert training_args["eval_accumulation_steps"] == 4
+    mock_set_num_threads.assert_called_once_with(2)
+
+
+@patch("app.trainers.huggingface_ner_trainer.torch.set_num_threads")
+@patch("app.trainers.huggingface_ner_trainer.torch.backends.mps.is_available", return_value=False)
+@patch("app.trainers.huggingface_ner_trainer.torch.cuda.is_available", return_value=True)
+@patch("app.trainers.huggingface_ner_trainer.os.cpu_count", return_value=8)
+def test_get_training_args_uses_cuda_scaling_factor_batching(
+    mock_cpu_count,
+    mock_cuda_available,
+    mock_mps_available,
+    mock_set_num_threads,
+):
+    """On ``cuda`` with scaling_factor 3: batch size 6, accumulation steps 3, 4 torch threads."""
+    original_device = supervised_trainer._config.DEVICE
+    supervised_trainer._config.DEVICE = "cuda"
+    try:
+        # Echo the keyword arguments back so the computed values can be asserted directly.
+        with patch.object(
+            HuggingFaceNerSupervisedTrainer,
+            "_create_training_arguments",
+            side_effect=lambda **kwargs: kwargs,
+        ):
+            training_args = supervised_trainer._get_training_args(
+                "results", "logs", {"nepochs": 1, "scaling_factor": 3}, 1
+            )
+    finally:
+        # The trainer object is shared across this module's tests; restore it even on failure.
+        supervised_trainer._config.DEVICE = original_device
+
+    assert training_args["per_device_train_batch_size"] == 6
+    assert training_args["per_device_eval_batch_size"] == 6
+    assert training_args["gradient_accumulation_steps"] == 3
+    assert training_args["eval_accumulation_steps"] == 3
+    mock_set_num_threads.assert_called_once_with(4)
+
+
+@patch("app.trainers.huggingface_ner_trainer.torch.set_num_threads")
+@patch("app.trainers.huggingface_ner_trainer.torch.backends.mps.is_available", return_value=True)
+@patch("app.trainers.huggingface_ner_trainer.torch.cuda.is_available", return_value=False)
+@patch("app.trainers.huggingface_ner_trainer.os.cpu_count", return_value=8)
+def test_get_training_args_caps_mps_batch_size(
+    mock_cpu_count,
+    mock_cuda_available,
+    mock_mps_available,
+    mock_set_num_threads,
+):
+    """On ``mps`` with scaling_factor 10: batch size 8, accumulation steps 2, 4 torch threads."""
+    original_device = supervised_trainer._config.DEVICE
+    supervised_trainer._config.DEVICE = "mps"
+    try:
+        # Echo the keyword arguments back so the computed values can be asserted directly.
+        with patch.object(
+            HuggingFaceNerSupervisedTrainer,
+            "_create_training_arguments",
+            side_effect=lambda **kwargs: kwargs,
+        ):
+            training_args = supervised_trainer._get_training_args(
+                "results", "logs", {"nepochs": 1, "scaling_factor": 10}, 1
+            )
+    finally:
+        # The trainer object is shared across this module's tests; restore it even on failure.
+        supervised_trainer._config.DEVICE = original_device
+
+    assert training_args["per_device_train_batch_size"] == 8
+    assert training_args["per_device_eval_batch_size"] == 8
+    assert training_args["gradient_accumulation_steps"] == 2
+    assert training_args["eval_accumulation_steps"] == 2
+    mock_set_num_threads.assert_called_once_with(4)
diff --git a/tests/integration/features/serving_llm.feature b/tests/integration/features/serving_llm.feature
index 44050c5c..74a8d9cb 100644
--- a/tests/integration/features/serving_llm.feature
+++ b/tests/integration/features/serving_llm.feature
@@ -15,4 +15,151 @@ Feature:
     When I send a POST request with the following prompt
       | endpoint         | prompt                   | content_type |
       | /stream/generate | What is spinal stenosis? | text/plain   |
-    Then the response should contain generated text stream
\ No newline at end of file
+    Then the response should contain generated text stream
+
+  @openai-models
+  Scenario: List OpenAI-compatible models
+    Given CMS LLM app is up and running
+    When I send a GET request to endpoint
+      | endpoint          |
+      | /openai/v1/models |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key    |
+      | object |
+      | data   |
+
+  @openai-chat
+  Scenario: Create OpenAI-compatible chat completion
+    Given CMS LLM app is up and running
+    When I send a POST request with JSON body
+      | endpoint                    | body                                                                                                                                                                             |
+      | /openai/v1/chat/completions | {"messages":[{"role":"system","content":"You are a helpful assistant."},{"role":"user","content":"What is spinal stenosis?"}],"model":"test_model","stream":false} |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key     |
+      | object  |
+      | choices |
+
+  @openai-completions
+  Scenario: Create OpenAI-compatible completion
+    Given CMS LLM app is up and running
+    When I send a POST request with JSON body
+      | endpoint               | body                                                                                 |
+      | /openai/v1/completions | {"model":"test_model","prompt":"What is spinal stenosis?","stream":false} |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key     |
+      | object  |
+      | choices |
+
+  @openai-embeddings
+  Scenario: Create OpenAI-compatible embeddings
+    Given CMS LLM app is up and running
+    When I send a POST request with JSON body
+      | endpoint              | body                                                          |
+      | /openai/v1/embeddings | {"model":"test_model","input":["spinal stenosis"]} |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key    |
+      | object |
+      | data   |
+
+  @ollama-tags
+  Scenario: List Ollama-compatible tags
+    Given CMS LLM app is up and running
+    When I send a GET request to endpoint
+      | endpoint         |
+      | /ollama/api/tags |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key    |
+      | models |
+
+  @ollama-health-check @get
+  Scenario: Ollama-compatible health check with GET
+    Given CMS LLM app is up and running
+    When I send a GET request to endpoint
+      | endpoint |
+      | /ollama/ |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key    |
+      | status |
+
+  @ollama-health-check @head
+  Scenario: Ollama-compatible health check with HEAD
+    Given CMS LLM app is up and running
+    When I send a HEAD request to endpoint
+      | endpoint |
+      | /ollama/ |
+    Then the response status code should be 200
+
+  @ollama-version
+  Scenario: Get Ollama-compatible API version
+    Given CMS LLM app is up and running
+    When I send a GET request to endpoint
+      | endpoint            |
+      | /ollama/api/version |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key     |
+      | version |
+
+  @ollama-show
+  Scenario: Show Ollama-compatible model information
+    Given CMS LLM app is up and running
+    When I send a POST request with JSON body
+      | endpoint         | body                   |
+      | /ollama/api/show | {"model":"test_model"} |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key        |
+      | modelfile  |
+      | model_info |
+
+  @ollama-generate
+  Scenario: Create Ollama-compatible generation
+    Given CMS LLM app is up and running
+    When I send a POST request with JSON body
+      | endpoint             | body                                                                                 |
+      | /ollama/api/generate | {"model":"test_model","prompt":"What is spinal stenosis?","stream":false} |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key      |
+      | response |
+      | done     |
+
+  @ollama-chat
+  Scenario: Create Ollama-compatible chat completion
+    Given CMS LLM app is up and running
+    When I send a POST request with JSON body
+      | endpoint         | body                                                                                                               |
+      | /ollama/api/chat | {"model":"test_model","messages":[{"role":"user","content":"What is spinal stenosis?"}],"stream":false} |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key     |
+      | message |
+      | done    |
+
+  @ollama-embed
+  Scenario: Create Ollama-compatible embeddings
+    Given CMS LLM app is up and running
+    When I send a POST request with JSON body
+      | endpoint          | body                                                          |
+      | /ollama/api/embed | {"model":"test_model","input":["spinal stenosis"]} |
+    Then the response status code should be 200
+    And the response content type should contain application/json
+    And the JSON response should include keys
+      | key        |
+      | embeddings |
diff --git a/tests/integration/helper.py b/tests/integration/helper.py
index 2e965ed8..76f52159 100644
--- a/tests/integration/helper.py
+++ b/tests/integration/helper.py
@@ -4,9 +4,11 @@
 import subprocess
 import tempfile
 import threading
+import time
 from functools import partial, wraps
 from pytest_bdd import parsers
 from urllib.parse import urlparse
+import httpx
 from app.domain import ModelType
 from app.utils import download_model_package
 
@@ -56,6 +58,9 @@ def ensure_app_config(debug_mode=False):
     os.environ["DEBUG"] = "true" if debug_mode else "false"
     os.environ["MLFLOW_TRACKING_URI"] = tempfile.TemporaryDirectory().name
     os.environ["PYTHONUNBUFFERED"] = "1"
+    os.environ["PROCESS_RATE_LIMIT"] = "10000/minute"
+    os.environ["PROCESS_BULK_RATE_LIMIT"] = "10000/minute"
+    os.environ["GENERATION_RATE_LIMIT"] = "10000/minute"
 
 
 def get_logger(debug=False, name="cms-integration"):
@@ -102,7 +107,7 @@ def run(conf, logger, streamable=False, generative=False):
 
         def cms_log_listener(pipe, logger, event):
             for line in iter(pipe.readline, ""):
-                if "Application startup complete" in line:
+                if "Uvicorn running on" in line:
                     event.set()
                 logger.info(line[:-1])
             pipe.close()
@@ -112,7 +117,11 @@ def cms_log_listener(pipe, logger, event):
         logging_thread.daemon = True
         logging_thread.start()
         try:
-            startup_event.wait(timeout=60)
+            timeout = 120
+            if not startup_event.wait(timeout=timeout):
+                raise RuntimeError(f"CMS process was not ready within {timeout} seconds")
+            if conf["process"].poll() is not None:
+                raise RuntimeError("CMS process exited before becoming ready")
             return {
                 "base_url": conf["base_url"],
             }
@@ -123,4 +132,41 @@ def cms_log_listener(pipe, logger, event):
         logger.info("CMS server is up and running")
         return {
             "base_url": conf["base_url"],
-        }
\ No newline at end of file
+        }
+
+
+async def wait_for_server_ready(
+    base_url: str,
+    timeout_secs: int = 60,
+    retry_interval_secs: int = 1,
+) -> None:
+    """
+    Polls the health endpoints of a CMS server until one of them responds.
+
+    Args:
+        base_url (str): The base URL of the server, to which each probe path is appended.
+        timeout_secs (int): The overall time budget in seconds for the server to become ready.
+        retry_interval_secs (int): The pause in seconds between two polling rounds.
+
+    Raises:
+        RuntimeError: If no probe returned a status code below 500 before the deadline.
+    """
+    probe_paths = ("/healthz", "/readyz")
+    give_up_at = time.monotonic() + timeout_secs
+    last_error: str = "Unknown"
+    async with httpx.AsyncClient(timeout=5) as client:
+        while time.monotonic() < give_up_at:
+            for path in probe_paths:
+                try:
+                    response = await client.get(f"{base_url}{path}")
+                except Exception as exc:
+                    # A failed request is recorded for the final error message and polling continues.
+                    last_error = f"{path} connection error: {exc}"
+                    continue
+                # Any answer below 500 (including a 4xx) is treated as the server being up.
+                if response.status_code < 500:
+                    return
+                last_error = f"{path} returned status {response.status_code}"
+            await asyncio.sleep(retry_interval_secs)
+
+    raise RuntimeError(f"CMS server was not ready within {timeout_secs}s ({last_error})")
diff --git a/tests/integration/steps/test_llm_steps.py b/tests/integration/steps/test_llm_steps.py
index 07803c6c..4072c58c 100644
--- a/tests/integration/steps/test_llm_steps.py
+++ b/tests/integration/steps/test_llm_steps.py
@@ -2,10 +2,11 @@
 import pytest
 import requests
 import socket
-from pytest_bdd import scenarios, given, when, then
+from pytest_bdd import scenarios, given, when, then, parsers
 from helper import ensure_app_config, get_logger, data_table, run
 
 
+pytestmark = pytest.mark.timeout(600)
 scenarios("../features/serving_llm.feature")
 ensure_app_config(debug_mode=False)
 logger = get_logger(debug=True, name="cms-integration-llm")
@@ -44,6 +45,29 @@ def send_post_request_prompt(context_llm, request):
         headers={"Content-Type": request[0]["content_type"]},
     )
 
+
+@when(data_table("I send a GET request to endpoint", fixture="request", orient="dict"))
+def send_get_request(context_llm, request):
+    """Send a GET to the endpoint named in the first table row and keep the response."""
+    target_url = f"{context_llm['base_url']}{request[0]['endpoint']}"
+    context_llm["response"] = requests.get(target_url)
+
+
+@when(data_table("I send a HEAD request to endpoint", fixture="request", orient="dict"))
+def send_head_request(context_llm, request):
+    """Send a HEAD to the endpoint named in the first table row and keep the response."""
+    target_url = f"{context_llm['base_url']}{request[0]['endpoint']}"
+    context_llm["response"] = requests.head(target_url)
+
+
+@when(data_table("I send a POST request with JSON body", fixture="request", orient="dict"))
+def send_post_request_json(context_llm, request):
+    """Post the raw JSON body of the first table row to its endpoint and keep the response."""
+    row = request[0]
+    target_url = f"{context_llm['base_url']}{row['endpoint']}"
+    json_headers = {"Content-Type": "application/json"}
+    context_llm["response"] = requests.post(target_url, data=row["body"], headers=json_headers)
+
 @then("the response should contain generated text")
 def check_response_generated_text(context_llm):
     assert context_llm["response"].headers["Content-Type"] == "text/plain; charset=utf-8"
@@ -56,3 +80,27 @@ def check_response_generated_text_stream(context_llm):
     for line in context_llm["response"].iter_lines(decode_unicode=True):
         buffer += line + '\n'
     assert len(buffer) >= 1
+
+
+@then(parsers.parse("the response status code should be {status_code:d}"))
+def check_response_status_code(context_llm, status_code):
+    """Assert that the stored response carries the expected HTTP status code."""
+    response = context_llm["response"]
+    # Surface the response body on failure so a server-side error shows up in the test report.
+    assert response.status_code == status_code, response.text
+
+
+@then(parsers.parse("the response content type should contain {content_type}"))
+def check_response_content_type_contains(context_llm, content_type):
+    """Assert that the Content-Type header of the stored response contains the given text."""
+    # A missing header becomes an assertion failure rather than a KeyError.
+    assert content_type in context_llm["response"].headers.get("Content-Type", "")
+
+
+@then(data_table("the JSON response should include keys", fixture="request", orient="dict"))
+def check_json_response_keys(context_llm, request):
+    """Assert that every key listed in the data table is present in the JSON response."""
+    payload = context_llm["response"].json()
+    # Collect all absent keys so a single failure reports every one of them.
+    missing_keys = [row["key"] for row in request if row["key"] not in payload]
+    assert not missing_keys, f"Keys missing from the JSON response: {missing_keys}"
diff --git a/tests/integration/steps/test_steps.py b/tests/integration/steps/test_steps.py
index a302cba8..fe41011b 100644
--- a/tests/integration/steps/test_steps.py
+++ b/tests/integration/steps/test_steps.py
@@ -8,6 +8,7 @@
 from helper import ensure_app_config, get_logger, download_model, data_table, run
 
 
+pytestmark = pytest.mark.timeout(600)
 scenarios("../features/serving.feature")
 ensure_app_config(debug_mode=False)
 logger = get_logger(debug=True)
@@ -228,13 +229,17 @@ def check_response_evaluation_metrics(context):
     assert context["response"].headers["Content-Type"] == "application/json"
     response_json = context["response"].json()
     assert len(response_json) == 1
-    assert "number_of_names" in response_json[0]
-    assert "number_of_seen_training_examples_in_total" in response_json[0]
-    assert "average_training_examples_per_concept" in response_json[0]
+    assert "concepts" in response_json[0]
     assert "per_concept_train_count_after" in response_json[0]
     assert "per_concept_train_count_before" in response_json[0]
-    assert "number_of_concepts_that_received_training" in response_json[0]
-    assert "number_of_concepts" in response_json[0]
+    assert "system/network_receive_megabytes" in response_json[0]
+    assert "system/system_memory_usage_percentage" in response_json[0]
+    assert "system/disk_available_megabytes" in response_json[0]
+    assert "system/system_memory_usage_megabytes" in response_json[0]
+    assert "system/network_transmit_megabytes" in response_json[0]
+    assert "system/cpu_utilization_percentage" in response_json[0]
+    assert "system/disk_usage_megabytes" in response_json[0]
+    assert "system/disk_usage_percentage" in response_json[0]
     context["response"].close()
 
 @then("the response should contain encrypted labels")
diff --git a/tests/integration/steps/test_stream_steps.py b/tests/integration/steps/test_stream_steps.py
index 083740d9..2ecd1bda 100644
--- a/tests/integration/steps/test_stream_steps.py
+++ b/tests/integration/steps/test_stream_steps.py
@@ -4,9 +4,18 @@
 import socket
 import websockets
 from pytest_bdd import scenarios, given, when, then
-from helper import ensure_app_config, get_logger, download_model, data_table, async_to_sync, run
+from helper import (
+    ensure_app_config,
+    get_logger,
+    download_model,
+    data_table,
+    async_to_sync,
+    run,
+    wait_for_server_ready,
+)
 
 
+pytestmark = pytest.mark.timeout(600)
 scenarios("../features/serving_stream.feature")
 ensure_app_config(debug_mode=False)
 logger = get_logger(debug=True, name="cms-integration-stream")
@@ -38,6 +47,7 @@ def cms_stream_is_running(cms_stream):
 @when(data_table("I send an async POST request with the following jsonlines content", fixture="request", orient="dict"))
 @async_to_sync
 async def send_async_post_request(context_stream, request):
+    await wait_for_server_ready(context_stream["base_url"], timeout_secs=90, retry_interval_secs=1)
     async with httpx.AsyncClient(base_url=context_stream["base_url"]) as ac:
         context_stream["response"] = await ac.post(
             f"{context_stream['base_url']}{request[0]['endpoint']}",
@@ -61,6 +71,7 @@ async def check_response_stream(context_stream):
 @when("I send a piece of text to the WS endpoint")
 @async_to_sync
 async def send_ws_request(context_stream):
+    await wait_for_server_ready(context_stream["base_url"], timeout_secs=90, retry_interval_secs=1)
     ws_url = context_stream["base_url"].replace("http", "ws") + "/stream/ws"
     async with websockets.connect(ws_url) as websocket:
         await websocket.send("Spinal stenosis")
@@ -68,4 +79,4 @@ async def send_ws_request(context_stream):
 
 @then("the response should contain annotated spans")
 def check_response_ws(context_stream):
-    assert context_stream["response"].lower() == "[spinal stenosis: spinal stenosis]"
\ No newline at end of file
+    assert context_stream["response"].lower() == "[spinal stenosis: spinal stenosis]"
diff --git a/uv.lock b/uv.lock
index 3aaaea21..3ffc0fd8 100644
--- a/uv.lock
+++ b/uv.lock
@@ -172,6 +172,25 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
 ]
 
+[[package]]
+name = "anthropic"
+version = "0.71.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "distro" },
+    { name = "docstring-parser" },
+    { name = "httpx" },
+    { name = "jiter" },
+    { name = "pydantic" },
+    { name = "sniffio" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/82/4f/70682b068d897841f43223df82d96ec1d617435a8b759c4a2d901a50158b/anthropic-0.71.0.tar.gz", hash = "sha256:eb8e6fa86d049061b3ef26eb4cbae0174ebbff21affa6de7b3098da857d8de6a", size = 489102, upload-time = "2025-10-16T15:54:40.08Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5d/77/073e8ac488f335aec7001952825275582fb8f433737e90f24eeef9d878f6/anthropic-0.71.0-py3-none-any.whl", hash = "sha256:85c5015fcdbdc728390f11b17642a65a4365d03b12b799b18b6cc57e71fdb327", size = 355035, upload-time = "2025-10-16T15:54:38.238Z" },
+]
+
 [[package]]
 name = "anyio"
 version = "4.12.0"
@@ -186,6 +205,35 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7f/9c/36c5c37947ebfb8c7f22e0eb6e4d188ee2d53aa3880f3f2744fb894f0cb1/anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb", size = 113362, upload-time = "2025-11-28T23:36:57.897Z" },
 ]
 
+[[package]]
+name = "apache-tvm-ffi"
+version = "0.1.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ff/95/ef83880657e89a0ce0f1ad79cbff11698286d00522dbc290d34a8458e9c2/apache_tvm_ffi-0.1.12.tar.gz", hash = "sha256:2aa5c8ece3144dad11afd6d0f10191d03cdb368bbcd9c92f9fb919f35906223d", size = 2843816, upload-time = "2026-06-09T18:17:31.68Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/30/60/7e851d0391d3d39acde7620896255eb1dc289a6dec8d0ced9261929328b0/apache_tvm_ffi-0.1.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cbdadaf5ce64d4c3114b4366a5b685010ffa178f48f8250974e5f1a9b9c81185", size = 2511255, upload-time = "2026-06-09T18:16:17.258Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ec/9e17990a0af9bc6f1621fc07b10868e69040d14fbed5767e8a2eab873836/apache_tvm_ffi-0.1.12-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b17d2480eb2d04d4034669e3bba31527cd1d4900f1f51712cd959f9721bb0beb", size = 2687332, upload-time = "2026-06-09T18:16:19.69Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/a2/71eec92ef1a1bc8f993e742f6b1b8d9adfbd0d7396c8549c2473523077e6/apache_tvm_ffi-0.1.12-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7f9500fc9b1b3315d02602382d13ac976aa1466b2332ff05f74810a6d48821cd", size = 2821872, upload-time = "2026-06-09T18:16:21.741Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/9c/d770a3610bedcfac40ce918cc90f3ba90199cdb322d4c6babdcb690c7cb2/apache_tvm_ffi-0.1.12-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d0db8594244d4393ff6b4fa1c161eee5e4f79f2b86137547a2e5ad9a4cbf431", size = 2600672, upload-time = "2026-06-09T18:16:24.027Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/e4/d88df0b0157f16b5feaef513ce99a712a13baa5beb0b514fbf6702440646/apache_tvm_ffi-0.1.12-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de7807573588d8a74aa5897a07252e882a6e9672ba2ad4afcfeafa136142881c", size = 2785316, upload-time = "2026-06-09T18:16:26.092Z" },
+    { url = "https://files.pythonhosted.org/packages/73/eb/fccd0646d28a2d0003625afaa36b9e95fb6b442037d2b49ffc0e4570f7a6/apache_tvm_ffi-0.1.12-cp310-cp310-win_amd64.whl", hash = "sha256:57d75555e6245e20e2eeaef70abaacb80822790ae4651cea747a0621158053a5", size = 2753670, upload-time = "2026-06-09T18:16:28.198Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/4c/720c64fe82121c1b617cc2a63c3a4f9a8a5c32ced22fd448a89644fd675c/apache_tvm_ffi-0.1.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4e22dd0128bd8b671d19b074201e94d39c8a4580822fc153e593741ef7355ff", size = 2508770, upload-time = "2026-06-09T18:16:30.007Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/9b/39dbc81718ea7e7adb6e6326675c065bf0b90f4ebba6dde878a77b792cda/apache_tvm_ffi-0.1.12-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9892c39a037bcf0e4ca0da1693f1193ccc2c0f02b899402e88011847980d98f6", size = 2687418, upload-time = "2026-06-09T18:16:32.005Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/9e/bc4fdd679105d0617ebf6e89036967d3560ab7e4422a22df741b493470b2/apache_tvm_ffi-0.1.12-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c418fea49b9146d692af40f0b655df68870a032041dd27c0f468d646eda5f8bc", size = 2821462, upload-time = "2026-06-09T18:16:34.202Z" },
+    { url = "https://files.pythonhosted.org/packages/31/5c/2a311cf5bb49575cec99de45887bc76aefa43a0cd3a3f29cb71e92c8100c/apache_tvm_ffi-0.1.12-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0fec41a0633af57bcae552d662cfd096c57db685b752d1898087ca484f060e9f", size = 2598686, upload-time = "2026-06-09T18:16:36.174Z" },
+    { url = "https://files.pythonhosted.org/packages/93/e9/923843463730aa1add10c26b45110fb6a13a68dfdb48e1cd9e325b04e331/apache_tvm_ffi-0.1.12-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:011e372fb9b169c3bd57f63e03fa1383cee6f1ef47a4932c4ff552f29db281c3", size = 2783994, upload-time = "2026-06-09T18:16:38.13Z" },
+    { url = "https://files.pythonhosted.org/packages/63/d9/de68905c24ea6bc1fbed2021b700b9ca27a2b601fb6db43de37132c21407/apache_tvm_ffi-0.1.12-cp311-cp311-win_amd64.whl", hash = "sha256:06bcc161c020dc83e9db33b63b01c21815eab2cca3ca876748664bacd319cf9c", size = 2755540, upload-time = "2026-06-09T18:16:40.028Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/ef/8f2ea57791e8df55c5a52e20d415c01032ef5fa3761574268201b7cc2c79/apache_tvm_ffi-0.1.12-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:218e55c807d49182710ef2ab0336313ba6becccb7e565f4941d23bded09646d4", size = 2463724, upload-time = "2026-06-09T18:16:41.904Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/c4/34aec1f10353eee555687f3196241457b8e8a06da2014a176f5b022e24bd/apache_tvm_ffi-0.1.12-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:557d8deb672f2ad7f445399e3fa0c727a6e11472e19c895ee244cbb8cfd99a66", size = 2616513, upload-time = "2026-06-09T18:16:44.115Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/2a/bff8d73841b49f196852edd8460241d3a363e6b0d64c3a9367542658394f/apache_tvm_ffi-0.1.12-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:817af52916ca9987e019ae9c811406835c7f26c590b2a7bcfa9db0e3809f4228", size = 2757612, upload-time = "2026-06-09T18:16:45.987Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/20/51d0c31c76bef0f21c11bce0465598d1ea5fdaca22e47a69c29deadeada7/apache_tvm_ffi-0.1.12-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a7b08f377ea2663dae10e3045f8d0215f0378ee975096174a8af6381eeb1504", size = 2533956, upload-time = "2026-06-09T18:16:47.891Z" },
+    { url = "https://files.pythonhosted.org/packages/52/f3/fba607d803cb081be2d66ea51865492b42872898bd271d9bcc3e1ced4ef3/apache_tvm_ffi-0.1.12-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc0acd7eeb0e451d5e3f686af3ba0b495fdbf97b5b54cf9a0f770cdafe0e691a", size = 2719638, upload-time = "2026-06-09T18:16:50.021Z" },
+    { url = "https://files.pythonhosted.org/packages/06/d7/a25f51156358c631114e16cb09ca91188b3b79677369e111216d6fa7f83d/apache_tvm_ffi-0.1.12-cp312-abi3-win_amd64.whl", hash = "sha256:23eefd1094a41faae2bb7b9cc5816aa938101b624d48ebb724881f1a89b78e99", size = 2725953, upload-time = "2026-06-09T18:16:52.215Z" },
+]
+
 [[package]]
 name = "argon2-cffi"
 version = "23.1.0"
@@ -224,6 +272,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ee/82/82745642d3c46e7cea25e1885b014b033f4693346ce46b7f47483cf5d448/argon2_cffi_bindings-25.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:da0c79c23a63723aa5d782250fbf51b768abca630285262fb5144ba5ae01e520", size = 29187, upload-time = "2025-07-30T10:02:03.674Z" },
 ]
 
+[[package]]
+name = "astor"
+version = "0.8.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/5a/21/75b771132fee241dfe601d39ade629548a9626d1d39f333fde31bc46febe/astor-0.8.1.tar.gz", hash = "sha256:6a6effda93f4e1ce9f618779b2dd1d9d84f1e32812c23a29b3fff6fd7f63fa5e", size = 35090, upload-time = "2019-12-10T01:50:35.51Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl", hash = "sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5", size = 27488, upload-time = "2019-12-10T01:50:33.628Z" },
+]
+
 [[package]]
 name = "astroid"
 version = "4.0.2"
@@ -298,6 +355,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" },
 ]
 
+[[package]]
+name = "backports-strenum"
+version = "1.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/35/c7/2ed54c32fed313591ffb21edbd48db71e68827d43a61938e5a0bc2b6ec91/backports_strenum-1.3.1.tar.gz", hash = "sha256:77c52407342898497714f0596e86188bb7084f89063226f4ba66863482f42414", size = 7257, upload-time = "2023-12-09T14:36:40.937Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d6/50/56cf20e2ee5127b603b81d5a69580a1a325083e2b921aa8f067da83927c0/backports_strenum-1.3.1-py3-none-any.whl", hash = "sha256:cdcfe36dc897e2615dc793b7d3097f54d359918fc448754a517e6f23044ccf83", size = 8304, upload-time = "2023-12-09T14:36:39.905Z" },
+]
+
 [[package]]
 name = "bcrypt"
 version = "4.1.2"
@@ -346,6 +412,56 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/69/76/bc6460b1618322258e7d251cd0c9d11d98d5232bb37cd507451e40127f8e/bitsandbytes-0.49.0-py3-none-win_amd64.whl", hash = "sha256:57a327c6d65f7eda32eb8d416ef8e44d2415c2e7b4fdb735896abd04171ae696", size = 54700284, upload-time = "2025-12-11T20:50:49.373Z" },
 ]
 
+[[package]]
+name = "blake3"
+version = "1.0.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions", marker = "python_full_version < '3.12'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/6a/4cc5a9dd40fd8a6d283fd3761e5f59c490109571ef8e3c73245417e5a305/blake3-1.0.9.tar.gz", hash = "sha256:5fa374fa5070ca084368776c19b420157eb0f2d3f091343d6bc59189929d62e2", size = 116872, upload-time = "2026-06-22T18:02:25.366Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9b/2f/5398493cef29d9f216be1ff74a303e809e4958a633a44545035a98af4f60/blake3-1.0.9-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:38e61d3b0386af16b3c03a18e0db82b626d63796274637a1fef855fd1c778d82", size = 346497, upload-time = "2026-06-22T17:59:57.448Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/4d/8aeca9a40899258353a8f79ad164fba1184bc1554ca18607cab4671952f3/blake3-1.0.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e9e1d0392624c2f9d049d786f0dc547ce818d2f2b356bcf1c4d74b6f9cc026b4", size = 335390, upload-time = "2026-06-22T17:59:59.162Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/0a/74c67827a9cae097ccab7015018182da9cfec347c686a25ef33faf2f46a1/blake3-1.0.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8114fb2a1f6cba9cba5411d62cbcb283b2205b154d0076f20b77e22592eb2719", size = 378100, upload-time = "2026-06-22T18:00:00.468Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/8e/cef564771169b6fb429d9c52652dd2da8c9bbadb63d2d66f232f8bf045de/blake3-1.0.9-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b985eb08db76550ec97444e03b10acd737baa03fd98aaf3b8455a1c644c8f5d6", size = 377559, upload-time = "2026-06-22T18:00:01.822Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/92/2df756e410d18aba6fef6392b35b835c76412709739a2cde552d246afa4b/blake3-1.0.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a517f0460007edec3767595115c520ed1f157ddd0ed23dddbf6b9d8b0082afb6", size = 451544, upload-time = "2026-06-22T18:00:03.293Z" },
+    { url = "https://files.pythonhosted.org/packages/88/69/44423d63e7c6d09000ce69784dd9fb45bda93237f1d2f611099f5ffe27c7/blake3-1.0.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dad0a8a716dd201860f8e82011a340e6bdd5ee37a8eb4357b48ac64c4e6de1c2", size = 492654, upload-time = "2026-06-22T18:00:04.638Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/02/7ca45b504796a755bcd765e54f0c6762c16a1dac1adec3a03a45ae9c2f12/blake3-1.0.9-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bca166d0b01c00dcf2a936f790ed947bd9079b0a0a7df1b76746f201aa4f4ac4", size = 387295, upload-time = "2026-06-22T18:00:06.026Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/e4/c8fa46a0e24cb877fbf28f839d8ceda39418259f677ec55d680ea433b62b/blake3-1.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa6e5c7533c915a24d840ae4be787e9a6059be7e77944b005b3d967a0257a17d", size = 387632, upload-time = "2026-06-22T18:00:07.349Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/b3/6315be017515868126e106f3dfe50223fbbb87bed67109bfbf883228f505/blake3-1.0.9-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:24acb1e6f31021fa08b7eb31433035facfcf0d82e964170d5eb85a30ce913ba9", size = 384740, upload-time = "2026-06-22T18:00:08.747Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/e8/fe7e40384c0f7995fe8dca57428241768897533b9e17cbc367c1614ef82f/blake3-1.0.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:216977b1d592a60150cd5de64d5853dc6afb0eb522cb387723ae7f78f380d947", size = 553251, upload-time = "2026-06-22T18:00:10.192Z" },
+    { url = "https://files.pythonhosted.org/packages/19/e5/e9ecb843308db2b5ca29d604589a15f50d13c20df792260053bf9f014de4/blake3-1.0.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6f2dd643166dfeb7cf4ad53eb2d801f944d247212d3481950b4d5b4a20551461", size = 595209, upload-time = "2026-06-22T18:00:11.644Z" },
+    { url = "https://files.pythonhosted.org/packages/da/42/201d43f8fee831693f34f7b384a65e41ab7720e6cd8d775cb57d9da69993/blake3-1.0.9-cp310-cp310-win32.whl", hash = "sha256:c755044ba7bec3d03dae44b968194112f0eb0e8c4523465f3dd9e1a87e178d89", size = 231157, upload-time = "2026-06-22T18:00:13.035Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/12/f23a64ba2ef270457345499f857628757fafd83f52274c1588e1b4a5b4c0/blake3-1.0.9-cp310-cp310-win_amd64.whl", hash = "sha256:8cd10c6a421a7d3c81136658e52e9ef58bfcc1df04193466664eb24981784f4c", size = 220829, upload-time = "2026-06-22T18:00:14.298Z" },
+    { url = "https://files.pythonhosted.org/packages/27/12/aa8d72228b6ff61c675bd6f55ab138a91d71499c8a707cc9fb2052f1d2b5/blake3-1.0.9-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f169519c7ef25ef2c446b05e2f08e7e59fae312d569f98a3134b38d4caf7abd4", size = 346253, upload-time = "2026-06-22T18:00:15.537Z" },
+    { url = "https://files.pythonhosted.org/packages/72/3a/820d2f729dfe152d5ebde16390f808c762dce3f21fb764ab033803ff2b1a/blake3-1.0.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b5e1f21b49492d01fa5a02084894c491ab9e7a1867fced107f7126c80d067c94", size = 335497, upload-time = "2026-06-22T18:00:16.942Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/d6/d5462ec19a7f3d084fe327e08618fa107799ee708df04b3a2d620bd62816/blake3-1.0.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ee96daaa850700fd342a811fa10a8780fd2e8464a71b83a1779c7b6becd3dd5", size = 377621, upload-time = "2026-06-22T18:00:18.389Z" },
+    { url = "https://files.pythonhosted.org/packages/92/98/dbc433f2a45be1b2344a6035d4212dfb6e6eb45046ad15103ead9c82d491/blake3-1.0.9-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:09deb024cd75cb200e7f647cd038800e6edc8f190c8188e0c69ec1c2b920e125", size = 377495, upload-time = "2026-06-22T18:00:20.067Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/3d/c7a699fb60d8ed31f3f28e6aec7658d29e45ec89e7054906b3040ce3ee65/blake3-1.0.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c99afb0459c82dd13e456b6b68d45c4768b539ca998dacd3ed726f1e75e91dc", size = 451158, upload-time = "2026-06-22T18:00:21.459Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/a1/0b1b0dbf2dd772483e372237bb65385602b019e24b67424b1fc9e5447837/blake3-1.0.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:28528d1f29e6f3d45faf3482e1197e5e175730eef38bdc74e56ee11b68e0ad0d", size = 491988, upload-time = "2026-06-22T18:00:22.984Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/d1/ed319477f6d263a4f6b7e9aa465b06be5235a854923edbc9ea09508b6638/blake3-1.0.9-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65c0c20014df687694af5ccf0cec3bdb194511da8ebd50c30b0fd55c83fa4fd5", size = 386848, upload-time = "2026-06-22T18:00:24.319Z" },
+    { url = "https://files.pythonhosted.org/packages/80/3e/a4cfb269f3e0955598b415a7843c358c4f79e826e3c9118dc9fb1f101ee6/blake3-1.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:964b642631a3c8fe117b3439c8ae64a9a0981af9444e409656d1f1e464bfa125", size = 387842, upload-time = "2026-06-22T18:00:25.589Z" },
+    { url = "https://files.pythonhosted.org/packages/59/0e/d4ee3d89eece42f86eb46663aa42702000516b7ffbc53f60b918efe95b57/blake3-1.0.9-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:2fd000708662b04be211a22c1095b65fe399d7276e9f3bb2fd1ef8aacc545791", size = 384317, upload-time = "2026-06-22T18:00:26.891Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/aa/317106349d10de3b51332ad1e761f4864ebe887854396b75975304dcfbd1/blake3-1.0.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:82ecade6ac425fdfc39a4371d6d9232fd6e5c28748fd8d3489016ead17407014", size = 553005, upload-time = "2026-06-22T18:00:28.246Z" },
+    { url = "https://files.pythonhosted.org/packages/39/cc/7fbce61a0b24bda1aac99da674bd74ac2b687b61db071c888ffdb30cb47a/blake3-1.0.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b4102ba86b86c992a931b4a88c58a632d6097461e14a1e63ebd2ecb98ff0898f", size = 595086, upload-time = "2026-06-22T18:00:29.96Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/91/6ddc7a8b582a0871f23d6db722f4950a8918096d5fa10f9f0f992c2aea39/blake3-1.0.9-cp311-cp311-win32.whl", hash = "sha256:2f4ce45da903f3d0a7e342fa70c7cce9c10cef6b529eadb4d6213be0ab0eaf84", size = 231230, upload-time = "2026-06-22T18:00:31.247Z" },
+    { url = "https://files.pythonhosted.org/packages/23/68/ea698e6df48eeb417671544cfbb18c60f863cb689306cc52f19666dd98f8/blake3-1.0.9-cp311-cp311-win_amd64.whl", hash = "sha256:d819457dccfd82fe34684ec99e36725f747bd5761a0e17f537387fb31d121193", size = 220622, upload-time = "2026-06-22T18:00:32.495Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/d2/9bdf8345c70993aaef635398f52edfb915d6e8ad2c000c801204e387c456/blake3-1.0.9-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a70c20542d5e7960983a0ff32999049a2b0e5ef1f22dbbbdfb51cf04828a4156", size = 344587, upload-time = "2026-06-22T18:00:34.244Z" },
+    { url = "https://files.pythonhosted.org/packages/36/9d/be8b1f7f85b12bb45a0fade6ca7bdbf83a507d23d0b6141ba29fe69c8cea/blake3-1.0.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:72cdecf088a9d25e6ec79948a578995649b0dbee407e7a46c543a9ecc0f6f281", size = 328864, upload-time = "2026-06-22T18:00:35.59Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/78/66580635d744c826671fd219938caffb16281a26f62c4f856695d4233677/blake3-1.0.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42fa57bf462285ef16400601b0fd32214c248ba92505bbb94b1221ab9af5a092", size = 373795, upload-time = "2026-06-22T18:00:36.887Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/79/b5b17d3004bb81a5732c0b176c812703d200ed8c652b3b7713b9633bbe10/blake3-1.0.9-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b25ccde5a64be070f20e5c7a81da70292db40b164b6c77588cbd6230856badbb", size = 374183, upload-time = "2026-06-22T18:00:38.205Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/63/0d209c44b2041bbe130ced12a23c92dd995fbfe5bce7ee77fffea16f5cb0/blake3-1.0.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a800b87433955f37691b5f361ad29c7dd3ee089c9cd109adc5aea8e24bc4c1f", size = 446783, upload-time = "2026-06-22T18:00:39.493Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/51/efd1f9b8a9d3e9a0e235f3ced99a738529a1019fe78b3988e29d9c2fbba6/blake3-1.0.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6879739e7904b9c42afbedbcc2e8c36cebe140fb3fc3f5c492993579cf5cd516", size = 487369, upload-time = "2026-06-22T18:00:40.875Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/3f/a8dcaea9e0b26e419a540ca0cd6203c9fbb505e85b02b03c5a59bf9e6a45/blake3-1.0.9-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6edeb3d49a24c307995899b70dd47aa901d0e9ad51d2f8a79aba4f074f32d8c5", size = 383845, upload-time = "2026-06-22T18:00:42.251Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/10/e9907f5b86410d5071982aaf05d149ca4d4fd8acab7e77eebbc9a333c7b4/blake3-1.0.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcd56a7a972c4185070f7042ccc20166927eec3c0f98b8405f375d007b604a0b", size = 383851, upload-time = "2026-06-22T18:00:43.715Z" },
+    { url = "https://files.pythonhosted.org/packages/34/cf/c7863a185550706a9624f6aa7b6d46470aaed0bb46a827c5cda2a7d03151/blake3-1.0.9-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:a288664d08dee154cc496e06e62517fc9e655ecec12b0d7db538d244ac79edf1", size = 380067, upload-time = "2026-06-22T18:00:45.249Z" },
+    { url = "https://files.pythonhosted.org/packages/54/0a/e7af679c719368b400c9ba9c3460072aac2ba077ddbd4bc806fef28cda03/blake3-1.0.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:91db52a809b68b5bebe7c413ddcd230e1f759398e7fa7a873104595a4fa648b6", size = 549471, upload-time = "2026-06-22T18:00:46.793Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/3c/37c1dd3539b7bd9b6d2eef019802aacdb4a3d48ab484b140603bbf9c5b5a/blake3-1.0.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cfaa671b07eb73883162ca940442193868358b0b904cfa266e4b74131ce966da", size = 591396, upload-time = "2026-06-22T18:00:48.122Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/55/4f0a23b72795292e74084834130900ea778c0583004519c86698dfffe1a5/blake3-1.0.9-cp312-cp312-win32.whl", hash = "sha256:ae47c3d5729ff89baa6ddf6de47fcfcc915985d39eb1bfcd6db653331f3c6fcc", size = 229271, upload-time = "2026-06-22T18:00:49.377Z" },
+    { url = "https://files.pythonhosted.org/packages/12/91/7db93e4689f0f145bcb954dc62936e5f5090548a9fa20c6bbebfaeaa648a/blake3-1.0.9-cp312-cp312-win_amd64.whl", hash = "sha256:15566065ff90ab3da46ec0be1417406f00507af902b6fb0fbc6563e77f02fc42", size = 218220, upload-time = "2026-06-22T18:00:50.659Z" },
+]
+
 [[package]]
 name = "blinker"
 version = "1.9.0"
@@ -471,6 +587,38 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/96/d32b941a501ab566a16358d68b6eb4e4acc373fab3c3c4d7d9e649f7b4bb/catalogue-2.0.10-py3-none-any.whl", hash = "sha256:58c2de0020aa90f4a2da7dfad161bf7b3b054c86a5f09fcedc0b2b740c109a9f", size = 17325, upload-time = "2023-09-25T06:29:23.337Z" },
 ]
 
+[[package]]
+name = "cbor2"
+version = "6.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/75/af/473c241e41c142ea06ebef8d1f660fa6ff928fb97210e7bec8ee5974f8cd/cbor2-6.1.2.tar.gz", hash = "sha256:6b43037a66947dee5af0abb1a4c3a13b3abac5a4a3f32f9771efbbcd030fd909", size = 86760, upload-time = "2026-06-02T19:01:29.333Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/52/3f/37771defcae022510d640df8e420b7968c01804c084ff8cd2b9021c8873b/cbor2-6.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8ffda338fe434d8d37e92e0d2e8f66432f0aa983f769dd2417f1eb6dfce634d3", size = 412096, upload-time = "2026-06-02T19:00:21.183Z" },
+    { url = "https://files.pythonhosted.org/packages/13/ab/a10563c43a937a5fc0c5c52ee14f8380c7ba66634294759cc3dd3697d521/cbor2-6.1.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:715112c1087bc65f26d50ed4ffaaa214cbd398fbfb0d1a45f7edf555e77c7ca6", size = 457955, upload-time = "2026-06-02T19:00:22.989Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/a9/443cb3f0b086cbb78e3df098bce6f8fb6cabc39b9ea5b46bca27b7adf4ad/cbor2-6.1.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:86b030a6accec1b4a58387e27edb656921c4b6d5d36d60f05d19915526233402", size = 468656, upload-time = "2026-06-02T19:00:24.549Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/ed/2b2446767225078c023fd32523f84dceecb2a94e7ac7259b27d1527a5eac/cbor2-6.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:fe007e47f6edb828cc97af256ce3452f57431cb8841302c3c28543efc7c9e037", size = 523323, upload-time = "2026-06-02T19:00:25.851Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/71/cfe388abc06d59e8393a1a5fa260d5412b5a68963de0ef0e79f6395a3cb7/cbor2-6.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:95e3d99160f105b8b6bccb1033c9c14e8ca7c450d8999363882d87357313b78e", size = 534929, upload-time = "2026-06-02T19:00:27.61Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/e8/8b454b8d405d9a66935b47bf5d9b045147bcf86f7747161598a32e5169ad/cbor2-6.1.2-cp310-cp310-win32.whl", hash = "sha256:464abc44b6863f888c9e263078e52395bddc03f20a3bd59f58fff581788fea51", size = 284490, upload-time = "2026-06-02T19:00:29.347Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/3e/ecce89144cd820ba6f528debedff4948b6022996d3fcc4715e69f6acb483/cbor2-6.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:a3d1699de84d8aec4e9c6c3fdd450d86fac183a542733f0cac36a4317db2375a", size = 301090, upload-time = "2026-06-02T19:00:30.619Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/cd/92f77e8bdfef427c6617cd4b02898e9d88861db8dcc973cc8b2c29a51582/cbor2-6.1.2-cp310-cp310-win_arm64.whl", hash = "sha256:925ebc6d26a0d3aa81377bdcfd8d44d166f4f6a5ee77467a9d6f3ed1487fc499", size = 292330, upload-time = "2026-06-02T19:00:31.946Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/1e/687d7a712755c84a4b823ca79622dceef7ddfb0a3387b6ac1cad10835e07/cbor2-6.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6ce8f6d9e234bdf36b5300bf3da98fafc198b253f8dfe77747327806bdb37d97", size = 411738, upload-time = "2026-06-02T19:00:33.396Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/d3/a96162ac244e074f9c188ffd29c086c51466e71c7c360189f6204900db3d/cbor2-6.1.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9f81ab0e74671b0ff9b7e30386e2ab8d40ee1049d13c1680b57ab1b1cd95c81a", size = 457945, upload-time = "2026-06-02T19:00:34.729Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/f3/7fed7cee8456932d38e7b11d5034470ee9e91378d16f762c552e78df34fa/cbor2-6.1.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:5a429fc61db768c3b4739eb8532556eed86913ad64fe6ebbc1f3a646fb9a4f22", size = 468758, upload-time = "2026-06-02T19:00:35.882Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/e9/bb31f04c5afa53eb55927da1399cc596d7e84e7053de7abf2c3aba0ea3a9/cbor2-6.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:511999bf3310c6641d3d15ee3853daa7ebd6ef3130bb0d63b9a7e2fd720a3714", size = 523169, upload-time = "2026-06-02T19:00:37.422Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/7f/90faf18c280abb49428ed2e78f672ef0c7f6eb1b9b685bc4fe810f2e5e95/cbor2-6.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3099e678283efd2d3cabd6ddcb770da6e2102c0d265f98bca38aa4e720e247cf", size = 534885, upload-time = "2026-06-02T19:00:38.972Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/8a/447aea5da80847bb17ca4718cd4909a2dc8dfe6f68ede4fe29f94b4ca12c/cbor2-6.1.2-cp311-cp311-win32.whl", hash = "sha256:0ef832ac8152ca76a69c184fe401329629b7dfd5fdddd713121bf1ff6d21660f", size = 284601, upload-time = "2026-06-02T19:00:40.426Z" },
+    { url = "https://files.pythonhosted.org/packages/55/95/6239187639a875eb83b924c16f4938d3d735c9c45474008c8b962bd55da2/cbor2-6.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:08cdc03d65e965aafd04c3bf9cb54b8cba55041756bd39d0ba6cd62bd060f959", size = 301284, upload-time = "2026-06-02T19:00:41.693Z" },
+    { url = "https://files.pythonhosted.org/packages/76/cb/e5f92271747a0331ca9151fac4098f8e245f1b09623ddff1258967a35b01/cbor2-6.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:0e2cfd25a395d454990d67148103107293c6506c3b0b15952a6e97f53d23deda", size = 292228, upload-time = "2026-06-02T19:00:43.27Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/0c/a857b6ca032282b564cf25de18ad92fe0614e8b3fa3422eb10e32a873939/cbor2-6.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:92b158d3ff9d9dce70eeb09786a6e518e3cb0ecb927fd23e9a0f7fc4b175c01a", size = 409592, upload-time = "2026-06-02T19:00:44.556Z" },
+    { url = "https://files.pythonhosted.org/packages/29/db/e0518153b3228159d9373f3b5785d7ea2d68898e27ee1bce7d03f0b5f7aa/cbor2-6.1.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d29a11044b07048e19f39a87fe8fea7ea865eb0ace50dc4c29513d52d40e2ddf", size = 454598, upload-time = "2026-06-02T19:00:45.784Z" },
+    { url = "https://files.pythonhosted.org/packages/29/67/62127b22edc6011ba55b76a28ab7c2219a45d01871a8199532e0978b26d1/cbor2-6.1.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a106f174eda34d8937a621c7f3e6044586cb209170cdc8da0ffbea89d1d6e385", size = 467380, upload-time = "2026-06-02T19:00:47.196Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/95/7992d8ec904c116ad547abb4960cc3fde695d5853c66596b1465d14d2f7b/cbor2-6.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2ea16a25cc457a92879ff7a36cc50b587bddba09d8176bf1a94803eec5aa27eb", size = 521672, upload-time = "2026-06-02T19:00:48.656Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/cf/80cc4be132a523f0c92fb4c71813577bb393abea9e27990ca74605e0e930/cbor2-6.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2652a94224980d47f2a3866dd35b1afe532ecdfaf91f8cfcec39a026c457a844", size = 534402, upload-time = "2026-06-02T19:00:50.064Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/ea/99e466d8bef61a0775a1d8538ae6c9d95f4533fadc01f8f7814cb7ab80ad/cbor2-6.1.2-cp312-cp312-win32.whl", hash = "sha256:618666292900487db4a5abcade3150105c9c9fdd22576e6ff297c9a72eef0c6a", size = 283225, upload-time = "2026-06-02T19:00:51.406Z" },
+    { url = "https://files.pythonhosted.org/packages/14/13/e6a677bdc499e43049006cb54fe605b0f7aef621402d31354cc42ef293c9/cbor2-6.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:c61c0b2e2cee64497e6c62d1976bc212f62ac0cd2b5b903613610d79b8b06b60", size = 300844, upload-time = "2026-06-02T19:00:52.628Z" },
+    { url = "https://files.pythonhosted.org/packages/77/4a/08bd8461f8e2e1ce1de5ae2768f2b7ca39a090e3156c1ee0d9b5fd86e70d/cbor2-6.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c871e7266ddc545b258e6f8e5300396985dc485d7ccf8bb4777385782f302153", size = 289040, upload-time = "2026-06-02T19:00:53.971Z" },
+]
+
 [[package]]
 name = "certifi"
 version = "2025.11.12"
@@ -671,7 +819,8 @@ dev = [
     { name = "httpx" },
     { name = "locust" },
     { name = "mypy" },
-    { name = "openai" },
+    { name = "openai", version = "2.13.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "openai", version = "2.43.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "pytest-bdd" },
@@ -694,6 +843,10 @@ docs = [
 llm = [
     { name = "bitsandbytes" },
     { name = "kernels" },
+    { name = "langchain-core", marker = "python_full_version >= '3.11'" },
+    { name = "langchain-nvidia-ai-endpoints", marker = "python_full_version >= '3.11'" },
+    { name = "langchain-openai", marker = "python_full_version >= '3.11'" },
+    { name = "lm-format-enforcer" },
     { name = "triton", marker = "sys_platform == 'linux'" },
     { name = "trl" },
 ]
@@ -702,6 +855,9 @@ mcp = [
     { name = "loguru" },
     { name = "mcp", extra = ["cli"] },
 ]
+vllm = [
+    { name = "vllm" },
+]
 
 [package.dev-dependencies]
 dev = [
@@ -709,7 +865,9 @@ dev = [
     { name = "httpx" },
     { name = "locust" },
     { name = "mypy" },
-    { name = "openai" },
+    { name = "ollama" },
+    { name = "openai", version = "2.13.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "openai", version = "2.43.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "pytest-bdd" },
@@ -732,6 +890,10 @@ docs = [
 llm = [
     { name = "bitsandbytes" },
     { name = "kernels" },
+    { name = "langchain-core", marker = "python_full_version >= '3.11'" },
+    { name = "langchain-nvidia-ai-endpoints", marker = "python_full_version >= '3.11'" },
+    { name = "langchain-openai", marker = "python_full_version >= '3.11'" },
+    { name = "lm-format-enforcer" },
     { name = "triton", marker = "sys_platform == 'linux'" },
     { name = "trl" },
 ]
@@ -740,6 +902,9 @@ mcp = [
     { name = "loguru" },
     { name = "mcp", extra = ["cli"] },
 ]
+vllm = [
+    { name = "vllm" },
+]
 
 [package.metadata]
 requires-dist = [
@@ -760,11 +925,15 @@ requires-dist = [
     { name = "huggingface-hub", specifier = "~=0.34.0" },
     { name = "ijson", specifier = "~=3.1.4" },
     { name = "kernels", marker = "extra == 'llm'", specifier = "~=0.11.7" },
+    { name = "langchain-core", marker = "python_full_version >= '3.11' and extra == 'llm'", specifier = "~=1.2.9" },
+    { name = "langchain-nvidia-ai-endpoints", marker = "python_full_version >= '3.11' and extra == 'llm'", specifier = "~=1.0.4" },
+    { name = "langchain-openai", marker = "python_full_version >= '3.11' and extra == 'llm'", specifier = "~=1.1.8" },
+    { name = "lm-format-enforcer", marker = "extra == 'llm'", specifier = "~=0.11.3" },
     { name = "locust", marker = "extra == 'dev'", specifier = "<2.32.0" },
     { name = "loguru", marker = "extra == 'mcp'", specifier = "~=0.7.3" },
     { name = "mcp", extras = ["cli"], marker = "extra == 'mcp'", specifier = "==1.26.0" },
     { name = "medcat", extras = ["deid", "meta-cat", "rel-cat", "spacy"], specifier = "~=2.3.0" },
-    { name = "mlflow", specifier = "~=2.16.2" },
+    { name = "mlflow", specifier = "~=2.22.1" },
     { name = "mypy", marker = "extra == 'dev'", specifier = "~=1.18.0" },
     { name = "openai", marker = "extra == 'dev'", specifier = ">=1.84.0" },
     { name = "peft", specifier = "<0.14.0" },
@@ -799,9 +968,10 @@ requires-dist = [
     { name = "types-requests", marker = "extra == 'dev'", specifier = ">=2.31.0.6" },
     { name = "types-toml", marker = "extra == 'dev'", specifier = "==0.10.8.20240310" },
     { name = "uvicorn", specifier = "~=0.31.1" },
+    { name = "vllm", marker = "extra == 'vllm'", specifier = "<0.15.0" },
     { name = "websockets", specifier = "~=12.0" },
 ]
-provides-extras = ["dev", "docs", "llm", "mcp"]
+provides-extras = ["dev", "docs", "llm", "mcp", "vllm"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -809,6 +979,7 @@ dev = [
     { name = "httpx", specifier = "~=0.27.1" },
     { name = "locust", specifier = "<2.32.0" },
     { name = "mypy", specifier = "~=1.18.0" },
+    { name = "ollama", specifier = ">=0.6.0" },
     { name = "openai", specifier = ">=1.84.0" },
     { name = "pytest", specifier = "~=7.1.2" },
     { name = "pytest-asyncio", specifier = "~=0.23.7" },
@@ -832,6 +1003,10 @@ docs = [
 llm = [
     { name = "bitsandbytes", specifier = "==0.49.0" },
     { name = "kernels", specifier = "~=0.11.7" },
+    { name = "langchain-core", marker = "python_full_version >= '3.11'", specifier = "~=1.2.9" },
+    { name = "langchain-nvidia-ai-endpoints", marker = "python_full_version >= '3.11'", specifier = "~=1.0.4" },
+    { name = "langchain-openai", marker = "python_full_version >= '3.11'", specifier = "~=1.1.8" },
+    { name = "lm-format-enforcer", specifier = "~=0.11.3" },
     { name = "triton", marker = "sys_platform == 'linux'", specifier = "~=3.5.0" },
     { name = "trl", specifier = "~=0.15.0" },
 ]
@@ -840,6 +1015,7 @@ mcp = [
     { name = "loguru", specifier = "~=0.7.3" },
     { name = "mcp", extras = ["cli"], specifier = "==1.26.0" },
 ]
+vllm = [{ name = "vllm", specifier = "<0.15.0" }]
 
 [[package]]
 name = "colorama"
@@ -850,6 +1026,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
 ]
 
+[[package]]
+name = "compressed-tensors"
+version = "0.12.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "loguru" },
+    { name = "pydantic" },
+    { name = "torch" },
+    { name = "transformers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a2/79/4c5c1cd14266f8cf2650bdb940f986ce7fcaeb56aad8cfa9e9afedf14e2f/compressed_tensors-0.12.2.tar.gz", hash = "sha256:5bb40856dd17f128ab73557ecc73799f80db4dd82fab6de875f1e6899b9ea0c4", size = 190409, upload-time = "2025-10-07T14:30:59.302Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f0/c0/1695b87d369e6652ec0d650912e02eca2151c5e9c29244f94d2afccfe970/compressed_tensors-0.12.2-py3-none-any.whl", hash = "sha256:e554ea761710ca2b0c0ea49276a4ef8e08658624f1591e6a7368817106b48fbe", size = 183049, upload-time = "2025-10-07T14:30:56.523Z" },
+]
+
 [[package]]
 name = "confection"
 version = "0.1.5"
@@ -1071,6 +1262,87 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0d/c3/e90f4a4feae6410f914f8ebac129b9ae7a8c92eb60a638012dde42030a9d/cryptography-46.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6b5063083824e5509fdba180721d55909ffacccc8adbec85268b48439423d78c", size = 3438528, upload-time = "2025-10-15T23:18:26.227Z" },
 ]
 
+[[package]]
+name = "cuda-bindings"
+version = "13.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cuda-pathfinder" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a9/21/8464d133752951c154feafb3b65c297e7d80f301183d220bec4c830f1441/cuda_bindings-13.3.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:120fcc53d57903df529c3486962c56528cba5b7d6c57c99537320ed9922c8b86", size = 6073403, upload-time = "2026-05-29T23:11:36.22Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/1f/5ef51f5fbaa5d4d3201bb3d7555af028ec1aa4416275ccbf73c9e34e3d2d/cuda_bindings-13.3.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9851b0caa8bfd3bc6fa054eaf57bea7c8e9c3a62db2d2621224677f49f3c53d0", size = 6675244, upload-time = "2026-05-29T23:11:38.664Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/64/bb17e4d168569ef7be05c44474fe3dc19278d60a69ba228e45a431c86444/cuda_bindings-13.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:c0c4b1a995098c46695c24257a342dc97d6e6d3f3050b944c9f43bd26d734051", size = 5625597, upload-time = "2026-05-29T23:11:40.808Z" },
+    { url = "https://files.pythonhosted.org/packages/51/6b/457ca12dad3ee9bfcc9a545cfd6b64b359ba49de40f776f6e028e678f262/cuda_bindings-13.3.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c5879712accf6e14bb01aa5e67440eb84998b8d104b509cc7a6dc0b8f656a474", size = 6053539, upload-time = "2026-05-29T23:11:43.19Z" },
+    { url = "https://files.pythonhosted.org/packages/95/7a/c5e3c34a409b148f5c0f5a4ea374158f95d488862c1dffedf9aa5c639df9/cuda_bindings-13.3.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:04436a9364059c84b8f9636f359eccda1cf814341f5b670c71d80d2f79dbc708", size = 6674166, upload-time = "2026-05-29T23:11:45.478Z" },
+    { url = "https://files.pythonhosted.org/packages/93/f7/0e35987a21914f84068061dcf4b61466ccbce1c62ddc9727596d5ed0c26f/cuda_bindings-13.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:507b0e19e7f934c5e30f30f0244ad70a75812619a7d3a0d742543caae1bd50f1", size = 5664286, upload-time = "2026-05-29T23:11:47.719Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/67/5e7dba1ba576dd73da5dee894ca076ca5e959450dfff66d6d510a255d1f7/cuda_bindings-13.3.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7855c4868aabc0cfae28abbe83d56734bdfbd08f08fc234ac1912a12858bf49", size = 6025351, upload-time = "2026-05-29T23:11:49.685Z" },
+    { url = "https://files.pythonhosted.org/packages/39/2a/6d2e9047d1fb243dbaa364b01e0297534b9ed7fd27dba1c9f361519cf69b/cuda_bindings-13.3.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e32d08f71ebcdf00f0f41eab2eb37e8da94c8ed411cc9f7f7a019ce6b34abe3a", size = 6657965, upload-time = "2026-05-29T23:11:52.227Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/95/872a0392122f1fb43fcb06869790ef3171f37beee9f7db8f441739113570/cuda_bindings-13.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:b134dd8c5c66ae4c4ad814f7aee88fd215353c077010cbc47e3b55ed35ec9eff", size = 5875099, upload-time = "2026-05-29T23:11:54.635Z" },
+]
+
+[[package]]
+name = "cuda-core"
+version = "1.0.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "backports-strenum", marker = "python_full_version < '3.11'" },
+    { name = "cuda-pathfinder" },
+    { name = "numpy" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/90/21/ef85f3e15d394c9ca41fe116d78cd9e28533b9d7ead842f9241b332acf01/cuda_core-1.0.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d9632db74eceb1cd72a7c95b61a5e4cfb9cc2291de0503e170334d936cab3316", size = 4788165, upload-time = "2026-05-12T20:11:17.116Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/41/c2c07b313c6cbb5d93010200c62b01ddb9f6c6f43a096a75c7b902c42ad6/cuda_core-1.0.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46690b0864417a5f2f9a7d10408e2570cbacae195c890a41286701eefb01ba79", size = 5061723, upload-time = "2026-05-12T20:11:19.767Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/2d/b16b0af698a1bf4db337345daa7a44cd372fef107a3b692ffe1e0e6c5cc5/cuda_core-1.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:1693fae113604cf9c114bfe3da15a15981f9d44ae78ecceb0b23e24c628ad19b", size = 4742003, upload-time = "2026-05-12T20:11:21.943Z" },
+    { url = "https://files.pythonhosted.org/packages/41/4b/4ac1d0639241da756c634add606f93a7f3a39bef12f70e1fb4b40cc53c21/cuda_core-1.0.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3effd11283bc46fd06348c2fd18a0941ba7718a6f447343858c944c1a93a6dab", size = 4784340, upload-time = "2026-05-12T20:11:23.961Z" },
+    { url = "https://files.pythonhosted.org/packages/01/55/bb3e701f4af504e5e39e837135dc80022ec4c84858b2886ad577fe696a77/cuda_core-1.0.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1934517ff8a9dcd21b3f4a28e15e12643164b7d3ec187a4ee7560e22fd2dfc17", size = 5059041, upload-time = "2026-05-12T20:11:26.045Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/e3/3ffaca2eabc71d0f9d29368fabc8ffb309353f05f418ea4c7eb5f223cf09/cuda_core-1.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:95c91d434a9baca066646cefa577227385104670a02fbe8e3defaadda84becf5", size = 4746198, upload-time = "2026-05-12T20:11:28.405Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/a0/1daeae599cadd612689dbbf70d7da1c01883964fc2fbc7386f3c630a68cf/cuda_core-1.0.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6816dc020aee6103d8071bc02d8e4e1d91f2b49596f666896d608d92224d79d1", size = 4789856, upload-time = "2026-05-12T20:11:30.862Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/4d/603557ab3cb171cc2a61d3678a39cb4dae3fd21275078bfbd1c0b0b5230b/cuda_core-1.0.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be7b65311bf78964b7905adbf3c0f8f717d432f2854dc45169277729bf60f1e2", size = 5106023, upload-time = "2026-05-12T20:11:33.509Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/1a/ae079963c9df7f4274227eb63cf8f6083a532a6443adb340d951fd21c626/cuda_core-1.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:1a5c1aa3b738a7599ea289498d038fe625d259fd7ab795394541eee58a8e29bc", size = 4663076, upload-time = "2026-05-12T20:11:35.784Z" },
+]
+
+[[package]]
+name = "cuda-pathfinder"
+version = "1.5.5"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/11/c8/26f2e4aae92f11522a96043892ba39a90eac610d5242523aa863212bc1c7/cuda_pathfinder-1.5.5-py3-none-any.whl", hash = "sha256:0228c023f95d1480f143ef5c8922d27a2ab052087a942e81dc289c9eb8f91689", size = 51671, upload-time = "2026-05-27T01:21:25.413Z" },
+]
+
+[[package]]
+name = "cuda-python"
+version = "13.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cuda-bindings" },
+    { name = "cuda-core" },
+    { name = "cuda-pathfinder" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/38/31/7ff3f7768eded7535c621abc2fecb9d181a34ea4cae3afe682feb796f242/cuda_python-13.3.1-py3-none-any.whl", hash = "sha256:280b014139ab447b6dd70a377db1596f310d6e887d9d342e6651b919ec145fb3", size = 8295, upload-time = "2026-05-29T23:28:47.012Z" },
+]
+
+[[package]]
+name = "cupy-cuda12x"
+version = "14.1.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cuda-pathfinder" },
+    { name = "numpy" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/dd/18/8ec57a901a11d6955f90e1fbf3e04c8f26721066c99dfa25276e3e3b1f1d/cupy_cuda12x-14.1.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:909c4b8ac05eee43edfbe791522ee5d593e3504be7bd5c20e2de12b050db2a26", size = 143787561, upload-time = "2026-06-01T04:51:46.125Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/79/6a4e1562b3b6b18e93365955adfd4f66a84b60bdacf559becc0e3e0f1012/cupy_cuda12x-14.1.1-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:71b8de628a4a9ab24b6cc2af2162db2898e65a44a50a6d79cbc131c4f36405de", size = 132662808, upload-time = "2026-06-01T04:51:54.719Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/df/39530cffd84a00dfe98484a9ece77eed4acdc14717e1f77fa2a3e82a40dc/cupy_cuda12x-14.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:519e50b7dec2400e3fddbe9e6c4066937fd622a16774f375ab5be9d1cb1ea05e", size = 95338688, upload-time = "2026-06-01T04:51:59.685Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/65/13c173fec4923b9c4e1573344fc4a5585bf0e4efe5d9a5632e9bb18b2a31/cupy_cuda12x-14.1.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:5d4c1c74f9f7fc9de0aa5781cf3ec54f9f05143f5761e21a8798772c8eedd0be", size = 145089362, upload-time = "2026-06-01T04:52:05.643Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/5e/ccd2fea320ece269dd7237649da384cad71fbb1ba30937a1eb3311c31b77/cupy_cuda12x-14.1.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:8889cb83dbb7dbea593e60c85fcc91e21b0ccd10cd5380dfdfaac70b6bd9390a", size = 134012855, upload-time = "2026-06-01T04:52:11.526Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/59/93970d536e8401cf31d8f5602141f1c2edfc304e6d6b8702041688509509/cupy_cuda12x-14.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:e39a081fc4fe2166f95ef8be38fe2a95b2c4decb3ec991b4f26bfc9673d16b17", size = 95336905, upload-time = "2026-06-01T04:52:17.262Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/6e/290ee2d7cc4ad63d66e67acfd7ff3026f2b648dd04449a1bf88ffaa36b1e/cupy_cuda12x-14.1.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:7aae7d3bed37985e2aa39f0914b88ad90dbd3a6141d3e8198d73fce65859013c", size = 144383812, upload-time = "2026-06-01T04:52:23.799Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/6e/dc03c1ddc940f33b3d32803898e2fdae5c9538a2127a25f499494c84b183/cupy_cuda12x-14.1.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:a1138f20080489a46209291498cd12f792226d0a57d50c64a586c162a875a069", size = 133516927, upload-time = "2026-06-01T04:52:35.765Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/da/d4a8045b533af634bc791572e8c87981065e4a27b5d3e09d0d4d285742fd/cupy_cuda12x-14.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:85bebce86ffc25ecf31727b25da7b3793daf07b6fd9952704546af574d250988", size = 95238722, upload-time = "2026-06-01T04:52:46.296Z" },
+]
+
 [[package]]
 name = "cycler"
 version = "0.12.1"
@@ -1114,16 +1386,17 @@ wheels = [
 
 [[package]]
 name = "databricks-sdk"
-version = "0.74.0"
+version = "0.76.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "google-auth" },
-    { name = "protobuf" },
+    { name = "protobuf", version = "4.25.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "protobuf", version = "5.29.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "requests" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fd/69/c91d6730adf3f335f30c3e2e3cab7ad56eea84aff159e03a3af3afa9803f/databricks_sdk-0.74.0.tar.gz", hash = "sha256:321c758c14937ca7ad106d262219a03efaedfd18e2c5a75b3908c882970376ac", size = 811966, upload-time = "2025-12-10T12:01:30.831Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/70/82/5efcfdca8779c84b5c6f61cc110d0938c9818e422f55c36a68d96b98c61f/databricks_sdk-0.76.0.tar.gz", hash = "sha256:fcfce4561b090b3c8e9cac2101f549766d9fb3bece31bb5720571919fa37d210", size = 822376, upload-time = "2025-12-17T17:11:31.907Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ae/6b/e75557df7c8210a6b362640251f85fdbfd8cd1e3a220ca4a0e3b0e27aa27/databricks_sdk-0.74.0-py3-none-any.whl", hash = "sha256:c04c5ed14bcc5a8df3e630088050adff54bf06dd4adf2ecb6bef6e68e5e545e6", size = 764214, upload-time = "2025-12-10T12:01:29.033Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/96/ee7742b94f996560c57d6fb8d2e10eab3c489e8a72187369ed0917baf8aa/databricks_sdk-0.76.0-py3-none-any.whl", hash = "sha256:6696dda22bc52c8f50a50d24e6ccd1c855f92c0f68f5afe4eb2e77d5b1b1a65f", size = 774688, upload-time = "2025-12-17T17:11:29.925Z" },
 ]
 
 [[package]]
@@ -1163,6 +1436,28 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" },
 ]
 
+[[package]]
+name = "depyf"
+version = "0.20.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "astor" },
+    { name = "dill" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/88/35/83fb0178212279aa0af031031905804c6de5618435d229f41ed21bb9ad2c/depyf-0.20.0.tar.gz", hash = "sha256:fb7683bd72c44f67b56029df2c47721e9a02ffa4d7b19095f1c54c4ebf797a98", size = 6168761, upload-time = "2025-10-13T12:33:38.589Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cf/65/4df6936130b56e1429114e663e7c1576cf845f3aef1b2dd200c0a5d19dba/depyf-0.20.0-py3-none-any.whl", hash = "sha256:d31effad4261cebecb58955d832e448ace88f432328f95f82fd99c30fd9308d4", size = 39381, upload-time = "2025-10-13T12:33:33.647Z" },
+]
+
+[[package]]
+name = "detect-installer"
+version = "0.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/5f/ce/6897d812825e9d4c53e3c7112726e800cc5231b013b2223bf64f653ff362/detect_installer-0.1.0.tar.gz", hash = "sha256:00ad7ba0a36e3cf7d08a40d3643011746dbc112597c7d475cc91c416710ca4e7", size = 3049, upload-time = "2026-02-23T10:40:22.567Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cc/34/8cc73273414405086c58852916e4031812a6a30fe04c057e37ad99397b7f/detect_installer-0.1.0-py3-none-any.whl", hash = "sha256:034fb20fd665c36e6ba52b8821525ea07fb4f7f938cac459df889fb33801528a", size = 4539, upload-time = "2026-02-23T10:40:23.807Z" },
+]
+
 [[package]]
 name = "dill"
 version = "0.3.8"
@@ -1172,6 +1467,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", size = 116252, upload-time = "2024-01-27T23:42:14.239Z" },
 ]
 
+[[package]]
+name = "diskcache"
+version = "5.6.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916, upload-time = "2023-08-31T06:12:00.316Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" },
+]
+
 [[package]]
 name = "distro"
 version = "1.9.0"
@@ -1204,6 +1508,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" },
 ]
 
+[[package]]
+name = "docstring-parser"
+version = "0.18.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/4d/f332313098c1de1b2d2ff91cf2674415cc7cddab2ca1b01ae29774bd5fdf/docstring_parser-0.18.0.tar.gz", hash = "sha256:292510982205c12b1248696f44959db3cdd1740237a968ea1e2e7a900eeb2015", size = 29341, upload-time = "2026-04-14T04:09:19.867Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" },
+]
+
 [[package]]
 name = "docutils"
 version = "0.20.1"
@@ -1213,6 +1526,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/26/87/f238c0670b94533ac0353a4e2a1a771a0cc73277b88bff23d3ae35a256c1/docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6", size = 572666, upload-time = "2023-05-16T23:39:15.976Z" },
 ]
 
+[[package]]
+name = "einops"
+version = "0.8.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" },
+]
+
 [[package]]
 name = "email-validator"
 version = "2.1.2"
@@ -1275,6 +1597,57 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ed/47/14a76b926edc3957c8a8258423db789d3fa925d2fed800102fce58959413/fastapi-0.120.4-py3-none-any.whl", hash = "sha256:9bdf192308676480d3593e10fd05094e56d6fdc7d9283db26053d8104d5f82a0", size = 108235, upload-time = "2025-10-31T18:37:27.038Z" },
 ]
 
+[package.optional-dependencies]
+standard = [
+    { name = "email-validator" },
+    { name = "fastapi-cli", extra = ["standard"] },
+    { name = "httpx" },
+    { name = "jinja2" },
+    { name = "python-multipart" },
+    { name = "uvicorn", extra = ["standard"] },
+]
+
+[[package]]
+name = "fastapi-cli"
+version = "0.0.27"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "rich-toolkit" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
+    { name = "typer" },
+    { name = "uvicorn", extra = ["standard"] },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/37/d0/ee5678346811967b8d096d5d5604e71b50d6bf5a2abfbdb331157e2bbaa9/fastapi_cli-0.0.27.tar.gz", hash = "sha256:1dffb1e40c0c88f2e0171a8a252a2b615c1e63ff8c05626649e4badd6a84336a", size = 23630, upload-time = "2026-06-18T14:48:43.421Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1a/ab/0a709f9488fe62647db80f8a277fb0ee62e85adc6746abf477ed373c9eb7/fastapi_cli-0.0.27-py3-none-any.whl", hash = "sha256:2e389a40f318e29fec8cb1e289f267f17c048876fb82dbfa869a10b16740495d", size = 13070, upload-time = "2026-06-18T14:48:44.311Z" },
+]
+
+[package.optional-dependencies]
+standard = [
+    { name = "fastapi-cloud-cli" },
+    { name = "uvicorn", extra = ["standard"] },
+]
+
+[[package]]
+name = "fastapi-cloud-cli"
+version = "0.20.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "detect-installer" },
+    { name = "fastar" },
+    { name = "httpx" },
+    { name = "pydantic", extra = ["email"] },
+    { name = "rich-toolkit" },
+    { name = "rignore" },
+    { name = "sentry-sdk" },
+    { name = "typer" },
+    { name = "uvicorn", extra = ["standard"] },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ad/bf/97d19633c6ec6fb0ef59df474b9705ea992f7b4f879208d0007ac6d25ab6/fastapi_cloud_cli-0.20.0.tar.gz", hash = "sha256:9681c46adcd299024d0775658bd5d88992fd35c4ad42b1f045c6df913390ba37", size = 85904, upload-time = "2026-06-11T17:41:02.814Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f8/6e/bbb2e1b8f3170b6426b707d49981a838fc1d5cbb428dd9a271f1c3951c23/fastapi_cloud_cli-0.20.0-py3-none-any.whl", hash = "sha256:dcbf071fc659ae2d3fb30e221a661c3fa240b7d5091203cf941face31f6d7860", size = 68793, upload-time = "2026-06-11T17:41:01.804Z" },
+]
+
 [[package]]
 name = "fastapi-users"
 version = "15.0.4"
@@ -1305,6 +1678,74 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/10/1c/24dc536af724f2e02ce8234dff8ab2ebc4091f3244321cea8ef3b79607ce/fastapi_users_db_sqlalchemy-5.0.0-py3-none-any.whl", hash = "sha256:7c9965555e94335d432f82f555b523809e1c37ed3ccd6ee1c7c9c0ae3240ea85", size = 6893, upload-time = "2023-02-13T16:13:59.863Z" },
 ]
 
+[[package]]
+name = "fastar"
+version = "0.11.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/03/0f/0aeb3fc50046617702acc0078b277b58367fd62eb727b9ec733ae0e8bbcc/fastar-0.11.0.tar.gz", hash = "sha256:aa7f100f7313c03fdb20f1385927ba95671071ba308ad0c1763fef295e1895ce", size = 70238, upload-time = "2026-04-13T17:11:17.143Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9b/4a/0d79fe52243a4130aa41d0a3a9eea22e00427db761e1a6782ee817c50222/fastar-0.11.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e7c906ad371ca365591ebcb7630009923f3eceb20956814494d15591a78e9e46", size = 709786, upload-time = "2026-04-13T17:09:53.974Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/e4/77c94eaafc035e39f5ce5176e32743da4e3fe890f28790e708e53d8f75cd/fastar-0.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6919497b35fa5bd978d2c26ee117cf1771b90ee5073f7518e44b9bc364b57715", size = 632127, upload-time = "2026-04-13T17:09:39.023Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/f6/97658dd992f4e45747d35adb24c0b100f6b6d451490685ae3fe8a3a2ee1b/fastar-0.11.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:56b50206aeedd99e22b83289e6fb3ff8f7d7da4407d2419902e4716b4f90585a", size = 869608, upload-time = "2026-04-13T17:09:08.268Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/fc/81c1ec4d8146a437399e7b95631b51be312f323a9ce64569f932db6c3914/fastar-0.11.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a1811a69ae81d469720df0c8af3f84f834a93b5e4f8be0e0e8bde6a52fa11f2", size = 762925, upload-time = "2026-04-13T17:07:52.788Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/35/49baf480ecb197aea7ce2515c503a2f25061958dd3b4c98e98a3a11cdcc7/fastar-0.11.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:10486238c55589a3947c38f9cfb88a67d8a608eb8dddc722038237d0278a41d7", size = 759913, upload-time = "2026-04-13T17:08:07.324Z" },
+    { url = "https://files.pythonhosted.org/packages/94/eb/946f1980267f2824efb7d7c518d47a49b89c0e9cd7c449301f5a7531558a/fastar-0.11.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1555ef9992d368a6ec39092276990cef8d329c39a1d86ebd847eaa3b10efd472", size = 926054, upload-time = "2026-04-13T17:08:22.196Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/19/d5eb611085ce054382570d8d4e24a5e2ff23cd6d2404528a6643841d6059/fastar-0.11.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b1f4aca0a9620b76988bbf6225cdea6678a392902444ca18bb8a51495b165a89", size = 818594, upload-time = "2026-04-13T17:08:52.366Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/52/18e8d55c0d3d917713f381cb2d0cb793da00c209c802e011d8dc72018cd5/fastar-0.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75beeecac7d11a666a6c4a0b7f7e80842ae5cf523f2f890b99c78fc82b403545", size = 823005, upload-time = "2026-04-13T17:09:23.051Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/b4/0fecdcf33e5aaffe777b96a1c10a3204fe0b05bf18e971033a0bfedafc1c/fastar-0.11.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:a08cdf5d16daa401c65c9c7493a18db7dc515c52155a17071ec7098bb07da9d3", size = 887115, upload-time = "2026-04-13T17:08:37.385Z" },
+    { url = "https://files.pythonhosted.org/packages/08/f8/2a6ad1c2523eb72a4595a9331162fc67ce0f0aee3348728598026c516986/fastar-0.11.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6e210375e5a7ba53586cbd6017aa417d2d2ceacbe8671682470281bd0a15e8ef", size = 973595, upload-time = "2026-04-13T17:10:09.258Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/a6/2aa48843228673feacc2b80876b8924e63ea9c5f5f607bd7a72416b86bae/fastar-0.11.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a2988eb2604b8e15670f355425e8c800e4dcd4edfbcbfe194397f8f17b7eb19e", size = 1036988, upload-time = "2026-04-13T17:10:26.133Z" },
+    { url = "https://files.pythonhosted.org/packages/92/ac/3dd14b21c323e8484f47c910110d1d93139ba44621ac2c4c597dbe9fcdb7/fastar-0.11.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:34abc857b46068fdf91d157bd0203bfd6791dc7a432d1ed180f5af6c2f5bcce9", size = 1078267, upload-time = "2026-04-13T17:10:43.645Z" },
+    { url = "https://files.pythonhosted.org/packages/de/a1/3f89e58d6fa99160c9e7e17220c8ab5040b5cc017c4fac2356c6ed18453d/fastar-0.11.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0d884be84e37a01053776395441fc960031974e0265801ce574efc3d05e0cdaf", size = 1032551, upload-time = "2026-04-13T17:11:00.667Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/ea/24dd3cfc2096933d7d2a80c926e79602cff1fa481124ed2165b60c1dd9ef/fastar-0.11.0-cp310-cp310-win32.whl", hash = "sha256:c721c1ad758e3e4c2c1fd9e96911a0fa58c0a6be5668f1bcfd0b741e72c7cb63", size = 456022, upload-time = "2026-04-13T17:11:41.859Z" },
+    { url = "https://files.pythonhosted.org/packages/82/ef/6eb39ee9cdd59822d1c7337c4d28fdc948885bdf455af9e70efa9879e06f/fastar-0.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:ba4180b7c3080f55f9035fdd7d8c39fe0e1485087a68ff615bb4784a10b8106b", size = 488392, upload-time = "2026-04-13T17:11:27.486Z" },
+    { url = "https://files.pythonhosted.org/packages/11/7a/fb367bdaf4efa2c7952a45aeab2e87a564293ecffe150af673ec8edfda46/fastar-0.11.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:b82fd6f996e65a86f67a6bd64dd22ef3e8ae2dcaed0ae3b550e71f7e1bbb1df5", size = 709869, upload-time = "2026-04-13T17:09:55.62Z" },
+    { url = "https://files.pythonhosted.org/packages/80/ff/b87efb0dcfd081c62c7c7601d7681dabe63103cd51fc16f8d57a1ab45961/fastar-0.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27eed386fd0558e6daa29211111bbd7b740f7c7e881197f8a00ac7c0f3cdb1d7", size = 631668, upload-time = "2026-04-13T17:09:40.537Z" },
+    { url = "https://files.pythonhosted.org/packages/24/7c/0ed6dd38b9adc04b3a8ec3b7045908e7c2170ba0ff6e6d2c51bc9fc770f3/fastar-0.11.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a6931bebc1d8e95ddeef55732c195449e6b44ef33aa31b325505097ed3b4d6aa", size = 869663, upload-time = "2026-04-13T17:09:09.78Z" },
+    { url = "https://files.pythonhosted.org/packages/58/ce/8b7fb3f23855accebaaf2d2637eac7f261a7a5d936f861a172079f1ef511/fastar-0.11.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f72ce42a5e28a74fbd4d5fbf1a3ac1a1163d13cbc200cbd005fb0fabc54bd", size = 762938, upload-time = "2026-04-13T17:07:54.51Z" },
+    { url = "https://files.pythonhosted.org/packages/07/cc/5491e2b677bb841f768e3aba052d0344338a5c78aa5d4c18b443831a8e8d/fastar-0.11.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5b83c1f61f7017d6e1498568038f8745440cfc16ca2f697ec81bac83050108f6", size = 759232, upload-time = "2026-04-13T17:08:08.864Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/b7/643630bdbd179e41e9fae31c03b4cf6061dbf4d6fbbae8425d16eb12545d/fastar-0.11.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db73a9b765a516e73983b25341e7b5e0189733878279e278b2295131b0e3a21e", size = 926271, upload-time = "2026-04-13T17:08:23.68Z" },
+    { url = "https://files.pythonhosted.org/packages/09/5d/37ade50003b4540e0a53ef100f6692d7ab2ac1122d5acf39920cc09a3e8b/fastar-0.11.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:625827d52eb4e8fec942e0233f125ff8010fcf6a67c0a974a8e5f4666b771e3c", size = 818634, upload-time = "2026-04-13T17:08:54.268Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/ff/135d177de32cc1e837c99019e4643e6e79352bde49544d4ece5b5eebf56b/fastar-0.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7f5fd8fa21ec0a88296a38dc5d7fc35efd3b26d46a17b8b7c73c5563925ca15", size = 822755, upload-time = "2026-04-13T17:09:25.01Z" },
+    { url = "https://files.pythonhosted.org/packages/27/cb/b835dbe76ceac7fa6105851468c259ffd06830eb9c029402e499d0ec153b/fastar-0.11.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:8c15af91b8cd87ddf23ea55355ae513c1de3ab67178f26dad017c9e9c0af6096", size = 887101, upload-time = "2026-04-13T17:08:39.248Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/54/aa8289eb57fc550535470397cb051f5a58a7c89ca4de31d5502b916dd894/fastar-0.11.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:03a112395a8b0bff251423bd1564c012f0cc058ad8b6bd8fba96f3d7fc117e44", size = 973606, upload-time = "2026-04-13T17:10:10.98Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/fd/776d50a0897c01dc6bfd0926772ee913436fdae91b9affaf0a0cbd09f0a1/fastar-0.11.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f2994bb8f5f8c11eb12beae1e6e77a907173c9819236b8a4c8f0573652ceccce", size = 1036696, upload-time = "2026-04-13T17:10:28.502Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/f1/cf0f9b499fb37ac065c8a01ec642f96a3c5eb849c38ae983b59f3b3245e0/fastar-0.11.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:dcf99e4b5973d842c7f19c776c3a83cdc0977d505edce6206438505c0456b517", size = 1078182, upload-time = "2026-04-13T17:10:45.318Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/9e/21e4701aec4a1123d4dc4d31578dc18875582b5710e4725f7ceb752a248b/fastar-0.11.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29c9c386dc0d5dda78845a8e6b1480d26ab861c1e0b68f42ae5735cb70ca07f1", size = 1032336, upload-time = "2026-04-13T17:11:02.364Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/e2/5872b28c72c27ec1a00760eace6ff35f714f41ebbd5208cf016b12e29250/fastar-0.11.0-cp311-cp311-win32.whl", hash = "sha256:030b2580fc394f2c9b7890b6735810404e9b9ed5e0344db150b945965b5482b7", size = 457368, upload-time = "2026-04-13T17:11:43.528Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/6e/ce6832a16193eb4466f4108be8809c249b51cb1f89dd7894545700d079d5/fastar-0.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:83ab57ae067969cd0b483ac3b6dccc4b595fc77f5c820760998648d4c42822b5", size = 488605, upload-time = "2026-04-13T17:11:29.161Z" },
+    { url = "https://files.pythonhosted.org/packages/15/5a/9cfb80661cf38fd7b0889224beb7d2746784d4ade2a931ed9775a18d8602/fastar-0.11.0-cp311-cp311-win_arm64.whl", hash = "sha256:27b1a4cee2298b704de8151d310462ee7335ed036011ca9aa6e784b30b6c73a9", size = 464580, upload-time = "2026-04-13T17:11:18.583Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/06/a5773706afc8bd496769786590bbc56d2d0ee419a299cc12ea3f5717fcf3/fastar-0.11.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3c51f1c2cdddbd1420d2897ace7738e36c65e17f6ae84e0bfe763f8d1068bb97", size = 708394, upload-time = "2026-04-13T17:09:57.269Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/a6/d5e2a4e48495616440a21eed07558219ca90243ad00b0502586f95bd4833/fastar-0.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0d9d6b052baf5380baea866675dab6ccd04ec2460d12b1c46f10ce3f4ee6a820", size = 628417, upload-time = "2026-04-13T17:09:42.145Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/69/9816d69ac8265c9e50456637a487ccfb7a9c566efd9dbcd673df9c2558c2/fastar-0.11.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bd2f05666d4df7e14885b5c38fefd92a785917387513d33d837ff42ec143a22f", size = 863950, upload-time = "2026-04-13T17:09:11.506Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/0d/f88daad53aff2e754b6b5ff2a7113f72447a34f6ef17cc23ca99988117b7/fastar-0.11.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1e6e74aba1ae77ca4aedcaf1697cd413319f4c88a5ccbe5b42c709517c5097e", size = 760737, upload-time = "2026-04-13T17:07:55.958Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/a6/82ef4ecd969d50d92ed3ed9dbd8fe77faa24be5e5736f716edc9f4ce8d62/fastar-0.11.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38ef77fe940bbc9b37a98bd838727f844b11731cd39358a2640ff864fb385086", size = 757603, upload-time = "2026-04-13T17:08:10.623Z" },
+    { url = "https://files.pythonhosted.org/packages/03/35/50249f0d827251f8ac511495e2eacccebda80a00a0ad73e9615b8113b84f/fastar-0.11.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8955e61b32d6aff82c983217abf80933fd823b0e727586fc72f08043d996fd59", size = 923952, upload-time = "2026-04-13T17:08:25.526Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/d8/faee41659e9c379d906d24eaee6d6833ac8cfef0a5df480e5c2a8d3efb33/fastar-0.11.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:483532442cdb08fbff0169510224eae0836f2f672cea6aacb52847d90fefdc46", size = 816574, upload-time = "2026-04-13T17:08:56.076Z" },
+    { url = "https://files.pythonhosted.org/packages/22/47/0448ea7992b997dad2bf004bfd98eca74b5858630eae080b50c7b17d9ddc/fastar-0.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef5a6071121e05d8287fc75bccb054bcbac8bb0501200a0c0a8feeace5303ea4", size = 819382, upload-time = "2026-04-13T17:09:26.66Z" },
+    { url = "https://files.pythonhosted.org/packages/33/ef/0d63eb43586831b7a6f8b22c4d77125a7c594423af1f4f090fa9541b9b40/fastar-0.11.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:e45e598af5afe8412197d4786efd6cf29be02e7d3d4f6a3461149eae5d7e94f1", size = 885254, upload-time = "2026-04-13T17:08:40.9Z" },
+    { url = "https://files.pythonhosted.org/packages/01/25/edd584675d69e49a165052c3ee886df1c5d574f3e7d813c990306387c623/fastar-0.11.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2e160919b1c47ddb8538e7e8eb4cd527281b40f0bf75110a75993838ef61f286", size = 971239, upload-time = "2026-04-13T17:10:12.997Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/37/e8bb24f506ba2b08fbaf36c5800e843bd4d542954e9331f00418e2d23349/fastar-0.11.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:4bb4dc0fc8f7a6807febcebce8a2f3626ba4955a9263d81ecc630aad83be84c0", size = 1035185, upload-time = "2026-04-13T17:10:30.207Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/bf/be753736296338149ee4cb3e92e2b5423d6ba17c7b951d15218fd7e99bbf/fastar-0.11.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4ec95af56aa173f6e320e1183001bf108ba59beaf13edd1fc8200648db203588", size = 1072191, upload-time = "2026-04-13T17:10:47.072Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/cd/a81c1aaafb5a22ce57c98ae22f39c89413ed53e4ee6e1b1444b0bd666a6c/fastar-0.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:136cf342735464091c39dc3708168f9fdeb9ebea40b1ead937c61afaf46143d9", size = 1028054, upload-time = "2026-04-13T17:11:04.293Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/88/1ce4eed3d70627c95f49ca017f6bbbf2ddcc4b0c601d293259de7689bc20/fastar-0.11.0-cp312-cp312-win32.whl", hash = "sha256:35f23c11b556cc4d3704587faacbc0037f7bdf6c4525cd1d09c70bda4b1c6809", size = 454198, upload-time = "2026-04-13T17:11:45.168Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/1d/26ce92f4331cd61a69840db9ca6115829805eec24f285481a854f578e917/fastar-0.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:920bc56c3c0b8a8ca492904941d1883c1c947c858cd93343356c29122a38f44c", size = 486697, upload-time = "2026-04-13T17:11:31.084Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/96/e6eda4480559c69b05d466e7b5ea9170e81fef3795a73e059959a3258319/fastar-0.11.0-cp312-cp312-win_arm64.whl", hash = "sha256:395248faf89e8a6bd5dc1fd544c8465113b627cb6d7c8b296796b60ebea33593", size = 462591, upload-time = "2026-04-13T17:11:20.577Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/5c/9bbeffbf1905391446dd98aa520422ce7affde5c9a7c22d757cc5d7c1397/fastar-0.11.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:1266d6a004f427b0d61bd6c7b544d84cc964691b2232c2f4d635a1b75f2f6d5e", size = 711644, upload-time = "2026-04-13T17:10:07.663Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/af/ae5cf39d4fb82d0c592705f5ec6db1b065be5265c151b108f86126ee8773/fastar-0.11.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:298a827ec04ade43733f6ca960d0faec38706aa1494175869ea7ea17f5bad5d3", size = 634371, upload-time = "2026-04-13T17:09:52.083Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/36/8d4569e26473c72ccb02d1c5df3ed710073f1c06eca09c26d52ea79fd815/fastar-0.11.0-pp311-pypy311_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8800e2387e463a0e5799416a1cbe72dd0fde7270a20e4bde684145e7878f6516", size = 870850, upload-time = "2026-04-13T17:09:21.439Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/46/724dc796e1756d3977970f820d30d59bb8cab8e3671b285f1d82ab513aec/fastar-0.11.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7496def0a2befd82d429cb004ef7ca831585cc887947bd6b9abb68a5ef852b0b", size = 764469, upload-time = "2026-04-13T17:08:05.638Z" },
+    { url = "https://files.pythonhosted.org/packages/99/e3/74d6859e632e8fb9339a14f652fb9f800c2bd6aa53071e311c0be3fbab8b/fastar-0.11.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:878eaf15463eb572e3538af7ca3a8534e5e279cf8196db902d24e5725c4af86e", size = 761375, upload-time = "2026-04-13T17:08:20.669Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/e7/cc70e2be5ef8731a7525552b1c35c1448cf9eae6a62cb3a56f12c1bf27ea/fastar-0.11.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0324ed1d1ef0186e1bbd843b17807d6d837d0906899d4c99378b02c5d86bdd9c", size = 928189, upload-time = "2026-04-13T17:08:35.663Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/33/c9a969e78dca323547276a6fee5f4f9588f7cd5ab45acec3778c67399589/fastar-0.11.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bdf9bd863205590beaf8ef6e66f315310196632180dceaf674985d01a876cac3", size = 820864, upload-time = "2026-04-13T17:09:06.366Z" },
+    { url = "https://files.pythonhosted.org/packages/84/bd/6b9434b541fe55c125b5f2e017a565596a2d215aa09207e4555e4585064f/fastar-0.11.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59af8dbb683b24b90fb5b506de080faeab0a17a908e6c2a5d93a97260ed75d7b", size = 824060, upload-time = "2026-04-13T17:09:37.377Z" },
+    { url = "https://files.pythonhosted.org/packages/24/8d/871d5f8cf4c6f13987119fb0a9ae8be131e34f2756c2524e9974adf33824/fastar-0.11.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:9f3df73a3c4292cfe15696cdf59cdb6c309ab59d30b34c733be13c6e32d9a264", size = 889217, upload-time = "2026-04-13T17:08:50.884Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/26/cca0fd2704f3ed20165e5613ed911549aef3aaf3b0b5b02fee0e8e23e6cc/fastar-0.11.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:aa3762cbb16e41a76b61f4a6914937a71aab3a7b6c2d82ca233bc686ebaf756b", size = 975418, upload-time = "2026-04-13T17:10:24.307Z" },
+    { url = "https://files.pythonhosted.org/packages/99/94/8bbb0b13f5b6cbe2492f0b7cbba5103e6163976a3331466d010e781fa189/fastar-0.11.0-pp311-pypy311_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:a8c7bc8ac74cb359bb546b199288c83236372d094b402e557c197e85527495cd", size = 1038492, upload-time = "2026-04-13T17:10:41.939Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/d3/5b7df222a30eac2822ffd00f82fd4c2ce84fba4b369d1e1a03732fd177fc/fastar-0.11.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:587cbd060a2699c5f66281081395bb4657b2b1e0eef5c206b1aabf740019d670", size = 1080210, upload-time = "2026-04-13T17:10:58.462Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/6d/56ef943ea524784598c035ccbd42e564e937da0438ae3f55f0e76cb95571/fastar-0.11.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6a1c56957ac82408be37a3f63594bc83e0919e8760492a4475e542f9f1828778", size = 1034886, upload-time = "2026-04-13T17:11:15.617Z" },
+]
+
 [[package]]
 name = "filelock"
 version = "3.20.1"
@@ -1314,6 +1755,39 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e3/7f/a1a97644e39e7316d850784c642093c99df1290a460df4ede27659056834/filelock-3.20.1-py3-none-any.whl", hash = "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", size = 16666, upload-time = "2025-12-15T23:54:26.874Z" },
 ]
 
+[[package]]
+name = "filetype"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020, upload-time = "2022-11-02T17:34:04.141Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
+]
+
+[[package]]
+name = "flashinfer-python"
+version = "0.5.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "apache-tvm-ffi" },
+    { name = "click" },
+    { name = "einops" },
+    { name = "ninja" },
+    { name = "numpy" },
+    { name = "nvidia-cudnn-frontend" },
+    { name = "nvidia-cutlass-dsl" },
+    { name = "nvidia-ml-py" },
+    { name = "packaging" },
+    { name = "requests" },
+    { name = "tabulate" },
+    { name = "torch" },
+    { name = "tqdm" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b4/91/cca69baeff24bb3efd12c7479a026432c8717ee47193694010494c528b22/flashinfer_python-0.5.3.tar.gz", hash = "sha256:100d59b0ede47878d2808cd3a1b9039d7a952d66338bc9f68dac192ae1b2e3f1", size = 4682367, upload-time = "2025-11-20T21:22:46.976Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/76/78/6dc7e7da8cb87c9965644ea0d2439457a1bc9256c45ceda0044595be4143/flashinfer_python-0.5.3-py3-none-any.whl", hash = "sha256:b601293b72f9138bad173edc28df84b9f239a013be974e2e79d4ba98aeb38cf5", size = 6998069, upload-time = "2025-11-20T21:22:45.104Z" },
+]
+
 [[package]]
 name = "flask"
 version = "3.1.2"
@@ -1550,6 +2024,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4e/c5/8d2e1608644018232c77bf8d1e15525c307417a9cdefa3ed467aa9b39c04/geventhttpclient-2.3.7-cp312-cp312-win_amd64.whl", hash = "sha256:80199b556a6e226283a909a82090ed22408aa0572c8bfaa5d3c90aafa5df0a8b", size = 49008, upload-time = "2025-12-07T19:48:12.653Z" },
 ]
 
+[[package]]
+name = "gguf"
+version = "0.19.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tqdm" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/48/ae/17f1308ae45cd7b08ebb521747d5b23f4efc4d172038a4e228dd5106c3ff/gguf-0.19.0.tar.gz", hash = "sha256:dbadcd6cc7ccd44256f2229fe7c2dff5e8aa5cf0612ab987fd2b1a57e428923f", size = 111220, upload-time = "2026-05-06T13:04:03.667Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/bb/d71d6da82763528c2c2ed6b59a9d6142c6595545a4c448e2085d155e88c2/gguf-0.19.0-py3-none-any.whl", hash = "sha256:70bcd10edfe697fb2dad6e40af2234b9d8ece9a41a99761405121ebda1c3c1cd", size = 118475, upload-time = "2026-05-06T13:04:02.588Z" },
+]
+
 [[package]]
 name = "gitdb"
 version = "4.0.12"
@@ -1679,11 +2168,11 @@ wheels = [
 
 [[package]]
 name = "h11"
-version = "0.14.0"
+version = "0.16.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/f5/38/3af3d3633a34a3316095b39c8e8fb4853a28a536e55d347bd8d8e9a14b03/h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", size = 100418, upload-time = "2022-09-25T15:40:01.519Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259, upload-time = "2022-09-25T15:39:59.68Z" },
+    { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
 ]
 
 [[package]]
@@ -1703,15 +2192,44 @@ wheels = [
 
 [[package]]
 name = "httpcore"
-version = "1.0.8"
+version = "1.0.9"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "certifi" },
     { name = "h11" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/9f/45/ad3e1b4d448f22c0cff4f5692f5ed0666658578e358b8d58a19846048059/httpcore-1.0.8.tar.gz", hash = "sha256:86e94505ed24ea06514883fd44d2bc02d90e77e7979c8eb71b90f41d364a1bad", size = 85385, upload-time = "2025-04-11T14:42:46.661Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
+]
+
+[[package]]
+name = "httptools"
+version = "0.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/e5/d471fcb0e14523fe1c3f4ba58ca52480e7bd70ad7109a3846bc75892f7fb/httptools-0.8.0.tar.gz", hash = "sha256:6b2a32f18d97e16e90827d7a819ffa8dbd8cc245fc4e1fa9d1095b54ef4bd999", size = 271342, upload-time = "2026-05-25T22:17:48.841Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/18/8d/f052b1e336bb2c1fc7ed1aaed898aa570c0b61a09707b108979d9fc6e308/httpcore-1.0.8-py3-none-any.whl", hash = "sha256:5254cf149bcb5f75e9d1b2b9f729ea4a4b883d1ad7379fc632b727cec23674be", size = 78732, upload-time = "2025-04-11T14:42:44.896Z" },
+    { url = "https://files.pythonhosted.org/packages/40/b9/be66eb0decd730d89b9c94f930e4b8d87787b05724bb84af98bfd825f72c/httptools-0.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bf3b6f807c8541503cecfbb8a8dffb385640d0d96102f3d112aa8740f9b7c826", size = 208805, upload-time = "2026-05-25T22:16:50.434Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/f7/b4d41eaae2869d31356bc4bbf546f44fae83ff298af0a043ca0625b06773/httptools-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:da684f2e1aa2ee9bdcb083f3f3a68c5956750b375bc5df864d3a5f0c42a40b77", size = 113527, upload-time = "2026-05-25T22:16:51.672Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/e4/77487e14fc7be47180fd0eb4267c7486d0cc59b74031839a3daf8650136b/httptools-0.8.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a6f21e2a3b0067bbe7f67e34cfd16276af556e5e52f4c7503be0cb5f90e905e4", size = 450035, upload-time = "2026-05-25T22:16:53.313Z" },
+    { url = "https://files.pythonhosted.org/packages/da/72/5a8f787e323f56fbd86c32a4be92a86776e4cfe8b4317db999f452028362/httptools-0.8.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea897f0c729581ebf72131a438a7932d9b14efef72d75ada966700cac3caaeb", size = 451101, upload-time = "2026-05-25T22:16:54.696Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/41/b44a25560955197674b6744cb903664300e239235a5eaa69df0890d87054/httptools-0.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c0d726cc107fceb7d45f978483b4b70dd8caa836f5914d3434bb18628eb73813", size = 436140, upload-time = "2026-05-25T22:16:56.239Z" },
+    { url = "https://files.pythonhosted.org/packages/74/b0/054aac84c03d7e097bf4c605fb7e74eec3d65c0276adf64ee97f3a103ff5/httptools-0.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9878eb2785ba5eb70631ad269b37976f73d647955e26c91d490eb8a4edfda4ba", size = 437041, upload-time = "2026-05-25T22:16:57.716Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/e8/86b85bbc0ac7892232f1a99ab96a9aa71936984fa06adfc0afc83ca7789e/httptools-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:b205e5f5523fa039679da0dfe5a10132b2a4abeae6a86fdd1ddc035f7f836557", size = 90454, upload-time = "2026-05-25T22:16:58.871Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/d2/c3eedaef57de65c3cc5f8dc244cf12d09c84ad258a479055aad6db23206c/httptools-0.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ed377e64805bdba4943c82717333f8f8603a13b09aff9cead2717c6c817fb168", size = 208428, upload-time = "2026-05-25T22:16:59.717Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/94/dfe435d90d0ef61ec0f2cc3d480eef78c59727c6c2ce039f433882f6131a/httptools-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9518c406d7b310f05adb1a37f80acabac40504a575d7c0da6d3e365c695ac20d", size = 113366, upload-time = "2026-05-25T22:17:00.795Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/d4/13025f1a56e615dcb331e0bbe2d9a1143212b58c263385fc5d2e558f5bac/httptools-0.8.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:57278e6fa0424c42a8a3e454828ab4f0aff27b40cddf9679579b98c6dce6a376", size = 464676, upload-time = "2026-05-25T22:17:02.014Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/95/4c1c26c0b985f8a3331682d802598f14e32dc41bf7509266eb2c04ad4801/httptools-0.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbb8caadb2b742d293169d2b458b5c001ef70e3158704aa3d3ef9597624c5d1d", size = 464235, upload-time = "2026-05-25T22:17:03.109Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/82/6735be2b0ca527718c431cdb8e5f70c3862c0844a687df0f572c51e11497/httptools-0.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:52dd695b865fe96d9d2b16b64a895f3f57bf3cb064e8383cd3b5713a069e8085", size = 449809, upload-time = "2026-05-25T22:17:04.443Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/f9/5811c74f37a758c8a4aa3dc430375119d335947e883efc4664d8f3559a41/httptools-0.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:20b4aac66ff65f7db06a375808b78f42a94970aa22e826b3cb2b43eb09174124", size = 452174, upload-time = "2026-05-25T22:17:05.476Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/94/97b75870dea07b71e3ec535cebe525b08d723152e4c7d13fa887e51f4de2/httptools-0.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:a1b4c8e7a489a0d750d91894e9a8cdc295838f1924c0ca903ae993456fddec07", size = 90991, upload-time = "2026-05-25T22:17:06.75Z" },
+    { url = "https://files.pythonhosted.org/packages/14/88/1d21a36da8f5cb0fa49eafd4b169eba5608d57e75bbcf61845cbc6243216/httptools-0.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:880490234c10f70a9830743097e8958d6e4b9f5a0ffc24515023afeef984054d", size = 208247, upload-time = "2026-05-25T22:17:07.843Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/42/cc4feea2945cb3051038f090c9b36bd5b8a9d7f5a894a506a8983e33fd1c/httptools-0.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5931891fb7b441b8a3853cf1b85c82c903defce084dd5f6771ca46e31bf862c5", size = 113064, upload-time = "2026-05-25T22:17:09.136Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/a6/febbb8b8db0f58b38e44ad6cb946e6a255ae49b55f2e8543408fb7501ccd/httptools-0.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b15fc622b0f869d19207c4089a501d9bcc63ca5e071ffdd2f03f922df882dcb2", size = 523851, upload-time = "2026-05-25T22:17:10.106Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/e4/f90a0df0b83beff265b7e3b65f2a4cefd95792d4be0ac3e16049f2acd3c2/httptools-0.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:425f83884fd6343828d8c565f046cb72b6d19063f6924093e11bcd8e1548cd09", size = 518842, upload-time = "2026-05-25T22:17:11.218Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/2d/0c9ac76dd2c893841fbf6498d6acec4f2442e1b7067f6e3e316a80e494e8/httptools-0.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ef7c3c97f4311c7be57e2986629df89d49cb434dbff78eafcd48c2bff986b15a", size = 501238, upload-time = "2026-05-25T22:17:12.728Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/42/906adc91ae3a5fa9c59c0a2f21c139725bd7e5b41ae6acd485cd14123ebf/httptools-0.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a1afd7c9fbff0d9f5d489c4ce2768bd09c84a46ddefc7161e6aa82ae35c85745", size = 509567, upload-time = "2026-05-25T22:17:13.842Z" },
+    { url = "https://files.pythonhosted.org/packages/05/0b/4240efeb672751ee5b9b380cb0e3fdc050bc05f68adc7a8aefc4fcd9a69a/httptools-0.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:cd96f29b4bab1d42fa6e3d008711c75e0f79e94e06827330160e3a304227f150", size = 90918, upload-time = "2026-05-25T22:17:15.155Z" },
 ]
 
 [[package]]
@@ -1806,6 +2324,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
 ]
 
+[[package]]
+name = "interegular"
+version = "0.3.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/dc/9d/8b6dde58a028a3962ce17e84d5fe73758df61378e00ef8ac3d85da34b0ff/interegular-0.3.3.tar.gz", hash = "sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600", size = 24705, upload-time = "2024-01-06T23:01:22.372Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c4/01/72d6472f80651673716d1deda2a5bbb633e563ecf94f4479da5519d69d25/interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c", size = 23635, upload-time = "2024-01-06T23:01:20.829Z" },
+]
+
 [[package]]
 name = "itsdangerous"
 version = "2.2.0"
@@ -1899,6 +2426,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" },
 ]
 
+[[package]]
+name = "jsonpatch"
+version = "1.33"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "jsonpointer", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/78/18813351fe5d63acad16aec57f94ec2b70a09e53ca98145589e185423873/jsonpatch-1.33.tar.gz", hash = "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c", size = 21699, upload-time = "2023-06-26T12:07:29.144Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/07/02e16ed01e04a374e644b575638ec7987ae846d25ad97bcc9945a3ee4b0e/jsonpatch-1.33-py2.py3-none-any.whl", hash = "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade", size = 12898, upload-time = "2023-06-16T21:01:28.466Z" },
+]
+
+[[package]]
+name = "jsonpointer"
+version = "3.1.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/18/c7/af399a2e7a67fd18d63c40c5e62d3af4e67b836a2107468b6a5ea24c4304/jsonpointer-3.1.1.tar.gz", hash = "sha256:0b801c7db33a904024f6004d526dcc53bbb8a4a0f4e32bfd10beadf60adf1900", size = 9068, upload-time = "2026-03-23T22:32:32.458Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9e/6a/a83720e953b1682d2d109d3c2dbb0bc9bf28cc1cbc205be4ef4be5da709d/jsonpointer-3.1.1-py3-none-any.whl", hash = "sha256:8ff8b95779d071ba472cf5bc913028df06031797532f08a7d5b602d8b2a488ca", size = 7659, upload-time = "2026-03-23T22:32:31.568Z" },
+]
+
 [[package]]
 name = "jsonschema"
 version = "4.25.1"
@@ -1998,6 +2546,82 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/da/e9/0d4add7873a73e462aeb45c036a2dead2562b825aa46ba326727b3f31016/kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1", size = 73929, upload-time = "2025-08-10T21:27:48.236Z" },
 ]
 
+[[package]]
+name = "langchain-core"
+version = "1.2.31"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "jsonpatch", marker = "python_full_version >= '3.11'" },
+    { name = "langsmith", marker = "python_full_version >= '3.11'" },
+    { name = "packaging", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11'" },
+    { name = "tenacity", marker = "python_full_version >= '3.11'" },
+    { name = "typing-extensions", marker = "python_full_version >= '3.11'" },
+    { name = "uuid-utils", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a1/5a/7523ff55668a233beef7e909e8e2074a1cc3b620e0bbf0a4ec5f38549b3b/langchain_core-1.2.31.tar.gz", hash = "sha256:aad3ecc9e4dce2dd2bb79526c81b92e5322fd81db7834a031cb80359f2e3ebaa", size = 850756, upload-time = "2026-04-16T13:26:29.241Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/52/02/668ddf4f1cf963ad691bdbea672a85244e6271eb0a4acfaf662bbd94a3b1/langchain_core-1.2.31-py3-none-any.whl", hash = "sha256:c407193edb99311cc36ec3e4d3667a065bbc4d7d72fbb6e368538b9b134d4033", size = 513264, upload-time = "2026-04-16T13:26:27.566Z" },
+]
+
+[[package]]
+name = "langchain-nvidia-ai-endpoints"
+version = "1.0.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohttp", marker = "python_full_version >= '3.11'" },
+    { name = "filetype", marker = "python_full_version >= '3.11'" },
+    { name = "langchain-core", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8d/2e/0b3e6ec5df7426e3ab19c8dfedd0b4a9e97461a6a536e02f6429618664ec/langchain_nvidia_ai_endpoints-1.0.4.tar.gz", hash = "sha256:831decd67e94f104bc2fecc596ef2953ea30e7adc1c3b99bd35861e018dd1fb2", size = 46600, upload-time = "2026-02-13T17:17:56.135Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c8/3e/a711094b31777ac4a7993507b8a3e0a45307cbab94425b5eba012a49c0cd/langchain_nvidia_ai_endpoints-1.0.4-py3-none-any.whl", hash = "sha256:49018362fca9c951488dffcf3e1372365778946e2a3b87ff7d769589e7b3c497", size = 50173, upload-time = "2026-02-13T17:17:54.759Z" },
+]
+
+[[package]]
+name = "langchain-openai"
+version = "1.1.14"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "langchain-core", marker = "python_full_version >= '3.11'" },
+    { name = "openai", version = "2.43.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "tiktoken", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8e/f5/b1a56f703fb90952b07ff9fb5507123a39df1267d62a7f2bb821c5dbb628/langchain_openai-1.1.14.tar.gz", hash = "sha256:71b4262932fabe506ce79c175dbc956cc48f24d81e20b27662df493147750643", size = 1115195, upload-time = "2026-04-16T14:55:24.696Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0b/fa/8c33befbc0cf81b21371cc1dab4e7bf94a80b8116194f263a5021ec02529/langchain_openai-1.1.14-py3-none-any.whl", hash = "sha256:cb525d2011f9813fc15a7dcfd4bca5b87badcbcb2c113a7fbe45d1b8a1bbb69c", size = 88705, upload-time = "2026-04-16T14:55:23.159Z" },
+]
+
+[[package]]
+name = "langsmith"
+version = "0.8.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx", marker = "python_full_version >= '3.11'" },
+    { name = "orjson", marker = "python_full_version >= '3.11' and platform_python_implementation != 'PyPy'" },
+    { name = "packaging", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "requests", marker = "python_full_version >= '3.11'" },
+    { name = "requests-toolbelt", marker = "python_full_version >= '3.11'" },
+    { name = "uuid-utils", marker = "python_full_version >= '3.11'" },
+    { name = "xxhash", marker = "python_full_version >= '3.11'" },
+    { name = "zstandard", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/17/eb/8883d1158c743d0aac350f09df7880714d27283497e8c80bb9fe3480f165/langsmith-0.8.5.tar.gz", hash = "sha256:3615243d99c12f4047f13042bdc05a373dce232d106a6511b3ca7b48c5af1c2c", size = 4462348, upload-time = "2026-05-15T21:31:41.093Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/23/85/968c88a63e32a59b3e5c68afd2fe114ce0708a125db0be1a85efc25fb2ea/langsmith-0.8.5-py3-none-any.whl", hash = "sha256:efc779f9d450dcaf9d97bc8894f4926276509d6e730e05289af9a64debce06ae", size = 399564, upload-time = "2026-05-15T21:31:39.046Z" },
+]
+
+[[package]]
+name = "lark"
+version = "1.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/af/60/bc7622aefb2aee1c0b4ba23c1446d3e30225c8770b38d7aedbfb65ca9d5a/lark-1.2.2.tar.gz", hash = "sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80", size = 252132, upload-time = "2024-08-13T19:49:00.652Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2d/00/d90b10b962b4277f5e64a78b6609968859ff86889f5b898c1a778c06ec00/lark-1.2.2-py3-none-any.whl", hash = "sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c", size = 111036, upload-time = "2024-08-13T19:48:58.603Z" },
+]
+
 [[package]]
 name = "limits"
 version = "5.6.0"
@@ -2012,6 +2636,57 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/40/96/4fcd44aed47b8fcc457653b12915fcad192cd646510ef3f29fd216f4b0ab/limits-5.6.0-py3-none-any.whl", hash = "sha256:b585c2104274528536a5b68864ec3835602b3c4a802cd6aa0b07419798394021", size = 60604, upload-time = "2025-09-29T17:15:18.419Z" },
 ]
 
+[[package]]
+name = "llguidance"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/95/48/3f7a9d3ff1b36bba92b5107a3a21286821227afe9ea464736133994d61fb/llguidance-1.3.0.tar.gz", hash = "sha256:861249afd51dc325646834462ea827e57a5c2b2042e108e6aae7059fdad9104d", size = 1070460, upload-time = "2025-10-20T19:58:44.164Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3b/33/be5acb85cd8cdc4afde33d9c234eece9f318e087920255af3c05864cd3e7/llguidance-1.3.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f7685222660a762e481ac633d49cc559c64980fe2ee59c8f932a5bb5cbc0c2c2", size = 3220647, upload-time = "2025-10-20T19:58:42.542Z" },
+    { url = "https://files.pythonhosted.org/packages/82/e6/b48bda5b15efeaeb62bd0dba8fc6a01d4ae5457a85dbb5d18632385fe15c/llguidance-1.3.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:098030ff0687261a3f1bd54cf21fe951fc861d56d37a0671250dd36677eaf224", size = 3099830, upload-time = "2025-10-20T19:58:40.826Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/11/44389d3d1526d7a5c38ffd587a5ebc61d7bee443ac1dea95f2089ad58f5f/llguidance-1.3.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f6caca5d78db7f76e1fbb0fff8607b861c32d47fa3d5dee2fc49de27ee269df", size = 2835242, upload-time = "2025-10-20T19:58:34.518Z" },
+    { url = "https://files.pythonhosted.org/packages/83/a8/1ff2bedb8f9acb46a2d2d603415d272bb622c142ea86f5b95445cc6e366c/llguidance-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc17e9dd602c3879bf91664a64bf72f54c74dbfbeb24ccfab6a5fe435b12f7aa", size = 3033133, upload-time = "2025-10-20T19:58:38.721Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/7e/809349638231f469b9056c0e1bfd924d5ef5558b3b3ec72d093b6fad33b1/llguidance-1.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:1d1cd1c8618d1a13605d3e057c978651e551c8c469b481ee4041f1d6c436002d", size = 2789946, upload-time = "2025-10-20T19:58:45.958Z" },
+]
+
+[[package]]
+name = "llvmlite"
+version = "0.44.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/89/6a/95a3d3610d5c75293d5dbbb2a76480d5d4eeba641557b69fe90af6c5b84e/llvmlite-0.44.0.tar.gz", hash = "sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4", size = 171880, upload-time = "2025-01-20T11:14:41.342Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/41/75/d4863ddfd8ab5f6e70f4504cf8cc37f4e986ec6910f4ef8502bb7d3c1c71/llvmlite-0.44.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:9fbadbfba8422123bab5535b293da1cf72f9f478a65645ecd73e781f962ca614", size = 28132306, upload-time = "2025-01-20T11:12:18.634Z" },
+    { url = "https://files.pythonhosted.org/packages/37/d9/6e8943e1515d2f1003e8278819ec03e4e653e2eeb71e4d00de6cfe59424e/llvmlite-0.44.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cccf8eb28f24840f2689fb1a45f9c0f7e582dd24e088dcf96e424834af11f791", size = 26201096, upload-time = "2025-01-20T11:12:24.544Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/46/8ffbc114def88cc698906bf5acab54ca9fdf9214fe04aed0e71731fb3688/llvmlite-0.44.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7202b678cdf904823c764ee0fe2dfe38a76981f4c1e51715b4cb5abb6cf1d9e8", size = 42361859, upload-time = "2025-01-20T11:12:31.839Z" },
+    { url = "https://files.pythonhosted.org/packages/30/1c/9366b29ab050a726af13ebaae8d0dff00c3c58562261c79c635ad4f5eb71/llvmlite-0.44.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:40526fb5e313d7b96bda4cbb2c85cd5374e04d80732dd36a282d72a560bb6408", size = 41184199, upload-time = "2025-01-20T11:12:40.049Z" },
+    { url = "https://files.pythonhosted.org/packages/69/07/35e7c594b021ecb1938540f5bce543ddd8713cff97f71d81f021221edc1b/llvmlite-0.44.0-cp310-cp310-win_amd64.whl", hash = "sha256:41e3839150db4330e1b2716c0be3b5c4672525b4c9005e17c7597f835f351ce2", size = 30332381, upload-time = "2025-01-20T11:12:47.054Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/e2/86b245397052386595ad726f9742e5223d7aea999b18c518a50e96c3aca4/llvmlite-0.44.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3", size = 28132305, upload-time = "2025-01-20T11:12:53.936Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/ec/506902dc6870249fbe2466d9cf66d531265d0f3a1157213c8f986250c033/llvmlite-0.44.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ace564d9fa44bb91eb6e6d8e7754977783c68e90a471ea7ce913bff30bd62427", size = 26201090, upload-time = "2025-01-20T11:12:59.847Z" },
+    { url = "https://files.pythonhosted.org/packages/99/fe/d030f1849ebb1f394bb3f7adad5e729b634fb100515594aca25c354ffc62/llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5d22c3bfc842668168a786af4205ec8e3ad29fb1bc03fd11fd48460d0df64c1", size = 42361858, upload-time = "2025-01-20T11:13:07.623Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/7a/ce6174664b9077fc673d172e4c888cb0b128e707e306bc33fff8c2035f0d/llvmlite-0.44.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f01a394e9c9b7b1d4e63c327b096d10f6f0ed149ef53d38a09b3749dcf8c9610", size = 41184200, upload-time = "2025-01-20T11:13:20.058Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/c6/258801143975a6d09a373f2641237992496e15567b907a4d401839d671b8/llvmlite-0.44.0-cp311-cp311-win_amd64.whl", hash = "sha256:d8489634d43c20cd0ad71330dde1d5bc7b9966937a263ff1ec1cebb90dc50955", size = 30331193, upload-time = "2025-01-20T11:13:26.976Z" },
+    { url = "https://files.pythonhosted.org/packages/15/86/e3c3195b92e6e492458f16d233e58a1a812aa2bfbef9bdd0fbafcec85c60/llvmlite-0.44.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:1d671a56acf725bf1b531d5ef76b86660a5ab8ef19bb6a46064a705c6ca80aad", size = 28132297, upload-time = "2025-01-20T11:13:32.57Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/53/373b6b8be67b9221d12b24125fd0ec56b1078b660eeae266ec388a6ac9a0/llvmlite-0.44.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5f79a728e0435493611c9f405168682bb75ffd1fbe6fc360733b850c80a026db", size = 26201105, upload-time = "2025-01-20T11:13:38.744Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/da/8341fd3056419441286c8e26bf436923021005ece0bff5f41906476ae514/llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0143a5ef336da14deaa8ec26c5449ad5b6a2b564df82fcef4be040b9cacfea9", size = 42361901, upload-time = "2025-01-20T11:13:46.711Z" },
+    { url = "https://files.pythonhosted.org/packages/53/ad/d79349dc07b8a395a99153d7ce8b01d6fcdc9f8231355a5df55ded649b61/llvmlite-0.44.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d752f89e31b66db6f8da06df8b39f9b91e78c5feea1bf9e8c1fba1d1c24c065d", size = 41184247, upload-time = "2025-01-20T11:13:56.159Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/3b/a9a17366af80127bd09decbe2a54d8974b6d8b274b39bf47fbaedeec6307/llvmlite-0.44.0-cp312-cp312-win_amd64.whl", hash = "sha256:eae7e2d4ca8f88f89d315b48c6b741dcb925d6a1042da694aa16ab3dd4cbd3a1", size = 30332380, upload-time = "2025-01-20T11:14:02.442Z" },
+]
+
+[[package]]
+name = "lm-format-enforcer"
+version = "0.11.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "interegular" },
+    { name = "packaging" },
+    { name = "pydantic" },
+    { name = "pyyaml" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/84/d5/41cd417ba7dfdbbcfe46cebf81fb3dfd7c591b89897560ad05bb410a465d/lm_format_enforcer-0.11.3.tar.gz", hash = "sha256:e68081c108719cce284a9bcc889709b26ffb085a1945b5eba3a12cfa96d528da", size = 40258, upload-time = "2025-08-24T19:37:47.527Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/ef/11292bb0b85cf4c93447cab5a29f64576ed14d3ab4280e35ddd23486594a/lm_format_enforcer-0.11.3-py3-none-any.whl", hash = "sha256:cf586350875def1ae7a8fba84fcbbfc8371424b6c9d05c1fcba70aa233fbf06f", size = 45418, upload-time = "2025-08-24T19:37:46.325Z" },
+]
+
 [[package]]
 name = "locust"
 version = "2.31.8"
@@ -2145,7 +2820,8 @@ dependencies = [
     { name = "kiwisolver" },
     { name = "numpy" },
     { name = "packaging" },
-    { name = "pillow" },
+    { name = "pillow", version = "11.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pillow", version = "12.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "pyparsing" },
     { name = "python-dateutil" },
 ]
@@ -2264,9 +2940,34 @@ spacy = [
     { name = "spacy" },
 ]
 
+[[package]]
+name = "mistral-common"
+version = "1.11.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "jsonschema" },
+    { name = "numpy" },
+    { name = "pillow", version = "11.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pillow", version = "12.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "pydantic" },
+    { name = "pydantic-extra-types", extra = ["pycountry"] },
+    { name = "requests" },
+    { name = "tiktoken" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2e/03/3c5d4c9430da406f8444f9a7b058a6aa89c525fb068a57fe2ab8b04a6d08/mistral_common-1.11.3.tar.gz", hash = "sha256:6437e128fc8a307318440839ca14ddf2e8060056b062233ec0db10352651374c", size = 6360629, upload-time = "2026-06-04T09:01:11.131Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7b/76/dbfdf9c59e2a4b0116587626a3768c2a3b2ba1758b5756743918c2337fdc/mistral_common-1.11.3-py3-none-any.whl", hash = "sha256:dbfcef9d0c892727ee08a080f0c1039baed5430b291f5425ffd88892bf09e52c", size = 6533154, upload-time = "2026-06-04T09:01:14.186Z" },
+]
+
+[package.optional-dependencies]
+image = [
+    { name = "opencv-python-headless" },
+]
+
 [[package]]
 name = "mlflow"
-version = "2.16.2"
+version = "2.22.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "alembic" },
@@ -2287,33 +2988,104 @@ dependencies = [
     { name = "sqlalchemy" },
     { name = "waitress", marker = "sys_platform == 'win32'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/4c/f7/2265f59a1422ec3952e100b0ac48e8a7c741f099cd3ed6eb32b738395a8c/mlflow-2.16.2.tar.gz", hash = "sha256:322512bcdd13d87039cd60ebcd4370ce16115fb5360905010978575202e57876", size = 26091357, upload-time = "2024-09-17T02:23:32.173Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/37/fa/c70189287a11b6ae32323957694d88c8811448b784203287ba7e384a87fc/mlflow-2.22.5.tar.gz", hash = "sha256:687c0fee93d25aee1b9537d0a83951daaa1f83bdf60658495e27992304cdcd51", size = 28378119, upload-time = "2026-05-12T08:50:29.355Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e5/0f/a1fbf8aa0040244a51464ca97fd554e1a8728727caf7e312f4b9b2f8b1fa/mlflow-2.16.2-py3-none-any.whl", hash = "sha256:7ed8f1d27e719a19592d9582e4415aa76abb3de53c524d6b9c66cbf5e00a1023", size = 26651842, upload-time = "2024-09-17T02:23:27.868Z" },
+    { url = "https://files.pythonhosted.org/packages/33/58/0d0f8080d78ba609ba43daa6b96150ec578c7186865e70947e1aeca7f6db/mlflow-2.22.5-py3-none-any.whl", hash = "sha256:5b95b5960e6726d0f9f7115b8593def2b339447d95d8a180caf84a3da121b407", size = 29004244, upload-time = "2026-05-12T08:50:25.36Z" },
 ]
 
 [[package]]
 name = "mlflow-skinny"
-version = "2.16.2"
+version = "2.22.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cachetools" },
     { name = "click" },
     { name = "cloudpickle" },
     { name = "databricks-sdk" },
+    { name = "fastapi" },
     { name = "gitpython" },
     { name = "importlib-metadata" },
     { name = "opentelemetry-api" },
     { name = "opentelemetry-sdk" },
     { name = "packaging" },
-    { name = "protobuf" },
+    { name = "protobuf", version = "4.25.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "protobuf", version = "5.29.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pydantic" },
     { name = "pyyaml" },
     { name = "requests" },
     { name = "sqlparse" },
+    { name = "typing-extensions" },
+    { name = "uvicorn" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b7/e8/f04144318835c64d281c3c78a775630950696ad0d445c2c6827d2964f65f/mlflow_skinny-2.22.5.tar.gz", hash = "sha256:7aec51d79ae559c17bedec19005ed1293d9b56785b787a34c6e1fd6f755de0c4", size = 5892685, upload-time = "2026-05-12T08:50:12.26Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c0/39/206466eb32b1530846e5e026512e23c15a6c07a7da2ccae2a6152826d6cd/mlflow_skinny-2.22.5-py3-none-any.whl", hash = "sha256:c0e76ccd93f0ac97b0ed907ea7a404f1f53a260f52bf0b19425fc201c92b0188", size = 6270924, upload-time = "2026-05-12T08:50:09.201Z" },
+]
+
+[[package]]
+name = "mlx"
+version = "0.31.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mlx-metal", marker = "sys_platform == 'darwin'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/29/7c/c16d52494a1ba6d90443f31fa26bc810bf878d532dfa9a7a13f49ef9542d/mlx-0.31.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:b29cf940f34205f09bb552ac60465ae833c4ae640b52777c6d725ddbad8461ca", size = 586942, upload-time = "2026-04-22T03:14:21.97Z" },
+    { url = "https://files.pythonhosted.org/packages/74/da/1c7f3dc39b7bda65b0cafbaf1e58a35eea118622c6f4506c9a4294c9806e/mlx-0.31.2-cp310-cp310-macosx_15_0_arm64.whl", hash = "sha256:ebdc47b87b4b0216ceab3b5961716804bba3107c16454b65ae51d0e0c059f298", size = 586942, upload-time = "2026-04-22T03:14:23.527Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/e9/a8559389706d39f613620a8b6b42ed03cf3155a516b0762d355c5116fdab/mlx-0.31.2-cp310-cp310-macosx_26_0_arm64.whl", hash = "sha256:2a64db61b2840f28bae08354e6f999698e30381af201cc12354290673c96213b", size = 586804, upload-time = "2026-04-22T03:14:24.882Z" },
+    { url = "https://files.pythonhosted.org/packages/94/89/1e77ec3ff380e8fb9e7258047374d31452a0f9828a0e370f127b07dd8288/mlx-0.31.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4a3f181b367d404e44a6bd68ef5eb573930809ac60cacd51d0c851c629b1b651", size = 586911, upload-time = "2026-04-22T03:14:29.675Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/41/c1907f05f8a3fc54025fb78ad68d3c4a4b931664d03c0a24f7f431cc4087/mlx-0.31.2-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:70297cbef7479429f69c966bfed10da20a6f0c2aa997eec2b4f6ba1a07caf2ef", size = 586915, upload-time = "2026-04-22T03:14:31.403Z" },
+    { url = "https://files.pythonhosted.org/packages/97/b0/61ac2c14773c786fecbda28067b0207a0c654cb4d10c548808c51284d700/mlx-0.31.2-cp311-cp311-macosx_26_0_arm64.whl", hash = "sha256:c0ff158b7ac93a4b5659adbc70053498b30a5964fc45f78596398e056a96c36a", size = 587030, upload-time = "2026-04-22T03:14:32.961Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/47/5f33906cb03d6a378a697cd2d2641a26b37dea17ee3d9124d7e39e8eca01/mlx-0.31.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:e5067aaf2be1f3d7bba5be52348775804f111173c1ed04639618fd713b1a530f", size = 584863, upload-time = "2026-04-22T03:14:38.211Z" },
+    { url = "https://files.pythonhosted.org/packages/08/e7/a851a451b1327af9fb4df3991b9ae87d066b6f6630e854af55c288b0995a/mlx-0.31.2-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:edb9797db7d852477ca1c99708058654ee860d4148fe5765f0d55528e2b1aa22", size = 584860, upload-time = "2026-04-22T03:14:39.746Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/15/0d1dc0597644e5e7b011ca954ba0c47e13cd880a3b909b0c3f1b4d8bf8f1/mlx-0.31.2-cp312-cp312-macosx_26_0_arm64.whl", hash = "sha256:51ca102db641b01e7cb083ce8ecb580e281530a141a7ca12544bb370641630ae", size = 584887, upload-time = "2026-04-22T03:14:41.585Z" },
+]
+
+[[package]]
+name = "mlx-lm"
+version = "0.29.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "jinja2", marker = "sys_platform != 'win32'" },
+    { name = "mlx", marker = "sys_platform == 'darwin'" },
+    { name = "numpy", marker = "sys_platform != 'win32'" },
+    { name = "protobuf", version = "4.25.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' and sys_platform != 'win32'" },
+    { name = "protobuf", version = "5.29.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'win32'" },
+    { name = "pyyaml", marker = "sys_platform != 'win32'" },
+    { name = "sentencepiece", marker = "sys_platform != 'win32'" },
+    { name = "transformers", marker = "sys_platform != 'win32'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5b/f1/4eedb6e55baa611724da356959ebbed9f1c4e41cc07d4cc0b9dca18cf10b/mlflow_skinny-2.16.2.tar.gz", hash = "sha256:c4064506ee8b590dea5dc3a139a890d50996d8ab511fbd34b0266bf69ebaa9d8", size = 5263742, upload-time = "2024-09-17T02:25:40.945Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e3/62/f46e1355256a114808517947f8e83ad6be310c7288c551db0fa678f47923/mlx_lm-0.29.1.tar.gz", hash = "sha256:b99180d8f33d33a077b814e550bfb2d8a59ae003d668fd1f4b3fff62a381d34b", size = 232302, upload-time = "2025-12-16T16:58:27.959Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/46/ba/2300ec30b6425507fface15601a6e20320ace45586d9a66339ac19644381/mlflow_skinny-2.16.2-py3-none-any.whl", hash = "sha256:c6faf8bddcba3d2bbde45c954c89575b93c4bef1d5e7e026d98fd9966015038c", size = 5599042, upload-time = "2024-09-17T02:25:38.388Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/53/913099c91d384e115ea078325efd9a0bc1ea3eb3458c694b4596cbd267f2/mlx_lm-0.29.1-py3-none-any.whl", hash = "sha256:440941b3054c2a2216e97615de584cc90fa1ea874782e20699b9895721fad8dc", size = 324884, upload-time = "2025-12-16T16:58:26.36Z" },
+]
+
+[[package]]
+name = "mlx-metal"
+version = "0.31.2"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3f/69/fe3b783ebe999f3118234e1e940feb622518bfb1dea6ac5d13b1d36a8449/mlx_metal-0.31.2-py3-none-macosx_14_0_arm64.whl", hash = "sha256:b25385bcee18fc194092255b8b53b9a3d8489eb650e59160f1b57aadd07aa2dc", size = 40055588, upload-time = "2026-04-22T03:14:14.43Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/5d/4c690d5b93c30ba002656c37363159d978705bf8eb801b8481840fb942c2/mlx_metal-0.31.2-py3-none-macosx_15_0_arm64.whl", hash = "sha256:e9d4e5fce6ca10a87a0e388597f99519ad594d09e674708b5312bd8bd4f5997d", size = 40053220, upload-time = "2026-04-22T03:14:18.048Z" },
+    { url = "https://files.pythonhosted.org/packages/99/82/11fd62a8d7a3e96e5c43220b17de0151e3f10101f8bb3b865f5bd9cdd074/mlx_metal-0.31.2-py3-none-macosx_26_0_arm64.whl", hash = "sha256:84ffb60ee503f03eb684f5fb168d5cff31e2a16b7f27c1731eaf7662bd6e9b46", size = 55792151, upload-time = "2026-04-22T03:14:22.059Z" },
+]
+
+[[package]]
+name = "model-hosting-container-standards"
+version = "0.1.16"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "fastapi" },
+    { name = "httpx" },
+    { name = "jmespath" },
+    { name = "pydantic" },
+    { name = "setuptools" },
+    { name = "starlette" },
+    { name = "supervisor" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2d/5f/bc0d0fce1bd0a35378696aa13b21feffa18d9cda837f4e1be124e45ee090/model_hosting_container_standards-0.1.16.tar.gz", hash = "sha256:d34589633900e53c3ee5f7c78280a7cf7e4f6532c35e763341a262fc85cbe84a", size = 94130, upload-time = "2026-06-15T21:29:34.771Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/07/ef/6eabeb251d2a0598cb5f9a274159e05ae07a1e3fe6a1473bf6035793252a/model_hosting_container_standards-0.1.16-py3-none-any.whl", hash = "sha256:47f4f65713120bc3a69feb022981a38db9e557aedf88dbd72077f20588caa12b", size = 125666, upload-time = "2026-06-15T21:29:33.415Z" },
 ]
 
 [[package]]
@@ -2359,6 +3131,38 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c5/31/5b1a1f70eb0e87d1678e9624908f86317787b536060641d6798e3cf70ace/msgpack-1.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:be5980f3ee0e6bd44f3a9e9dea01054f175b50c3e6cdb692bc9424c0bbb8bf69", size = 64119, upload-time = "2025-10-08T09:15:13.589Z" },
 ]
 
+[[package]]
+name = "msgspec"
+version = "0.21.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e3/60/f79b9b013a16fa3a58350c9295ddc6789f2e335f36ea61ed10a21b215364/msgspec-0.21.1.tar.gz", hash = "sha256:2313508e394b0d208f8f56892ca9b2799e2561329de9763b19619595a6c0f72c", size = 319193, upload-time = "2026-04-12T21:44:50.394Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/96/38/d591d9f66d43d897ecbd249f2833665823d19c8b043f16619bc8343e23df/msgspec-0.21.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72d9cd03241b8b2edb2e12dcc66c500fa480d8cbd71a8bac105809d468882064", size = 195172, upload-time = "2026-04-12T21:43:45.062Z" },
+    { url = "https://files.pythonhosted.org/packages/69/1a/6899188b5982ec1324e0c629b7801eed2db987f6634fab58abd9fc82d317/msgspec-0.21.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed2ab278200e743a1d2610a4e0c8fc74f6cecb8548544cdec43f927bd9265238", size = 188316, upload-time = "2026-04-12T21:43:46.641Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/95/7e591b4fa11fdbbf9891164473c23420a8c781ef553295abe416bf335f42/msgspec-0.21.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd677e3001fdfed9186de72eab434da2976303cd5eb9550921d3d0c3e3e168ce", size = 216565, upload-time = "2026-04-12T21:43:48.081Z" },
+    { url = "https://files.pythonhosted.org/packages/19/86/714feeaf3b84cf2027235681725593840153dedd2868578f9f2715e296bb/msgspec-0.21.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f667b90b37fad734a91671abd68e0d7f4d066862771b87e91c53996dcb7a9027", size = 222689, upload-time = "2026-04-12T21:43:49.385Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/b9/4384243e814f2579e5205e17d170b9c1a30121afd1393298d904817a7fa7/msgspec-0.21.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:49880fd20fdbcfe1b793f07dd83f12572bab679c9800352c8b2240289aa46a06", size = 222343, upload-time = "2026-04-12T21:43:50.612Z" },
+    { url = "https://files.pythonhosted.org/packages/04/01/4b227d9c4057346271043632bad41979cf8c3dca372e41bb1f7d546395b2/msgspec-0.21.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ae0162e22849a5e91eaad907766525107523b0daea3df267a9fcb5ba4e0936ae", size = 225607, upload-time = "2026-04-12T21:43:52.129Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/ce/27021d1c3e5da837743092a7b7a5e8818397e1f4c05ee8b068bd7d1fd78a/msgspec-0.21.1-cp310-cp310-win_amd64.whl", hash = "sha256:f041a2279f31e3a53319005e4d60ba77c085cfcbe394cdc7ce803c2d01fe9449", size = 188392, upload-time = "2026-04-12T21:43:53.384Z" },
+    { url = "https://files.pythonhosted.org/packages/80/2b/daf7a8d6d7cf00e0dcd0439178b284ade701234abdcadf3385601da04fbd/msgspec-0.21.1-cp310-cp310-win_arm64.whl", hash = "sha256:1bf17cbd7b28a5dffc7e764c654eed8ccde5e0f1de7970628608304640d4ce4e", size = 174191, upload-time = "2026-04-12T21:43:54.6Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/7f/bbc4e74cd33d316b75541149e4d35b163b63bce066530ae185a2ec3b5bfc/msgspec-0.21.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b504b6e7f7a22a24b27232b73034421692147865162daaec9f3bf62439007c87", size = 193131, upload-time = "2026-04-12T21:43:56.094Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/60/504886af1aaf854112663b842d5eea9a15d9588f9bf7d0d2df736424b84d/msgspec-0.21.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4692b7c1609155708c4418f88e92f63c13fdf08aa095c84bae82bad75b53389b", size = 186597, upload-time = "2026-04-12T21:43:57.242Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/54/d24ddeaa65b5278c9e67f48ce3c17a9831e8f3722f3c8322ee120aca22ef/msgspec-0.21.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d3124010b3815451494c85ff345e693cb9fe5889cfcbbef39ed8622e0e72319c", size = 215158, upload-time = "2026-04-12T21:43:58.442Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/75/bb79c8b89a93ae23cd33c0d802373f16feaf9633f05d8af77091350dda0a/msgspec-0.21.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6badc03b9725352219cca017bfe71c61f2fbd0fb5982b410ac17c97c213deb30", size = 219856, upload-time = "2026-04-12T21:44:00.015Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/9c/c5ca26b46f0ebbd3a6683695ef89396712cb9e4199fd1f0bc1dd968216b1/msgspec-0.21.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5d2d4116ebe3035a78d9ec76e99a9d64e5fa6d44fe61a9c5de7fd1acf54bcc69", size = 220314, upload-time = "2026-04-12T21:44:01.548Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/31/645a351c4285dce40ed6755c3dcc0aa648e26dacb20a98018fe2cce5e87b/msgspec-0.21.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0d1009f6715f5bff3b54d4ff5c7428ad96197e0534e1645b8e9b955890c84664", size = 223215, upload-time = "2026-04-12T21:44:02.884Z" },
+    { url = "https://files.pythonhosted.org/packages/09/af/8bf15736a6dd3cb4f90c5467f6dc39197d2daaf10754490cdc0aa17b7312/msgspec-0.21.1-cp311-cp311-win_amd64.whl", hash = "sha256:c6faffe5bb644ec884052679af4dfd776d4b5ca90e4a7ec7e7e319e4e6b93a6e", size = 188554, upload-time = "2026-04-12T21:44:04.151Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/29/cc7db3a165b62d16e64a83f82eccb79655055cb5bc1f60459a6f9d7c82f2/msgspec-0.21.1-cp311-cp311-win_arm64.whl", hash = "sha256:ee9e3f11fa94603f7d673bf795cfa31b549c4a2c723bc39b45beb1e7f5a3fb99", size = 174517, upload-time = "2026-04-12T21:44:05.66Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/cf/317224852c00248c620a9bcf4b26e2e4ab8afd752f18d2a6ef73ebd423b6/msgspec-0.21.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4248cf0b6129b7d230eacd493c17cc2d4f3989f3bb7f633a928a85b7dcfa251", size = 196188, upload-time = "2026-04-12T21:44:07.181Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/81/074612945c0666078f7366f40000013de9f6ba687491d450df699bceebc9/msgspec-0.21.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5102c7e9b3acff82178449b85006d96310e690291bb1ea0142f1b24bcb8aabcb", size = 188473, upload-time = "2026-04-12T21:44:08.736Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/37/655101799590bcc5fddb2bd3fe0e6194e816c2d1da7c361725f5eb89a910/msgspec-0.21.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:846758412e9518252b2ac9bffd6f0e54d9ff614f5f9488df7749f81ff5c80920", size = 218871, upload-time = "2026-04-12T21:44:09.917Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/d1/d4cd9fe89c7d400d7a18f86ccc94daa3f0927f53558846fcb60791dce5d6/msgspec-0.21.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21995e74b5c598c2e004110ad66ec7f1b8c20bf2bcf3b2de8fd9a3094422d3ff", size = 225025, upload-time = "2026-04-12T21:44:11.191Z" },
+    { url = "https://files.pythonhosted.org/packages/24/bf/e20549e602b9edccadeeff98760345a416f9cce846a657e8b18e3396b212/msgspec-0.21.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6129f0cca52992e898fd5344187f7c8127b63d810b2fd73e36fca73b4c6475ee", size = 222672, upload-time = "2026-04-12T21:44:12.481Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/68/04d7a8f0f786545cf9b8c280c57aa6befb5977af6e884b8b54191cbe44b3/msgspec-0.21.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ef3ec2296248d1f8b9231acb051b6d471dfde8f21819e86c9adaaa9f42918521", size = 227303, upload-time = "2026-04-12T21:44:13.709Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/4d/619866af2840875be408047bf9e70ceafbae6ab50660de7134ed1b25eb86/msgspec-0.21.1-cp312-cp312-win_amd64.whl", hash = "sha256:d4ab834a054c6f0cbeef6df9e7e1b33d5f1bc7b86dea1d2fd7cad003873e783d", size = 190017, upload-time = "2026-04-12T21:44:14.977Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/2e/a8f9eca8fd00e097d7a9e99ba8a4685db994494448e3d4f0b7f6e9a3c0f7/msgspec-0.21.1-cp312-cp312-win_arm64.whl", hash = "sha256:628aaa35c74950a8c59da330d7e98917e1c7188f983745782027748ee4ca573e", size = 175345, upload-time = "2026-04-12T21:44:16.431Z" },
+]
+
 [[package]]
 name = "multidict"
 version = "6.7.0"
@@ -2547,6 +3351,59 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
 ]
 
+[[package]]
+name = "ninja"
+version = "1.13.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978", size = 242558, upload-time = "2025-08-11T15:10:19.421Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = "sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1", size = 310125, upload-time = "2025-08-11T15:09:50.971Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630", size = 177467, upload-time = "2025-08-11T15:09:52.767Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c", size = 187834, upload-time = "2025-08-11T15:09:54.115Z" },
+    { url = "https://files.pythonhosted.org/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e", size = 202736, upload-time = "2025-08-11T15:09:55.745Z" },
+    { url = "https://files.pythonhosted.org/packages/79/22/d1de07632b78ac8e6b785f41fa9aad7a978ec8c0a1bf15772def36d77aac/ninja-1.13.0-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988", size = 179034, upload-time = "2025-08-11T15:09:57.394Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/de/0e6edf44d6a04dabd0318a519125ed0415ce437ad5a1ec9b9be03d9048cf/ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa", size = 180716, upload-time = "2025-08-11T15:09:58.696Z" },
+    { url = "https://files.pythonhosted.org/packages/54/28/938b562f9057aaa4d6bfbeaa05e81899a47aebb3ba6751e36c027a7f5ff7/ninja-1.13.0-py3-none-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1", size = 146843, upload-time = "2025-08-11T15:10:00.046Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/fb/d06a3838de4f8ab866e44ee52a797b5491df823901c54943b2adb0389fbb/ninja-1.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2", size = 154402, upload-time = "2025-08-11T15:10:01.657Z" },
+    { url = "https://files.pythonhosted.org/packages/31/bf/0d7808af695ceddc763cf251b84a9892cd7f51622dc8b4c89d5012779f06/ninja-1.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f", size = 552388, upload-time = "2025-08-11T15:10:03.349Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/70/c99d0c2c809f992752453cce312848abb3b1607e56d4cd1b6cded317351a/ninja-1.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714", size = 472501, upload-time = "2025-08-11T15:10:04.735Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/43/c217b1153f0e499652f5e0766da8523ce3480f0a951039c7af115e224d55/ninja-1.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72", size = 638280, upload-time = "2025-08-11T15:10:06.512Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/45/9151bba2c8d0ae2b6260f71696330590de5850e5574b7b5694dce6023e20/ninja-1.13.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db", size = 642420, upload-time = "2025-08-11T15:10:08.35Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5", size = 585106, upload-time = "2025-08-11T15:10:09.818Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96", size = 707138, upload-time = "2025-08-11T15:10:11.366Z" },
+    { url = "https://files.pythonhosted.org/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200", size = 581758, upload-time = "2025-08-11T15:10:13.295Z" },
+    { url = "https://files.pythonhosted.org/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9", size = 267201, upload-time = "2025-08-11T15:10:15.158Z" },
+    { url = "https://files.pythonhosted.org/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = "sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e", size = 309975, upload-time = "2025-08-11T15:10:16.697Z" },
+    { url = "https://files.pythonhosted.org/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9", size = 290806, upload-time = "2025-08-11T15:10:18.018Z" },
+]
+
+[[package]]
+name = "numba"
+version = "0.61.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "llvmlite" },
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1c/a0/e21f57604304aa03ebb8e098429222722ad99176a4f979d34af1d1ee80da/numba-0.61.2.tar.gz", hash = "sha256:8750ee147940a6637b80ecf7f95062185ad8726c8c28a2295b8ec1160a196f7d", size = 2820615, upload-time = "2025-04-09T02:58:07.659Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/eb/ca/f470be59552ccbf9531d2d383b67ae0b9b524d435fb4a0d229fef135116e/numba-0.61.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a", size = 2775663, upload-time = "2025-04-09T02:57:34.143Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/13/3bdf52609c80d460a3b4acfb9fdb3817e392875c0d6270cf3fd9546f138b/numba-0.61.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd", size = 2778344, upload-time = "2025-04-09T02:57:36.609Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/7d/bfb2805bcfbd479f04f835241ecf28519f6e3609912e3a985aed45e21370/numba-0.61.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae8c7a522c26215d5f62ebec436e3d341f7f590079245a2f1008dfd498cc1642", size = 3824054, upload-time = "2025-04-09T02:57:38.162Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/27/797b2004745c92955470c73c82f0e300cf033c791f45bdecb4b33b12bdea/numba-0.61.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd1e74609855aa43661edffca37346e4e8462f6903889917e9f41db40907daa2", size = 3518531, upload-time = "2025-04-09T02:57:39.709Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/c6/c2fb11e50482cb310afae87a997707f6c7d8a48967b9696271347441f650/numba-0.61.2-cp310-cp310-win_amd64.whl", hash = "sha256:ae45830b129c6137294093b269ef0a22998ccc27bf7cf096ab8dcf7bca8946f9", size = 2831612, upload-time = "2025-04-09T02:57:41.559Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/97/c99d1056aed767503c228f7099dc11c402906b42a4757fec2819329abb98/numba-0.61.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2", size = 2775825, upload-time = "2025-04-09T02:57:43.442Z" },
+    { url = "https://files.pythonhosted.org/packages/95/9e/63c549f37136e892f006260c3e2613d09d5120672378191f2dc387ba65a2/numba-0.61.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:49c980e4171948ffebf6b9a2520ea81feed113c1f4890747ba7f59e74be84b1b", size = 2778695, upload-time = "2025-04-09T02:57:44.968Z" },
+    { url = "https://files.pythonhosted.org/packages/97/c8/8740616c8436c86c1b9a62e72cb891177d2c34c2d24ddcde4c390371bf4c/numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3945615cd73c2c7eba2a85ccc9c1730c21cd3958bfcf5a44302abae0fb07bb60", size = 3829227, upload-time = "2025-04-09T02:57:46.63Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/06/66e99ae06507c31d15ff3ecd1f108f2f59e18b6e08662cd5f8a5853fbd18/numba-0.61.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbfdf4eca202cebade0b7d43896978e146f39398909a42941c9303f82f403a18", size = 3523422, upload-time = "2025-04-09T02:57:48.222Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/a4/2b309a6a9f6d4d8cfba583401c7c2f9ff887adb5d54d8e2e130274c0973f/numba-0.61.2-cp311-cp311-win_amd64.whl", hash = "sha256:76bcec9f46259cedf888041b9886e257ae101c6268261b19fda8cfbc52bec9d1", size = 2831505, upload-time = "2025-04-09T02:57:50.108Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/a0/c6b7b9c615cfa3b98c4c63f4316e3f6b3bbe2387740277006551784218cd/numba-0.61.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2", size = 2776626, upload-time = "2025-04-09T02:57:51.857Z" },
+    { url = "https://files.pythonhosted.org/packages/92/4a/fe4e3c2ecad72d88f5f8cd04e7f7cff49e718398a2fac02d2947480a00ca/numba-0.61.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ddce10009bc097b080fc96876d14c051cc0c7679e99de3e0af59014dab7dfe8", size = 2779287, upload-time = "2025-04-09T02:57:53.658Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/2d/e518df036feab381c23a624dac47f8445ac55686ec7f11083655eb707da3/numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b1bb509d01f23d70325d3a5a0e237cbc9544dd50e50588bc581ba860c213546", size = 3885928, upload-time = "2025-04-09T02:57:55.206Z" },
+    { url = "https://files.pythonhosted.org/packages/10/0f/23cced68ead67b75d77cfcca3df4991d1855c897ee0ff3fe25a56ed82108/numba-0.61.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:48a53a3de8f8793526cbe330f2a39fe9a6638efcbf11bd63f3d2f9757ae345cd", size = 3577115, upload-time = "2025-04-09T02:57:56.818Z" },
+    { url = "https://files.pythonhosted.org/packages/68/1d/ddb3e704c5a8fb90142bf9dc195c27db02a08a99f037395503bfbc1d14b3/numba-0.61.2-cp312-cp312-win_amd64.whl", hash = "sha256:97cf4f12c728cf77c9c1d7c23707e4d8fb4632b46275f8f3397de33e5877af18", size = 2831929, upload-time = "2025-04-09T02:57:58.45Z" },
+]
+
 [[package]]
 name = "numpy"
 version = "2.2.6"
@@ -2632,6 +3489,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
 ]
 
+[[package]]
+name = "nvidia-cudnn-frontend"
+version = "1.25.0"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/58/5a853819023c1ea17b1e71363a1123bd9d9b1a31b41c80adff07a08d32d1/nvidia_cudnn_frontend-1.25.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:04c48329eb6918a92e83905981a02dc8f1817dc570720e5531adf053d04a4956", size = 3261918, upload-time = "2026-06-10T21:05:37.3Z" },
+    { url = "https://files.pythonhosted.org/packages/63/46/fa9f68a9936d97498bc2c6d1f34e7dcbf2dd3a40bf7dd3f64461b9293f2e/nvidia_cudnn_frontend-1.25.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92ccf81aad15764b67263901a733505d21772431675a56526e0a17c3f5a3674b", size = 3411836, upload-time = "2026-06-10T21:05:57.444Z" },
+    { url = "https://files.pythonhosted.org/packages/02/19/4f36705f2c9a22733fc3dcd31d2e40cd422e614899a76030ada34c83255e/nvidia_cudnn_frontend-1.25.0-cp310-cp310-win_amd64.whl", hash = "sha256:2684f5f33a41f4cf3505f16c3adf59c0e43e8c4c79442c8c508da0e29ea3637c", size = 2796177, upload-time = "2026-06-10T21:06:18.372Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/57/5f2a32a40f7beeaec4020b7124ea854ba38ecb89663ba3449b42bb88ad54/nvidia_cudnn_frontend-1.25.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ae5c281bcb23536c12b7fd2b28e2f599dd1e45e96d37b598175195eb75e8f1a", size = 3262531, upload-time = "2026-06-10T21:06:43.49Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/50/224ff36c5d9e02624f8d3c582982bfac74bec481cd331e704fb9a5ecd128/nvidia_cudnn_frontend-1.25.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676d56062d3ade4ffb34315abe52ea766fa4488db1161b702d9ddd872fab4ddf", size = 3413687, upload-time = "2026-06-10T21:07:04.26Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/87/4716b610e0f5b695f76984cb7591944f2d72b10139ca952f3d0cd1cd9ea3/nvidia_cudnn_frontend-1.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:05279eac512e923fc61154f5d463d9917f14d46aa7a507e2610458e1d2367f3b", size = 2797009, upload-time = "2026-06-10T21:07:27.112Z" },
+    { url = "https://files.pythonhosted.org/packages/28/0f/df39a194f2529093db737d43cc4cbf594c6a79712a09aa104b999e4d95d4/nvidia_cudnn_frontend-1.25.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09e6e1bc48ce1235743f89d8ea699c52b3008fd6dae7f2ecadb744bebf272a2b", size = 3263306, upload-time = "2026-06-10T21:07:48.093Z" },
+    { url = "https://files.pythonhosted.org/packages/03/65/3b45941d8a22128b971e910f2e9af6bf5ef453e92cc329c56b6eb53c53de/nvidia_cudnn_frontend-1.25.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9a94a72d736bd79eb35f451aaf26d9493778e02ecabccc92c05425508c9e7a83", size = 3414884, upload-time = "2026-06-10T21:08:08.603Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/45/69517e8f028573a150e82b71205c920e78ebbe83ff0d073eaeee2ada18dc/nvidia_cudnn_frontend-1.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:d1bfdc795a8bda570ca80ef2287e83f00974857a9a086c1653d2a28099496fee", size = 2798190, upload-time = "2026-06-10T21:08:30.506Z" },
+]
+
 [[package]]
 name = "nvidia-cufft-cu12"
 version = "11.3.3.83"
@@ -2691,6 +3564,44 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" },
 ]
 
+[[package]]
+name = "nvidia-cutlass-dsl"
+version = "4.5.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "nvidia-cutlass-dsl-libs-base" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f0/15/575d7df4fe2f3406f1cfc68be72aeff2834f8a696daf1cd5bee8017e4507/nvidia_cutlass_dsl-4.5.2-py3-none-any.whl", hash = "sha256:68ed1b63ca74aae87955012da9dfd7fdaae471329d0028b229b841c7192ccf52", size = 10179, upload-time = "2026-05-25T03:38:56.364Z" },
+]
+
+[[package]]
+name = "nvidia-cutlass-dsl-libs-base"
+version = "4.5.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cuda-python" },
+    { name = "numpy" },
+    { name = "typing-extensions" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fd/3e/2cca8745885aaba0d835a8be29e516e56930791c01f0806da95d3017a495/nvidia_cutlass_dsl_libs_base-4.5.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b62807bc5ea13bbdef648212893fac407ed943f940cece56b880d44af243e075", size = 75635922, upload-time = "2026-05-25T03:46:33.526Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/2b/4de80442d33791322aa496e2a7f47ed08a42578bd1c7031ef0602009f8ad/nvidia_cutlass_dsl_libs_base-4.5.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:386e832427e3670479049a1560e4d8d2e565d8c0f37a6852c6d7043d046548f1", size = 74512458, upload-time = "2026-05-25T03:49:47.052Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/a8/0cca1d11787128c66c0774374d1bb09313352eee11560dd00f36d6d62f36/nvidia_cutlass_dsl_libs_base-4.5.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:cbb555a95c7011e4b3ca328be407299c77d289660adbea22ed515d4406e6949c", size = 75637009, upload-time = "2026-05-25T03:48:37.901Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/e0/78eded54b4478ec01a91c75f1b9bc6dc73a2ec205c4fa2fdc25a456f4089/nvidia_cutlass_dsl_libs_base-4.5.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:9117900cba53d3c21a8dacba6bbf3d6e5f269e427a526c320fb44707a0d57363", size = 74511501, upload-time = "2026-05-25T03:52:03.798Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/ef/e827e3c67d72adbf4e8f680bdf03b1b67723d9e1ae7c3d0a1751f39f69ce/nvidia_cutlass_dsl_libs_base-4.5.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d2a3c412287e356fbe48fe9f845d6d33cd35dea5e20d7e4f628c20957967cacd", size = 75643473, upload-time = "2026-05-25T03:49:15.857Z" },
+    { url = "https://files.pythonhosted.org/packages/97/68/c1247ab848f26c4ab56e562eea0e3f31fc14c9aaf0d883afaa92d8f05592/nvidia_cutlass_dsl_libs_base-4.5.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:15ef6a59193667e663934ef4873f8ccad37455e9b7c3c419c3072113b8aedf61", size = 74513226, upload-time = "2026-05-25T03:51:32.496Z" },
+]
+
+[[package]]
+name = "nvidia-ml-py"
+version = "13.610.43"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f0/b5/a8fbc356f768fa5c9cfd646668fd7d34bf55bdd1c6e20754642a64d930d4/nvidia_ml_py-13.610.43.tar.gz", hash = "sha256:65437eb73d68d0c62c931ca4d45038472faff03bd0b8729abba4b899f70d60f2", size = 52109, upload-time = "2026-06-01T18:54:08.829Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/23/45/caa600acfab94560807a20a64b5830d2cd3c3202b7f1328644d70b7d6bd8/nvidia_ml_py-13.610.43-py3-none-any.whl", hash = "sha256:f13c72698edef492f985cc225f14faafe68ae065a2e407f45bdf6f4b9b43fde8", size = 53163, upload-time = "2026-06-01T18:54:07.704Z" },
+]
+
 [[package]]
 name = "nvidia-nccl-cu12"
 version = "2.27.5"
@@ -2712,34 +3623,119 @@ name = "nvidia-nvshmem-cu12"
 version = "3.3.20"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/3b/6c/99acb2f9eb85c29fc6f3a7ac4dccfd992e22666dd08a642b303311326a97/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d00f26d3f9b2e3c3065be895e3059d6479ea5c638a3f38c9fec49b1b9dd7c1e5", size = 124657145, upload-time = "2025-08-04T20:25:19.995Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/6c/99acb2f9eb85c29fc6f3a7ac4dccfd992e22666dd08a642b303311326a97/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d00f26d3f9b2e3c3065be895e3059d6479ea5c638a3f38c9fec49b1b9dd7c1e5", size = 124657145, upload-time = "2025-08-04T20:25:19.995Z" },
+]
+
+[[package]]
+name = "nvidia-nvtx-cu12"
+version = "12.8.90"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" },
+]
+
+[[package]]
+name = "ollama"
+version = "0.6.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx" },
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/fc/72/5f12423b6b39ca8430fbe56f77fcf4ef60f63067c7c4a2e30e200ed9ec16/ollama-0.6.2.tar.gz", hash = "sha256:936d55daa684f474364c098611c933626f8d6c7d67065c5b7ae0c477b508b07f", size = 53145, upload-time = "2026-04-29T21:21:15.018Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c4/ab/d6722beeb2d10f7a3b9ff49375708904fde18f82b5609a0bc4aeb5996a4d/ollama-0.6.2-py3-none-any.whl", hash = "sha256:3ad7daab28e5a973445c36a73882a3ef698c2ebb00e21e308652741577509f7d", size = 15115, upload-time = "2026-04-29T21:21:13.794Z" },
+]
+
+[[package]]
+name = "openai"
+version = "2.13.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'win32'",
+]
+dependencies = [
+    { name = "anyio", marker = "python_full_version < '3.11'" },
+    { name = "distro", marker = "python_full_version < '3.11'" },
+    { name = "httpx", marker = "python_full_version < '3.11'" },
+    { name = "jiter", marker = "python_full_version < '3.11'" },
+    { name = "pydantic", marker = "python_full_version < '3.11'" },
+    { name = "sniffio", marker = "python_full_version < '3.11'" },
+    { name = "tqdm", marker = "python_full_version < '3.11'" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0f/39/8e347e9fda125324d253084bb1b82407e5e3c7777a03dc398f79b2d95626/openai-2.13.0.tar.gz", hash = "sha256:9ff633b07a19469ec476b1e2b5b26c5ef700886524a7a72f65e6f0b5203142d5", size = 626583, upload-time = "2025-12-16T18:19:44.387Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bb/d5/eb52edff49d3d5ea116e225538c118699ddeb7c29fa17ec28af14bc10033/openai-2.13.0-py3-none-any.whl", hash = "sha256:746521065fed68df2f9c2d85613bb50844343ea81f60009b60e6a600c9352c79", size = 1066837, upload-time = "2025-12-16T18:19:43.124Z" },
+]
+
+[[package]]
+name = "openai"
+version = "2.43.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.12' and sys_platform != 'win32'",
+    "python_full_version > '3.11' and python_full_version < '3.12' and sys_platform != 'win32'",
+    "python_full_version == '3.11' and sys_platform != 'win32'",
+    "python_full_version >= '3.12' and sys_platform == 'win32'",
+    "python_full_version > '3.11' and python_full_version < '3.12' and sys_platform == 'win32'",
+    "python_full_version == '3.11' and sys_platform == 'win32'",
+]
+dependencies = [
+    { name = "anyio", marker = "python_full_version >= '3.11'" },
+    { name = "distro", marker = "python_full_version >= '3.11'" },
+    { name = "httpx", marker = "python_full_version >= '3.11'" },
+    { name = "jiter", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "sniffio", marker = "python_full_version >= '3.11'" },
+    { name = "tqdm", marker = "python_full_version >= '3.11'" },
+    { name = "typing-extensions", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f3/fa/88d0c58a0c58df7e6758e66b99c5d028d5e0bb49f8812d7203940cd9dbf1/openai-2.43.0.tar.gz", hash = "sha256:e74d238200a26868977002190fb6631613480a93dfe0c9c982e77021ed60a017", size = 785369, upload-time = "2026-06-17T17:06:56.06Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a3/d2/ba767f4bbb30776c03d40906a2d3afad716a165ffa1771fc23b8992f7920/openai-2.43.0-py3-none-any.whl", hash = "sha256:65a670b54fadf2268c9e1330133373c963eb779ee969e5cbad419ec2c21dce97", size = 1355077, upload-time = "2026-06-17T17:06:53.614Z" },
 ]
 
 [[package]]
-name = "nvidia-nvtx-cu12"
-version = "12.8.90"
+name = "openai-harmony"
+version = "0.0.8"
 source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3e/92/2d038d096f29179c7c9571b431f9e739f87a487121901725e23fe338dd9d/openai_harmony-0.0.8.tar.gz", hash = "sha256:6e43f98e6c242fa2de6f8ea12eab24af63fa2ed3e89c06341fb9d92632c5cbdf", size = 284777, upload-time = "2025-11-05T19:07:06.727Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" },
+    { url = "https://files.pythonhosted.org/packages/45/c6/2502f416d46be3ec08bb66d696cccffb57781a499e3ff2e4d7c174af4e8f/openai_harmony-0.0.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:029ec25ca74abe48fdb58eb9fdd2a8c1618581fc33ce8e5653f8a1ffbfbd9326", size = 2627806, upload-time = "2025-11-05T19:06:57.063Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/d2/ce6953ca87db9cae3e775024184da7d1c5cb88cead19a2d75b42f00a959c/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4f709815924ec325b9a890e6ab2bbb0ceec8e319a4e257328eb752cf36b2efc", size = 2948463, upload-time = "2025-11-05T19:06:48.17Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/4c/b553c9651662d6ce102ca7f3629d268b23df1abe5841e24bed81e8a8e949/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5cfcfd963b50a41fc656c84d3440ca6eecdccd6c552158ce790b8f2e33dfb5a9", size = 2704083, upload-time = "2025-11-05T19:06:50.205Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/af/4eec8f9ab9c27bcdb444460c72cf43011d176fc44c79d6e113094ca1e152/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a3a16972aa1cee38ea958470cd04ac9a2d5ac38fdcf77ab686611246220c158", size = 2959765, upload-time = "2025-11-05T19:06:53.62Z" },
+    { url = "https://files.pythonhosted.org/packages/11/3c/33f3374e4624e0e776f6b13b73c45a7ead7f9c4529f8369ed5bfcaa30cac/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b4d5cfa168e74d08f8ba6d58a7e49bc7daef4d58951ec69b66b0d56f4927a68d", size = 3427031, upload-time = "2025-11-05T19:06:51.829Z" },
+    { url = "https://files.pythonhosted.org/packages/25/3f/1a192b93bb47c6b44cd98ba8cc1d3d2a9308f1bb700c3017e6352da11bda/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c007d277218a50db8839e599ed78e0fffe5130f614c3f6d93ae257f282071a29", size = 2953260, upload-time = "2025-11-05T19:06:55.406Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/f8/93b582cad3531797c3db7c2db5400fd841538ccddfd9f5e3df61be99a630/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:8565d4f5a0638da1bffde29832ed63c9e695c558611053add3b2dc0b56c92dbc", size = 3127044, upload-time = "2025-11-05T19:06:59.553Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/10/4327dbf87f75ae813405fd9a9b4a5cde63d506ffed0a096a440a4cabd89c/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:cbaa3bda75ef0d8836e1f8cc84af62f971b1d756d740efc95c38c3e04c0bfde2", size = 2932931, upload-time = "2025-11-05T19:07:01.437Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/c8/1774eec4f6f360ef57618fb8f52e3d3af245b2491bd0297513aa09eec04b/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:772922a9bd24e133950fad71eb1550836f415a88e8c77870e12d0c3bd688ddc2", size = 2996140, upload-time = "2025-11-05T19:07:03.438Z" },
+    { url = "https://files.pythonhosted.org/packages/60/c3/3d1e01e2dba517a91760e4a03e4f20ffc75039a6fe584d0e6f9b5c78fd15/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:007b0476a1f331f8130783f901f1da6f5a7057af1a4891f1b6a31dec364189b5", size = 3205080, upload-time = "2025-11-05T19:07:05.078Z" },
+    { url = "https://files.pythonhosted.org/packages/14/63/119de431572d7c70a7bf1037034a9be6ed0a7502a7498ba7302bca5b3242/openai_harmony-0.0.8-cp38-abi3-win32.whl", hash = "sha256:a9b5f893326b28d9e935ade14b4f655f5a840942473bc89b201c25f7a15af9cf", size = 2082457, upload-time = "2025-11-05T19:07:09.631Z" },
+    { url = "https://files.pythonhosted.org/packages/40/1f/c83cf5a206c263ee70448a5ae4264682555f4d0b5bed0d2cc6ca1108103d/openai_harmony-0.0.8-cp38-abi3-win_amd64.whl", hash = "sha256:39d44f0d8f466bd56698e7ead708bead3141e27b9b87e3ab7d5a6d0e4a869ee5", size = 2438369, upload-time = "2025-11-05T19:07:08.1Z" },
 ]
 
 [[package]]
-name = "openai"
-version = "2.12.0"
+name = "opencv-python-headless"
+version = "4.13.0.92"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyio" },
-    { name = "distro" },
-    { name = "httpx" },
-    { name = "jiter" },
-    { name = "pydantic" },
-    { name = "sniffio" },
-    { name = "tqdm" },
-    { name = "typing-extensions" },
+    { name = "numpy" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/86/f9/fb8abeb4cdba6f24daf3d7781f42ceb1be1ff579eb20705899e617dd95f1/openai-2.12.0.tar.gz", hash = "sha256:cc6dcbcb8bccf05976d983f6516c5c1f447b71c747720f1530b61e8f858bcbc9", size = 626183, upload-time = "2025-12-15T16:17:15.097Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c3/a1/f055214448cb4b176e89459d889af9615fe7d927634fb5a2cecfb7674bc5/openai-2.12.0-py3-none-any.whl", hash = "sha256:7177998ce49ba3f90bcce8b5769a6666d90b1f328f0518d913aaec701271485a", size = 1066590, upload-time = "2025-12-15T16:17:13.301Z" },
+    { url = "https://files.pythonhosted.org/packages/79/42/2310883be3b8826ac58c3f2787b9358a2d46923d61f88fedf930bc59c60c/opencv_python_headless-4.13.0.92-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:1a7d040ac656c11b8c38677cc8cccdc149f98535089dbe5b081e80a4e5903209", size = 46247192, upload-time = "2026-02-05T07:01:35.187Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/1e/6f9e38005a6f7f22af785df42a43139d0e20f169eb5787ce8be37ee7fcc9/opencv_python_headless-4.13.0.92-cp37-abi3-macosx_14_0_x86_64.whl", hash = "sha256:3e0a6f0a37994ec6ce5f59e936be21d5d6384a4556f2d2da9c2f9c5dc948394c", size = 32568914, upload-time = "2026-02-05T07:01:51.989Z" },
+    { url = "https://files.pythonhosted.org/packages/21/76/9417a6aef9def70e467a5bf560579f816148a4c658b7d525581b356eda9e/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c8cfc8e87ed452b5cecb9419473ee5560a989859fe1d10d1ce11ae87b09a2cb", size = 33703709, upload-time = "2026-02-05T10:24:46.469Z" },
+    { url = "https://files.pythonhosted.org/packages/92/ce/bd17ff5772938267fd49716e94ca24f616ff4cb1ff4c6be13085108037be/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0525a3d2c0b46c611e2130b5fdebc94cf404845d8fa64d2f3a3b679572a5bd22", size = 56016764, upload-time = "2026-02-05T10:26:48.904Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/b4/b7bcbf7c874665825a8c8e1097e93ea25d1f1d210a3e20d4451d01da30aa/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eb60e36b237b1ebd40a912da5384b348df8ed534f6f644d8e0b4f103e272ba7d", size = 35010236, upload-time = "2026-02-05T10:28:11.031Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/33/b5db29a6c00eb8f50708110d8d453747ca125c8b805bc437b289dbdcc057/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0bd48544f77c68b2941392fcdf9bcd2b9cdf00e98cb8c29b2455d194763cf99e", size = 60391106, upload-time = "2026-02-05T10:30:14.236Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/c3/52cfea47cd33e53e8c0fbd6e7c800b457245c1fda7d61660b4ffe9596a7f/opencv_python_headless-4.13.0.92-cp37-abi3-win32.whl", hash = "sha256:a7cf08e5b191f4ebb530791acc0825a7986e0d0dee2a3c491184bd8599848a4b", size = 30812232, upload-time = "2026-02-05T07:02:29.594Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/90/b338326131ccb2aaa3c2c85d00f41822c0050139a4bfe723cfd95455bd2d/opencv_python_headless-4.13.0.92-cp37-abi3-win_amd64.whl", hash = "sha256:77a82fe35ddcec0f62c15f2ba8a12ecc2ed4207c17b0902c7a3151ae29f37fb6", size = 40070414, upload-time = "2026-02-05T07:02:26.448Z" },
 ]
 
 [[package]]
@@ -2782,6 +3778,89 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/00/c2/ca5cef8e4cd8eec5a95deed95ec3f6005e499fd9d17ca08731ced03a6921/opentelemetry_semantic_conventions-0.47b0-py3-none-any.whl", hash = "sha256:4ff9d595b85a59c1c1413f02bba320ce7ea6bf9e2ead2b0913c4395c7bbc1063", size = 138027, upload-time = "2024-07-25T04:02:01.7Z" },
 ]
 
+[[package]]
+name = "orjson"
+version = "3.11.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7e/0c/964746fcafbd16f8ff53219ad9f6b412b34f345c75f384ad434ceaadb538/orjson-3.11.9.tar.gz", hash = "sha256:4fef17e1f8722c11587a6ef18e35902450221da0028e65dbaaa543619e68e48f", size = 5599163, upload-time = "2026-05-06T15:11:08.309Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/5d/b95ca542a001135cc250a49370f282f578c8f4e46cc8617d73775297eea8/orjson-3.11.9-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:135869ef917b8704ea0a94e01620e0c05021c15c52036e4663baffe75e72f8ce", size = 228986, upload-time = "2026-05-06T15:09:14.765Z" },
+    { url = "https://files.pythonhosted.org/packages/80/01/be33fbff646e22f93398429ea645f20d2097aea1a6cdc1e6628e70125f83/orjson-3.11.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:115ab5f5f4a0f203cc2a5f0fb09aee503a3f771aa08392949ab5ca230c4fbdbd", size = 132558, upload-time = "2026-05-06T15:09:17.431Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/61/73d49333bba660a075daccca10970dc6409ce1cf42ae4046646a19468aad/orjson-3.11.9-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4da3c38a2083ca4aaf9c2a36776cce3e9328e6647b10d118948f3cfb4913ffe4", size = 128213, upload-time = "2026-05-06T15:09:18.719Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/7d/30e844b3dac3f74aed66b1f984daf9db3c98c0328c03d965a9e8dc06449e/orjson-3.11.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53b50b0e14084b8f7e29c5ce84c5af0f1160169b30d8a6914231d97d2fe297d4", size = 135430, upload-time = "2026-05-06T15:09:20.257Z" },
+    { url = "https://files.pythonhosted.org/packages/16/64/bd815f5c610b3facc204f26ba94e87a9eb49b0d83de3d5fc1eee2402d91b/orjson-3.11.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:231742b4a11dad8d5380a435962c57e91b7c37b79be858f4ef1c0df1a259897e", size = 146178, upload-time = "2026-05-06T15:09:21.616Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/35/e744fd36c79b339d27beb06068b5a08a8882ef5418804d0ce545a31f718d/orjson-3.11.9-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:34fd2317602587321faab75ab76c623a0117e80841a6413654f04e47f339a8fb", size = 133068, upload-time = "2026-05-06T15:09:23.228Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/56/d54152b67b63a0b3e556cfc549d6ce84f74d7f425ddeadc6c8a74d913da7/orjson-3.11.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71f3db16e69b667b132e0f305a833d5497da302d801508cbb051ed9a9819da47", size = 134217, upload-time = "2026-05-06T15:09:24.847Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/ee/66154baf69f71c7164a268a5e888908aec5a0819d13c81d5e2755a257758/orjson-3.11.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0b34789fa0da61cf7bef0546b09c738fb195331e017e477096d129e9105ab03d", size = 141917, upload-time = "2026-05-06T15:09:26.647Z" },
+    { url = "https://files.pythonhosted.org/packages/09/d3/c5824260ca8b9d7ba82648d042a3f8f4815d18c15bb98a1f30edd1bb2d83/orjson-3.11.9-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:87e4d4ab280b0c87424d47695bec2182caf8cfc17879ea78dab76680194abc13", size = 415356, upload-time = "2026-05-06T15:09:28.252Z" },
+    { url = "https://files.pythonhosted.org/packages/64/cb/509c2e816fe4df641d93dc92f6a89adc8df3ada8ebdee2bd44aba3264c3c/orjson-3.11.9-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ace6c58523302d3b97b6ac5c38a5298a54b473762b6be82726b4265c41029f92", size = 148112, upload-time = "2026-05-06T15:09:29.783Z" },
+    { url = "https://files.pythonhosted.org/packages/db/b5/3ceae56d2e4962979eedb023ba6a46a4bb65f333960379be0ca470686220/orjson-3.11.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:97d0d932803c1b164fde11cb542a9efcb1e0f63b184537cca65887147906ff48", size = 137112, upload-time = "2026-05-06T15:09:31.432Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/7a/81fa3f2c7bef79b04cf2ab7838e5ac74b1f12511ceab979759b0275d6bb4/orjson-3.11.9-cp310-cp310-win32.whl", hash = "sha256:b3afcf569c15577a9fe64627292daa3e6b3a70f4fb77a5df246a87ec21681b94", size = 131706, upload-time = "2026-05-06T15:09:32.707Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/d8/b64600f9083c7f151ad39717a5877fccbeb0ef6d7efcb55f971ce00b6bee/orjson-3.11.9-cp310-cp310-win_amd64.whl", hash = "sha256:8697ab6a080a5c46edaad50e2bc5bd8c7ca5c66442d24104fa44ec74910a8244", size = 127282, upload-time = "2026-05-06T15:09:33.955Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/51/3fb9e65ae76ee97bd611869a503fa3fc0a6e81dd8b737cf3003f682df7ff/orjson-3.11.9-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:f01c4818b3fc9b0da8e096722a84318071eaa118df35f6ed2344da0e73a5444f", size = 228522, upload-time = "2026-05-06T15:09:35.362Z" },
+    { url = "https://files.pythonhosted.org/packages/16/fa/9d54b07cb3f3b0bfd57841478e42d7a0ece4a9f49f9907eecf5a45461687/orjson-3.11.9-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:3ebca4179031ee716ed076ffadc29428e900512f6fccee8614c9983157fcf19c", size = 128463, upload-time = "2026-05-06T15:09:37.063Z" },
+    { url = "https://files.pythonhosted.org/packages/88/b1/6ceafc2eefd0a553e3be77ce6c49d107e772485d9568629376171c50e634/orjson-3.11.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48ee05097750de0ff69ed5b7bbcf0732182fd57a24043dcc2a1da780a5ead3a5", size = 132306, upload-time = "2026-05-06T15:09:38.299Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/76/f11311285324a40aab1e3031385c50b635a7cd0734fdaf60c7e89a696f60/orjson-3.11.9-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a6082706765a95a6680d812e1daf1c0cfe8adec7831b3ff3b625693f3b461b1c", size = 127988, upload-time = "2026-05-06T15:09:39.597Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/85/0ef63bcf1337f44031ce9b91b1919563f62a37527b3ea4368bb15a22e5d7/orjson-3.11.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:277fefe9d76ee17eb14debf399e3533d4d63b5f677a4d3719eb763536af1f4bd", size = 135188, upload-time = "2026-05-06T15:09:40.957Z" },
+    { url = "https://files.pythonhosted.org/packages/05/94/b0d27090ea8a2095db3c2bd1b1c96f96f19bbb494d7fef33130e846e613d/orjson-3.11.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:03db380e3780fa0015ed776a90f20e8e20bb11dde13b216ce19e5718e3dfba62", size = 145937, upload-time = "2026-05-06T15:09:42.249Z" },
+    { url = "https://files.pythonhosted.org/packages/09/eb/75d50c29c05b8054013e221e598820a365c8e64065312e75e202ed880709/orjson-3.11.9-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33d7d766701847dc6729846362dc27895d2f2d2251264f9d10e7cb9878194877", size = 132758, upload-time = "2026-05-06T15:09:43.945Z" },
+    { url = "https://files.pythonhosted.org/packages/49/bd/360686f39348aa88827cb6fbf7dc606fd41c831a35235e1abf1db8e3a9e6/orjson-3.11.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:147302878da387104b66bb4a8b0227d1d487e976ce41a8501916161072ed87b1", size = 133971, upload-time = "2026-05-06T15:09:45.239Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/30/3178eb16f3221aeef068b6f1f1ebe05f656ea5c6dffe9f6c917329fe17a3/orjson-3.11.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3513550321f8c8c811a7c3297b8a630e82dc08e4c10216d07703c997776236cd", size = 141685, upload-time = "2026-05-06T15:09:46.858Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/f1/ff2f19ed0225f9680fafa42febca3570dd59444ebf190980738d376214c2/orjson-3.11.9-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:c5d001196b89fa9cf0a4ab79766cd835b991a166e4b621ba95089edc50c429ff", size = 415167, upload-time = "2026-05-06T15:09:48.312Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/61/863bddf0da6e9e586765414debd54b4e58db05f560902b6d00658cb88636/orjson-3.11.9-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:16969c9d369c98eb084889c6e4d2d39b77c7eb38ceccf8da2a9fff62ae908980", size = 147913, upload-time = "2026-05-06T15:09:49.733Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/8a/4081492586d75b073d60c5271a8d0f05a0955cabf1e34c8473f6fcd84235/orjson-3.11.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:63e0efbc991250c0b3143488fa57d95affcabbfc63c99c48d625dd37779aafe2", size = 136959, upload-time = "2026-05-06T15:09:51.311Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/bd/70b6ab193594d7abb875320c0a7c8335e846f28968c432c31042409c3c8d/orjson-3.11.9-cp311-cp311-win32.whl", hash = "sha256:14ed654580c1ed2bc217352ec82f91b047aef82951aa71c7f64e0dcb03c0e180", size = 131533, upload-time = "2026-05-06T15:09:52.637Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/17/1a1a228183d62d1b77e2c30d210f47dd4768b310ebe1607c63e3c0e3a71e/orjson-3.11.9-cp311-cp311-win_amd64.whl", hash = "sha256:57ea77fb70a448ce87d18fca050193202a3da5e54598f6501ca5476fb66cfe02", size = 127106, upload-time = "2026-05-06T15:09:54.204Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/95/285de5fa296d09681ee9c546cd4a8aeb773b701cf343dc125994f4d52953/orjson-3.11.9-cp311-cp311-win_arm64.whl", hash = "sha256:19b72ed11572a2ee51a67a903afbe5af504f84ed6f529c0fe44b0ab3fb5cc697", size = 126848, upload-time = "2026-05-06T15:09:55.551Z" },
+    { url = "https://files.pythonhosted.org/packages/16/6d/11867a3ffa3a3608d84a4de51ef4dd0896d6b5cc9132fbe1daf593e677bc/orjson-3.11.9-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:9ef6fe90aadef185c7b128859f40beb24720b4ecea95379fc9000931179c3a49", size = 228515, upload-time = "2026-05-06T15:09:57.265Z" },
+    { url = "https://files.pythonhosted.org/packages/24/75/05912954c8b288f34fcf5cd4b9b071cb4f6e77b9961e175e56ebb258089f/orjson-3.11.9-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:e5c9b8f28e726e97d97696c826bc7bea5d71cecd63576dba92924a32c1961291", size = 128409, upload-time = "2026-05-06T15:09:59.063Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/86/1c3a47df3bc8191ea9ac51603bbb872a95167a364320c269f2557911f406/orjson-3.11.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26a473dbb4162108b27901492546f83c76fdcea3d0eadff00ae7a07e18dcce09", size = 132106, upload-time = "2026-05-06T15:10:00.798Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/cf/b33b5f3e695ae7d63feef9d915c37cc3b8f465493dcd4f8e0b4c697a2366/orjson-3.11.9-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:011382e2a60fda9d46f1cdee31068cfc52ffe952b587d683ec0463002802a0f4", size = 127864, upload-time = "2026-05-06T15:10:02.15Z" },
+    { url = "https://files.pythonhosted.org/packages/31/6a/6cf69385a58208024fcb8c014e2141b8ce838aba6492b589f8acfff97fab/orjson-3.11.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c2d3dc759490128c5c1711a53eeaa8ee1d437fd0038ffd2b6008abf46db3f882", size = 135213, upload-time = "2026-05-06T15:10:03.515Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/f8/0b1bd3e8f2efcdd376af5c8cfd79eaf13f018080c0089c80ebd724e3c7fb/orjson-3.11.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d8ea516b3726d190e1b4297e6f4e7a8650347ae053868a18163b4dd3641d1fff", size = 145994, upload-time = "2026-05-06T15:10:05.083Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/59/dab79f61044c529d2c81aecdc589b1f833a1c8dec11ba3b1c2498a02ca7e/orjson-3.11.9-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:380cdce7ba24989af81d0a7013d0aaec5d0e2a21734c0e2681b1bc4f141957fe", size = 132744, upload-time = "2026-05-06T15:10:06.853Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/a4/82b7a2fe5d8a67a59ed831b24d59a3d46ea7d207b66e1602d376541d94a6/orjson-3.11.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4fa4f0af7fa18951f7ab3fc2148e223af211bf03f59e1c6034ec3f97f21d61", size = 134014, upload-time = "2026-05-06T15:10:08.213Z" },
+    { url = "https://files.pythonhosted.org/packages/50/c7/375e83a76851b73b2e39f3bcf0e5a19e2b89bad13e5bca97d0b293d27f24/orjson-3.11.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a8f5f8bc7ce7d59f08d9f99fa510c06496164a24cb5f3d34537dbd9ca30132e2", size = 141509, upload-time = "2026-05-06T15:10:09.595Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/7c/49d5d82a3d3097f641f094f552131f1e2723b0b8cb0fa2874ab65ecfffa6/orjson-3.11.9-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:4d7fde5501b944f83b3e665e1b31343ff6e154b15560a16b7130ea1e594a4206", size = 415127, upload-time = "2026-05-06T15:10:11.049Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/dc/7446c538590d55f455647e5f3c61fc33f7108714e7afcffa6a2a033f8350/orjson-3.11.9-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cde1a448023ba7d5bb4c01c5afb48894380b5e4956e0627266526587ef4e535f", size = 148025, upload-time = "2026-05-06T15:10:12.842Z" },
+    { url = "https://files.pythonhosted.org/packages/df/e5/4d2d8af06f788329b4f78f8cc3679bb395392fcaa1e4d8d3c33e85308fa4/orjson-3.11.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:71e63adb0e1f1ed5d9e168f50a91ceb93ae6420731d222dc7da5c69409aa47aa", size = 136943, upload-time = "2026-05-06T15:10:14.405Z" },
+    { url = "https://files.pythonhosted.org/packages/06/69/850264ccf6d80f6b174620d30a87f65c9b1490aba33fe6b62798e618cad3/orjson-3.11.9-cp312-cp312-win32.whl", hash = "sha256:2d057a602cdd19a0ad680417527c45b6961a095081c0f46fe0e03e304aac6470", size = 131606, upload-time = "2026-05-06T15:10:15.791Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/d5/973a43fc9c55e20f2051e9830997649f669be0cb3ca52192087c0143f118/orjson-3.11.9-cp312-cp312-win_amd64.whl", hash = "sha256:59e403b1cc5a676da8eaf31f6254801b7341b3e29efa85f92b48d272637e77be", size = 127101, upload-time = "2026-05-06T15:10:17.129Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/ae/495470f0e4a18f73fa10b7f6b84b464ec4cc5291c4e0c7c2a6c400bef006/orjson-3.11.9-cp312-cp312-win_arm64.whl", hash = "sha256:9af678d6488357948f1f84c6cd1c1d397c014e1ae2f98ae082a44eb48f602624", size = 126736, upload-time = "2026-05-06T15:10:18.645Z" },
+]
+
+[[package]]
+name = "outlines-core"
+version = "0.2.11"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1a/d3/e04e9145f8f806723dec9b9e5227ad695a3efcd3ced7794cf7c22b15df5e/outlines_core-0.2.11.tar.gz", hash = "sha256:dfce56f717ff5083e54cbcfdb66cad243365437fccbb5509adaa7e31e030f1d8", size = 197263, upload-time = "2025-05-19T10:12:51.719Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cf/8f/83c83e2afd142067c7f3cf2e152809195eee72d6a9b6c8745f13b827273d/outlines_core-0.2.11-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:89d79d8454b321f60047541a896d410ca9db631d241960266c4fe839cf5cd1b1", size = 1961650, upload-time = "2025-05-19T10:11:53.12Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/e9/c6b99b4364b7026b71badc06b9809a2fc4154d6b0c475bc03ab4471f81e5/outlines_core-0.2.11-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:44d581893f8644da02db7be11887229a40d26077cbdd22072ad1ed1db0ad0b2d", size = 2133920, upload-time = "2025-05-19T10:11:55.15Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/b8/cfa2bd8e1260eb1870c42a1a34389e9673a12335d09004ea6f1c82266a5e/outlines_core-0.2.11-cp310-cp310-macosx_15_0_arm64.whl", hash = "sha256:e88b7f717915d91136d915adb65c2603d2aa6457ec3fc336884bdb0b28d3188a", size = 1960688, upload-time = "2025-05-19T10:11:56.773Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/02/4cffd04e360e315b060692bf1a80f84bac1671ef90f12daf765db6d68791/outlines_core-0.2.11-cp310-cp310-macosx_15_0_x86_64.whl", hash = "sha256:8c7ecdba2162e9b30b837251387c26b1a23f80f58d01d02e7600e4b1962c5333", size = 2130263, upload-time = "2025-05-19T10:11:58.1Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/85/69a450a486824026eca181a8d573aae3ecfdb25f0c2af852065dde17a372/outlines_core-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5fcefd221c10c95ce74838869450c6fdbbe2f581f0ba27e57a95232bd88c3a", size = 2289453, upload-time = "2025-05-19T10:11:59.919Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/3c/d7cb3eac6870a68b9034854fbfa07e67abfa1fa0d92198b9fee83fe6d044/outlines_core-0.2.11-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a3c7774b112106f3afe931c65637fb3e0725d43707ceff1d34d6899cf0fa8200", size = 2115289, upload-time = "2025-05-19T10:12:01.527Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/5f/4cef22e2cf1ec286bd78c0052a0fa7ecf8519144477e7d4e276cbd70c625/outlines_core-0.2.11-cp310-cp310-win32.whl", hash = "sha256:1cfbb4cdcf34be5c6b08d279928b2b1050ed4c5e96e6e8405e3e624305c6799e", size = 1768059, upload-time = "2025-05-19T10:12:03.058Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/3a/ce6aceb6545bb1e13cf05c1f34468c5c14c8c8be92cdabcf777b4bb067ef/outlines_core-0.2.11-cp310-cp310-win_amd64.whl", hash = "sha256:670c1c1fca26fb5c7f00dbb11d1f81cca4204863c3dfdeee82017a6846397bf9", size = 2062413, upload-time = "2025-05-19T10:12:05.097Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/ca/d5e92e197b40f62deb46dcc55567a51c8bf37943df7bc6658d93f30740f1/outlines_core-0.2.11-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:e96b8d0b56afcd3b86f4efca466c578f3725da1148ef62423249c92993841762", size = 1961746, upload-time = "2025-05-19T10:12:06.723Z" },
+    { url = "https://files.pythonhosted.org/packages/02/b2/f3d6e7e37ebe1de3c345b53d8dc01e9b5c5f05b20e494fe94bf8972db4b0/outlines_core-0.2.11-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:d108ee8cd5e2fe71c2b0720b949d004901fec8bdb64bcd0c01b8abe38ab7ae1c", size = 2133815, upload-time = "2025-05-19T10:12:07.934Z" },
+    { url = "https://files.pythonhosted.org/packages/07/21/62a680da6941b53d765160d22bdcf35849c22b7a987f4e9e8b7db7885c9f/outlines_core-0.2.11-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:ebf42ab5b7ae38235d3c3333b5cacd6e91449b87b8a48a85094ea28ad9de9878", size = 1960539, upload-time = "2025-05-19T10:12:09.23Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/57/20cfb402aee1a7be0e08d861349570255ad2d17ba7fe7f8fd5706326588c/outlines_core-0.2.11-cp311-cp311-macosx_15_0_x86_64.whl", hash = "sha256:fd4305ff8418d14059d95dc3276ca96ba1b5aa499908e1af8bb3c7207aa7ac68", size = 2129894, upload-time = "2025-05-19T10:12:10.534Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/db/32c6e1170f139420e948fdd18a09a6175244bc0760dcf4dc2470e18411b9/outlines_core-0.2.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:132605b8dd1e3d1369da6a851992dd357f6376068292f6bd47caa7a28b794d19", size = 2289078, upload-time = "2025-05-19T10:12:12.118Z" },
+    { url = "https://files.pythonhosted.org/packages/25/c3/b6e6f4e08fa84d2424f82705a6dc47fee33cb91989010fa678736957dcf6/outlines_core-0.2.11-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:b31d5fc83b78aad282dd667b8d6e684614481fe08a7609ce0ce45dee64cd2991", size = 2115075, upload-time = "2025-05-19T10:12:13.761Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/9b/b84c4933e4f35b34e9b23fadd63a365ad8563cc7561d8528b33de4ee8102/outlines_core-0.2.11-cp311-cp311-win32.whl", hash = "sha256:3e316a79f3ecfa12c17746edebcbd66538ee22a43986982f6b96166fb94ee6b1", size = 1768254, upload-time = "2025-05-19T10:12:15.02Z" },
+    { url = "https://files.pythonhosted.org/packages/99/5b/380c933c65ca9744c163fe4a3702ad7f3e9ca02e09ac84a09b6837cff9b6/outlines_core-0.2.11-cp311-cp311-win_amd64.whl", hash = "sha256:c260a042b5854ff69291649cfd112066e6bab0dad0bb9cec8a6c3705ef3a59cd", size = 2062167, upload-time = "2025-05-19T10:12:16.443Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/2c/c7636823244c70e2960060bf9bd978248dffb55c5e7c91c46d18354b2a24/outlines_core-0.2.11-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:4a9db4872bae083631d720994f4cee603bce0536b33d5a988814576863b657cf", size = 1957668, upload-time = "2025-05-19T10:12:18.29Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/09/5c62047da139d722317a444a4d01cd5f11943a8c2eaecce784341dd0844a/outlines_core-0.2.11-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8359a45c59f6a8f2eb717245806501a59044c75f6ea8bd08faaa131cc8cdec45", size = 2130493, upload-time = "2025-05-19T10:12:19.537Z" },
+    { url = "https://files.pythonhosted.org/packages/89/7a/d6a2810f90e37d550168e0c0a9a915086ea721444727e3ca2c630898d1ef/outlines_core-0.2.11-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:5d26a46591377340e0b870b8a96ea8341058341a62ee0bded9098e0c88dd24f4", size = 1956804, upload-time = "2025-05-19T10:12:20.755Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/ea/339e6c273b5581128c3b7ca27d428d8993c3085912af1a467aa32ef0e9d1/outlines_core-0.2.11-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:ae460a34675fb11d92a5c605a480fbae4cd6c1b2d11b3698da64a7fcaba64dcf", size = 2127085, upload-time = "2025-05-19T10:12:22.02Z" },
+    { url = "https://files.pythonhosted.org/packages/92/c7/a65d1fddf49830ebc41422294eacde35286d9f68994a8aa905cb14f5aade/outlines_core-0.2.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86df9740368866295077346440d911df4972da2b3f1f54b8125e6f329e8a8891", size = 2287677, upload-time = "2025-05-19T10:12:24.24Z" },
+    { url = "https://files.pythonhosted.org/packages/23/79/8795aed8be9b77dd69d78e7cfbfcf28c179e6b08da6e56bbbf48a09fe55f/outlines_core-0.2.11-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:96ce4dd78f106799be4a0a5795cefd1352806162973756a4b6fce4bb6eddd7e4", size = 2113000, upload-time = "2025-05-19T10:12:25.446Z" },
+    { url = "https://files.pythonhosted.org/packages/59/e3/cbe9294b06d92ee1892dbb6f2125d833d68e8629d45d080d6daba54eec2d/outlines_core-0.2.11-cp312-cp312-win32.whl", hash = "sha256:358db161cce3650ba822e118dcf0a1efa571c7deb4864ab9d64ca2c9cca7425d", size = 1765703, upload-time = "2025-05-19T10:12:26.693Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/c9/ed3cf362515fac16e313368b9b2f2497051f4ded88679205830b6f889f54/outlines_core-0.2.11-cp312-cp312-win_amd64.whl", hash = "sha256:231f9d20d2630c70665345821780d7808b29539620a75c99f65113b518c51032", size = 2060945, upload-time = "2025-05-19T10:12:28.294Z" },
+]
+
 [[package]]
 name = "packaging"
 version = "24.2"
@@ -2848,6 +3927,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/85/8d/eef3d8cdccc32abdd91b1286884c99b8c3a6d3b135affcc2a7a0f383bb32/parse_type-0.6.6-py2.py3-none-any.whl", hash = "sha256:3ca79bbe71e170dfccc8ec6c341edfd1c2a0fc1e5cfd18330f93af938de2348c", size = 27085, upload-time = "2025-08-11T22:53:46.396Z" },
 ]
 
+[[package]]
+name = "partial-json-parser"
+version = "0.2.1.1.post7"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6a/6d/eed37d7ebc1e0bcd27b831c0cf1fe94881934316187c4b30d23f29ea0bd4/partial_json_parser-0.2.1.1.post7.tar.gz", hash = "sha256:86590e1ba6bcb6739a2dfc17d2323f028cb5884f4c6ce23db376999132c9a922", size = 10296, upload-time = "2025-11-17T07:27:41.202Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/42/32/658973117bf0fd82a24abbfb94fe73a5e86216e49342985e10acce54775a/partial_json_parser-0.2.1.1.post7-py3-none-any.whl", hash = "sha256:145119e5eabcf80cbb13844a6b50a85c68bf99d376f8ed771e2a3c3b03e653ae", size = 10877, upload-time = "2025-11-17T07:27:40.457Z" },
+]
+
 [[package]]
 name = "pathspec"
 version = "0.12.1"
@@ -2878,10 +3966,77 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/9d/5f95bfb298c8d3b4e3a107701f9a4e7774a0d4d1f8eb0c9d5420b80f7c9d/peft-0.13.2-py3-none-any.whl", hash = "sha256:d4e0951ec78eac11c45a051801c569913436888c578d48e5ce86996b715bc6ef", size = 320731, upload-time = "2024-10-11T11:42:18.905Z" },
 ]
 
+[[package]]
+name = "pillow"
+version = "11.3.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.12' and sys_platform != 'win32'",
+    "python_full_version > '3.11' and python_full_version < '3.12' and sys_platform != 'win32'",
+    "python_full_version == '3.11' and sys_platform != 'win32'",
+    "python_full_version >= '3.12' and sys_platform == 'win32'",
+    "python_full_version > '3.11' and python_full_version < '3.12' and sys_platform == 'win32'",
+    "python_full_version == '3.11' and sys_platform == 'win32'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3d6767bb38a8fc10e33796ba4ba210cbab9354b6d238/pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523", size = 47113069, upload-time = "2025-07-01T09:16:30.666Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4c/5d/45a3553a253ac8763f3561371432a90bdbe6000fbdcf1397ffe502aa206c/pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860", size = 5316554, upload-time = "2025-07-01T09:13:39.342Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/c8/67c12ab069ef586a25a4a79ced553586748fad100c77c0ce59bb4983ac98/pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad", size = 4686548, upload-time = "2025-07-01T09:13:41.835Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/bd/6741ebd56263390b382ae4c5de02979af7f8bd9807346d068700dd6d5cf9/pillow-11.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7107195ddc914f656c7fc8e4a5e1c25f32e9236ea3ea860f257b0436011fddd0", size = 5859742, upload-time = "2025-07-03T13:09:47.439Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/0b/c412a9e27e1e6a829e6ab6c2dca52dd563efbedf4c9c6aa453d9a9b77359/pillow-11.3.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc3e831b563b3114baac7ec2ee86819eb03caa1a2cef0b481a5675b59c4fe23b", size = 7633087, upload-time = "2025-07-03T13:09:51.796Z" },
+    { url = "https://files.pythonhosted.org/packages/59/9d/9b7076aaf30f5dd17e5e5589b2d2f5a5d7e30ff67a171eb686e4eecc2adf/pillow-11.3.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f182ebd2303acf8c380a54f615ec883322593320a9b00438eb842c1f37ae50", size = 5963350, upload-time = "2025-07-01T09:13:43.865Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/16/1a6bf01fb622fb9cf5c91683823f073f053005c849b1f52ed613afcf8dae/pillow-11.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4445fa62e15936a028672fd48c4c11a66d641d2c05726c7ec1f8ba6a572036ae", size = 6631840, upload-time = "2025-07-01T09:13:46.161Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/e6/6ff7077077eb47fde78739e7d570bdcd7c10495666b6afcd23ab56b19a43/pillow-11.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:71f511f6b3b91dd543282477be45a033e4845a40278fa8dcdbfdb07109bf18f9", size = 6074005, upload-time = "2025-07-01T09:13:47.829Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/3a/b13f36832ea6d279a697231658199e0a03cd87ef12048016bdcc84131601/pillow-11.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:040a5b691b0713e1f6cbe222e0f4f74cd233421e105850ae3b3c0ceda520f42e", size = 6708372, upload-time = "2025-07-01T09:13:52.145Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/e4/61b2e1a7528740efbc70b3d581f33937e38e98ef3d50b05007267a55bcb2/pillow-11.3.0-cp310-cp310-win32.whl", hash = "sha256:89bd777bc6624fe4115e9fac3352c79ed60f3bb18651420635f26e643e3dd1f6", size = 6277090, upload-time = "2025-07-01T09:13:53.915Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/d3/60c781c83a785d6afbd6a326ed4d759d141de43aa7365725cbcd65ce5e54/pillow-11.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:19d2ff547c75b8e3ff46f4d9ef969a06c30ab2d4263a9e287733aa8b2429ce8f", size = 6985988, upload-time = "2025-07-01T09:13:55.699Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/28/4f4a0203165eefb3763939c6789ba31013a2e90adffb456610f30f613850/pillow-11.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:819931d25e57b513242859ce1876c58c59dc31587847bf74cfe06b2e0cb22d2f", size = 2422899, upload-time = "2025-07-01T09:13:57.497Z" },
+    { url = "https://files.pythonhosted.org/packages/db/26/77f8ed17ca4ffd60e1dcd220a6ec6d71210ba398cfa33a13a1cd614c5613/pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722", size = 5316531, upload-time = "2025-07-01T09:13:59.203Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/39/ee475903197ce709322a17a866892efb560f57900d9af2e55f86db51b0a5/pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288", size = 4686560, upload-time = "2025-07-01T09:14:01.101Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/90/442068a160fd179938ba55ec8c97050a612426fae5ec0a764e345839f76d/pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d", size = 5870978, upload-time = "2025-07-03T13:09:55.638Z" },
+    { url = "https://files.pythonhosted.org/packages/13/92/dcdd147ab02daf405387f0218dcf792dc6dd5b14d2573d40b4caeef01059/pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494", size = 7641168, upload-time = "2025-07-03T13:10:00.37Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/db/839d6ba7fd38b51af641aa904e2960e7a5644d60ec754c046b7d2aee00e5/pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58", size = 5973053, upload-time = "2025-07-01T09:14:04.491Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/2f/d7675ecae6c43e9f12aa8d58b6012683b20b6edfbdac7abcb4e6af7a3784/pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f", size = 6640273, upload-time = "2025-07-01T09:14:06.235Z" },
+    { url = "https://files.pythonhosted.org/packages/45/ad/931694675ede172e15b2ff03c8144a0ddaea1d87adb72bb07655eaffb654/pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e", size = 6082043, upload-time = "2025-07-01T09:14:07.978Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/04/ba8f2b11fc80d2dd462d7abec16351b45ec99cbbaea4387648a44190351a/pillow-11.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:932c754c2d51ad2b2271fd01c3d121daaa35e27efae2a616f77bf164bc0b3e94", size = 6715516, upload-time = "2025-07-01T09:14:10.233Z" },
+    { url = "https://files.pythonhosted.org/packages/48/59/8cd06d7f3944cc7d892e8533c56b0acb68399f640786313275faec1e3b6f/pillow-11.3.0-cp311-cp311-win32.whl", hash = "sha256:b4b8f3efc8d530a1544e5962bd6b403d5f7fe8b9e08227c6b255f98ad82b4ba0", size = 6274768, upload-time = "2025-07-01T09:14:11.921Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/cc/29c0f5d64ab8eae20f3232da8f8571660aa0ab4b8f1331da5c2f5f9a938e/pillow-11.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:1a992e86b0dd7aeb1f053cd506508c0999d710a8f07b4c791c63843fc6a807ac", size = 6986055, upload-time = "2025-07-01T09:14:13.623Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/df/90bd886fabd544c25addd63e5ca6932c86f2b701d5da6c7839387a076b4a/pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd", size = 2423079, upload-time = "2025-07-01T09:14:15.268Z" },
+    { url = "https://files.pythonhosted.org/packages/40/fe/1bc9b3ee13f68487a99ac9529968035cca2f0a51ec36892060edcc51d06a/pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4", size = 5278800, upload-time = "2025-07-01T09:14:17.648Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/32/7e2ac19b5713657384cec55f89065fb306b06af008cfd87e572035b27119/pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69", size = 4686296, upload-time = "2025-07-01T09:14:19.828Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/1e/b9e12bbe6e4c2220effebc09ea0923a07a6da1e1f1bfbc8d7d29a01ce32b/pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d", size = 5871726, upload-time = "2025-07-03T13:10:04.448Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/33/e9200d2bd7ba00dc3ddb78df1198a6e80d7669cce6c2bdbeb2530a74ec58/pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6", size = 7644652, upload-time = "2025-07-03T13:10:10.391Z" },
+    { url = "https://files.pythonhosted.org/packages/41/f1/6f2427a26fc683e00d985bc391bdd76d8dd4e92fac33d841127eb8fb2313/pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7", size = 5977787, upload-time = "2025-07-01T09:14:21.63Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/c9/06dd4a38974e24f932ff5f98ea3c546ce3f8c995d3f0985f8e5ba48bba19/pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024", size = 6645236, upload-time = "2025-07-01T09:14:23.321Z" },
+    { url = "https://files.pythonhosted.org/packages/40/e7/848f69fb79843b3d91241bad658e9c14f39a32f71a301bcd1d139416d1be/pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809", size = 6086950, upload-time = "2025-07-01T09:14:25.237Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/1a/7cff92e695a2a29ac1958c2a0fe4c0b2393b60aac13b04a4fe2735cad52d/pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d", size = 6723358, upload-time = "2025-07-01T09:14:27.053Z" },
+    { url = "https://files.pythonhosted.org/packages/26/7d/73699ad77895f69edff76b0f332acc3d497f22f5d75e5360f78cbcaff248/pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149", size = 6275079, upload-time = "2025-07-01T09:14:30.104Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d", size = 6986324, upload-time = "2025-07-01T09:14:31.899Z" },
+    { url = "https://files.pythonhosted.org/packages/16/8f/b13447d1bf0b1f7467ce7d86f6e6edf66c0ad7cf44cf5c87a37f9bed9936/pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542", size = 2423067, upload-time = "2025-07-01T09:14:33.709Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/8b/209bd6b62ce8367f47e68a218bffac88888fdf2c9fcf1ecadc6c3ec1ebc7/pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967", size = 5270556, upload-time = "2025-07-01T09:16:09.961Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/e6/231a0b76070c2cfd9e260a7a5b504fb72da0a95279410fa7afd99d9751d6/pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe", size = 4654625, upload-time = "2025-07-01T09:16:11.913Z" },
+    { url = "https://files.pythonhosted.org/packages/13/f4/10cf94fda33cb12765f2397fc285fa6d8eb9c29de7f3185165b702fc7386/pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c", size = 4874207, upload-time = "2025-07-03T13:11:10.201Z" },
+    { url = "https://files.pythonhosted.org/packages/72/c9/583821097dc691880c92892e8e2d41fe0a5a3d6021f4963371d2f6d57250/pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d000f46e2917c705e9fb93a3606ee4a819d1e3aa7a9b442f6444f07e77cf5e25", size = 6583939, upload-time = "2025-07-03T13:11:15.68Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/8e/5c9d410f9217b12320efc7c413e72693f48468979a013ad17fd690397b9a/pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:527b37216b6ac3a12d7838dc3bd75208ec57c1c6d11ef01902266a5a0c14fc27", size = 4957166, upload-time = "2025-07-01T09:16:13.74Z" },
+    { url = "https://files.pythonhosted.org/packages/62/bb/78347dbe13219991877ffb3a91bf09da8317fbfcd4b5f9140aeae020ad71/pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be5463ac478b623b9dd3937afd7fb7ab3d79dd290a28e2b6df292dc75063eb8a", size = 5581482, upload-time = "2025-07-01T09:16:16.107Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/28/1000353d5e61498aaeaaf7f1e4b49ddb05f2c6575f9d4f9f914a3538b6e1/pillow-11.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8dc70ca24c110503e16918a658b869019126ecfe03109b754c402daff12b3d9f", size = 6984596, upload-time = "2025-07-01T09:16:18.07Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/e3/6fa84033758276fb31da12e5fb66ad747ae83b93c67af17f8c6ff4cc8f34/pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6", size = 5270566, upload-time = "2025-07-01T09:16:19.801Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/ee/e8d2e1ab4892970b561e1ba96cbd59c0d28cf66737fc44abb2aec3795a4e/pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438", size = 4654618, upload-time = "2025-07-01T09:16:21.818Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/6d/17f80f4e1f0761f02160fc433abd4109fa1548dcfdca46cfdadaf9efa565/pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3", size = 4874248, upload-time = "2025-07-03T13:11:20.738Z" },
+    { url = "https://files.pythonhosted.org/packages/de/5f/c22340acd61cef960130585bbe2120e2fd8434c214802f07e8c03596b17e/pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c", size = 6583963, upload-time = "2025-07-03T13:11:26.283Z" },
+    { url = "https://files.pythonhosted.org/packages/31/5e/03966aedfbfcbb4d5f8aa042452d3361f325b963ebbadddac05b122e47dd/pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361", size = 4957170, upload-time = "2025-07-01T09:16:23.762Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/2d/e082982aacc927fc2cab48e1e731bdb1643a1406acace8bed0900a61464e/pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7", size = 5581505, upload-time = "2025-07-01T09:16:25.593Z" },
+    { url = "https://files.pythonhosted.org/packages/34/e7/ae39f538fd6844e982063c3a5e4598b8ced43b9633baa3a85ef33af8c05c/pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8", size = 6984598, upload-time = "2025-07-01T09:16:27.732Z" },
+]
+
 [[package]]
 name = "pillow"
 version = "12.0.0"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'win32'",
+]
 sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/cace85a1b0c9775a9f8f5d5423c8261c858760e2466c79b2dd184638b056/pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353", size = 47008828, upload-time = "2025-10-15T18:24:14.008Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/5d/08/26e68b6b5da219c2a2cb7b563af008b53bb8e6b6fcb3fa40715fcdb2523a/pillow-12.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:3adfb466bbc544b926d50fe8f4a4e6abd8c6bffd28a26177594e6e9b2b76572b", size = 5289809, upload-time = "2025-10-15T18:21:27.791Z" },
@@ -3051,6 +4206,10 @@ wheels = [
 name = "protobuf"
 version = "4.25.8"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'win32'",
+]
 sdist = { url = "https://files.pythonhosted.org/packages/df/01/34c8d2b6354906d728703cb9d546a0e534de479e25f1b581e4094c4a85cc/protobuf-4.25.8.tar.gz", hash = "sha256:6135cf8affe1fc6f76cced2641e4ea8d3e59518d1f24ae41ba97bcad82d397cd", size = 380920, upload-time = "2025-05-28T14:22:25.153Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/45/ff/05f34305fe6b85bbfbecbc559d423a5985605cad5eda4f47eae9e9c9c5c5/protobuf-4.25.8-cp310-abi3-win32.whl", hash = "sha256:504435d831565f7cfac9f0714440028907f1975e4bed228e58e72ecfff58a1e0", size = 392745, upload-time = "2025-05-28T14:22:10.524Z" },
@@ -3061,6 +4220,28 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0c/c1/6aece0ab5209981a70cd186f164c133fdba2f51e124ff92b73de7fd24d78/protobuf-4.25.8-py3-none-any.whl", hash = "sha256:15a0af558aa3b13efef102ae6e4f3efac06f1eea11afb3a57db2901447d9fb59", size = 156757, upload-time = "2025-05-28T14:22:24.135Z" },
 ]
 
+[[package]]
+name = "protobuf"
+version = "5.29.6"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.12' and sys_platform != 'win32'",
+    "python_full_version > '3.11' and python_full_version < '3.12' and sys_platform != 'win32'",
+    "python_full_version == '3.11' and sys_platform != 'win32'",
+    "python_full_version >= '3.12' and sys_platform == 'win32'",
+    "python_full_version > '3.11' and python_full_version < '3.12' and sys_platform == 'win32'",
+    "python_full_version == '3.11' and sys_platform == 'win32'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7e/57/394a763c103e0edf87f0938dafcd918d53b4c011dfc5c8ae80f3b0452dbb/protobuf-5.29.6.tar.gz", hash = "sha256:da9ee6a5424b6b30fd5e45c5ea663aef540ca95f9ad99d1e887e819cdf9b8723", size = 425623, upload-time = "2026-02-04T22:54:40.584Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d4/88/9ee58ff7863c479d6f8346686d4636dd4c415b0cbeed7a6a7d0617639c2a/protobuf-5.29.6-cp310-abi3-win32.whl", hash = "sha256:62e8a3114992c7c647bce37dcc93647575fc52d50e48de30c6fcb28a6a291eb1", size = 423357, upload-time = "2026-02-04T22:54:25.805Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/66/2dc736a4d576847134fb6d80bd995c569b13cdc7b815d669050bf0ce2d2c/protobuf-5.29.6-cp310-abi3-win_amd64.whl", hash = "sha256:7e6ad413275be172f67fdee0f43484b6de5a904cc1c3ea9804cb6fe2ff366eda", size = 435175, upload-time = "2026-02-04T22:54:28.592Z" },
+    { url = "https://files.pythonhosted.org/packages/06/db/49b05966fd208ae3f44dcd33837b6243b4915c57561d730a43f881f24dea/protobuf-5.29.6-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:b5a169e664b4057183a34bdc424540e86eea47560f3c123a0d64de4e137f9269", size = 418619, upload-time = "2026-02-04T22:54:30.266Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/d7/48cbf6b0c3c39761e47a99cb483405f0fde2be22cf00d71ef316ce52b458/protobuf-5.29.6-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:a8866b2cff111f0f863c1b3b9e7572dc7eaea23a7fae27f6fc613304046483e6", size = 320284, upload-time = "2026-02-04T22:54:31.782Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/dd/cadd6ec43069247d91f6345fa7a0d2858bef6af366dbd7ba8f05d2c77d3b/protobuf-5.29.6-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:e3387f44798ac1106af0233c04fb8abf543772ff241169946f698b3a9a3d3ab9", size = 320478, upload-time = "2026-02-04T22:54:32.909Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/cb/e3065b447186cb70aa65acc70c86baf482d82bf75625bf5a2c4f6919c6a3/protobuf-5.29.6-py3-none-any.whl", hash = "sha256:6b9edb641441b2da9fa8f428760fc136a49cf97a52076010cf22a2ff73438a86", size = 173126, upload-time = "2026-02-04T22:54:39.462Z" },
+]
+
 [[package]]
 name = "psutil"
 version = "7.1.3"
@@ -3142,6 +4323,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f6/f0/10642828a8dfb741e5f3fbaac830550a518a775c7fff6f04a007259b0548/py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378", size = 98708, upload-time = "2021-11-04T17:17:00.152Z" },
 ]
 
+[[package]]
+name = "py-cpuinfo"
+version = "9.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/37/a8/d832f7293ebb21690860d2e01d8115e5ff6f2ae8bbdc953f0eb0fa4bd2c7/py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690", size = 104716, upload-time = "2022-10-25T20:38:06.303Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" },
+]
+
 [[package]]
 name = "pyarrow"
 version = "17.0.0"
@@ -3195,6 +4385,102 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" },
 ]
 
+[[package]]
+name = "pybase64"
+version = "1.4.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/aa/b8/4ed5c7ad5ec15b08d35cc79ace6145d5c1ae426e46435f4987379439dfea/pybase64-1.4.3.tar.gz", hash = "sha256:c2ed274c9e0ba9c8f9c4083cfe265e66dd679126cd9c2027965d807352f3f053", size = 137272, upload-time = "2025-12-06T13:27:04.013Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/39/47/16d7af6fae7803f4c691856bc0d8d433ccf30e106432e2ef7707ee19a38a/pybase64-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f63aa7f29139b8a05ce5f97cdb7fad63d29071e5bdc8a638a343311fe996112a", size = 38241, upload-time = "2025-12-06T13:22:27.396Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/3e/268beb8d2240ab55396af4d1b45d2494935982212549b92a5f5b57079bd3/pybase64-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f5943ec1ae87a8b4fe310905bb57205ea4330c75e2c628433a7d9dd52295b588", size = 31672, upload-time = "2025-12-06T13:22:28.854Z" },
+    { url = "https://files.pythonhosted.org/packages/80/14/4365fa33222edcc46b6db4973f9e22bda82adfb6ab2a01afff591f1e41c8/pybase64-1.4.3-cp310-cp310-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:5f2b8aef86f35cd5894c13681faf433a1fffc5b2e76544dcb5416a514a1a8347", size = 65978, upload-time = "2025-12-06T13:22:30.191Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/22/e89739d8bc9b96c68ead44b4eec42fe555683d9997e4ba65216d384920fc/pybase64-1.4.3-cp310-cp310-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a6ec7e53dd09b0a8116ccf5c3265c7c7fce13c980747525be76902aef36a514a", size = 68903, upload-time = "2025-12-06T13:22:31.29Z" },
+    { url = "https://files.pythonhosted.org/packages/77/e1/7e59a19f8999cdefe9eb0d56bfd701dd38263b0f6fb4a4d29fce165a1b36/pybase64-1.4.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7528604cd69c538e1dbaafded46e9e4915a2adcd6f2a60fcef6390d87ca922ea", size = 57516, upload-time = "2025-12-06T13:22:32.395Z" },
+    { url = "https://files.pythonhosted.org/packages/42/ad/f47dc7e6fe32022b176868b88b671a32dab389718c8ca905cab79280aaaf/pybase64-1.4.3-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:4ec645f32b50593879031e09158f8681a1db9f5df0f72af86b3969a1c5d1fa2b", size = 54533, upload-time = "2025-12-06T13:22:33.457Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/9a/7ab312b5a324833953b00e47b23eb4f83d45bd5c5c854b4b4e51b2a0cf5b/pybase64-1.4.3-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:634a000c5b3485ccc18bb9b244e0124f74b6fbc7f43eade815170237a7b34c64", size = 57187, upload-time = "2025-12-06T13:22:34.566Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/84/80acab1fcbaaae103e6b862ef5019192c8f2cd8758433595a202179a0d1d/pybase64-1.4.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:309ea32ad07639a485580af1be0ad447a434deb1924e76adced63ac2319cfe15", size = 57730, upload-time = "2025-12-06T13:22:35.581Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/24/84256d472400ea3163d7d69c44bb7e2e1027f0f1d4d20c47629a7dc4578e/pybase64-1.4.3-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:d10d517566b748d3f25f6ac7162af779360c1c6426ad5f962927ee205990d27c", size = 53036, upload-time = "2025-12-06T13:22:36.621Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/0f/33aecbed312ee0431798a73fa25e00dedbffdd91389ee23121fed397c550/pybase64-1.4.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a74cc0f4d835400857cc5c6d27ec854f7949491e07a04e6d66e2137812831f4c", size = 56321, upload-time = "2025-12-06T13:22:37.7Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/1c/a341b050746658cbec8cab3c733aeb3ef52ce8f11e60d0d47adbdf729ebf/pybase64-1.4.3-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:1b591d774ac09d5eb73c156a03277cb271438fbd8042bae4109ff3a827cd218c", size = 50114, upload-time = "2025-12-06T13:22:38.752Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/d3/f7e6680ae6dc4ddff39112ad66e0fa6b2ec346e73881bafc08498c560bc0/pybase64-1.4.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5eb588d35a04302ef6157d17db62354a787ac6f8b1585dd0b90c33d63a97a550", size = 66570, upload-time = "2025-12-06T13:22:40.221Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/71/774748eecc7fe23869b7e5df028e3c4c2efa16b506b83ea3fa035ea95dc2/pybase64-1.4.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:df8b122d5be2c96962231cc4831d9c2e1eae6736fb12850cec4356d8b06fe6f8", size = 55700, upload-time = "2025-12-06T13:22:41.289Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/91/dd15075bb2fe0086193e1cd4bad80a43652c38d8a572f9218d46ba721802/pybase64-1.4.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:31b7a85c661fc591bbcce82fb8adaebe2941e6a83b08444b0957b77380452a4b", size = 52491, upload-time = "2025-12-06T13:22:42.628Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/27/f357d63ea3774c937fc47160e040419ed528827aa3d4306d5ec9826259c0/pybase64-1.4.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e6d7beaae65979fef250e25e66cf81c68a8f81910bcda1a2f43297ab486a7e4e", size = 53957, upload-time = "2025-12-06T13:22:44.615Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/c3/243693771701a54e67ff5ccbf4c038344f429613f5643169a7befc51f007/pybase64-1.4.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4a6276bc3a3962d172a2b5aba544d89881c4037ea954517b86b00892c703d007", size = 68422, upload-time = "2025-12-06T13:22:45.641Z" },
+    { url = "https://files.pythonhosted.org/packages/75/95/f987081bf6bc1d1eda3012dae1b06ad427732ef9933a632cb8b58f9917f8/pybase64-1.4.3-cp310-cp310-win32.whl", hash = "sha256:4bdd07ef017515204ee6eaab17e1ad05f83c0ccb5af8ae24a0fe6d9cb5bb0b7a", size = 33622, upload-time = "2025-12-06T13:22:47.348Z" },
+    { url = "https://files.pythonhosted.org/packages/79/28/c169a769fe90128f16d394aad87b2096dd4bf2f035ae0927108a46b617df/pybase64-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:5db0b6bbda15110db2740c61970a8fda3bf9c93c3166a3f57f87c7865ed1125c", size = 35799, upload-time = "2025-12-06T13:22:48.731Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/f2/bdbe6af0bd4f3fe5bc70e77ead7f7d523bb9d3ca3ad50ac42b9adbb9ca14/pybase64-1.4.3-cp310-cp310-win_arm64.whl", hash = "sha256:f96367dfc82598569aa02b1103ebd419298293e59e1151abda2b41728703284b", size = 31158, upload-time = "2025-12-06T13:22:50.021Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/63/21e981e9d3f1f123e0b0ee2130112b1956cad9752309f574862c7ae77c08/pybase64-1.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:70b0d4a4d54e216ce42c2655315378b8903933ecfa32fced453989a92b4317b2", size = 38237, upload-time = "2025-12-06T13:22:52.159Z" },
+    { url = "https://files.pythonhosted.org/packages/92/fb/3f448e139516404d2a3963915cc10dc9dde7d3a67de4edba2f827adfef17/pybase64-1.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8127f110cdee7a70e576c5c9c1d4e17e92e76c191869085efbc50419f4ae3c72", size = 31673, upload-time = "2025-12-06T13:22:53.241Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/fb/bb06a5b9885e7d853ac1e801c4d8abfdb4c8506deee33e53d55aa6690e67/pybase64-1.4.3-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:f9ef0388878bc15a084bd9bf73ec1b2b4ee513d11009b1506375e10a7aae5032", size = 68331, upload-time = "2025-12-06T13:22:54.197Z" },
+    { url = "https://files.pythonhosted.org/packages/64/15/8d60b9ec5e658185fc2ee3333e01a6e30d717cf677b24f47cbb3a859d13c/pybase64-1.4.3-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:95a57cccf106352a72ed8bc8198f6820b16cc7d55aa3867a16dea7011ae7c218", size = 71370, upload-time = "2025-12-06T13:22:55.517Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/29/a3e5c1667cc8c38d025a4636855de0fc117fc62e2afeb033a3c6f12c6a22/pybase64-1.4.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cd1c47dfceb9c7bd3de210fb4e65904053ed2d7c9dce6d107f041ff6fbd7e21", size = 59834, upload-time = "2025-12-06T13:22:56.682Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/00/8ffcf9810bd23f3984698be161cf7edba656fd639b818039a7be1d6405d4/pybase64-1.4.3-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:9fe9922698f3e2f72874b26890d53a051c431d942701bb3a37aae94da0b12107", size = 56652, upload-time = "2025-12-06T13:22:57.724Z" },
+    { url = "https://files.pythonhosted.org/packages/81/62/379e347797cdea4ab686375945bc77ad8d039c688c0d4d0cfb09d247beb9/pybase64-1.4.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:af5f4bd29c86b59bb4375e0491d16ec8a67548fa99c54763aaedaf0b4b5a6632", size = 59382, upload-time = "2025-12-06T13:22:58.758Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/f2/9338ffe2f487086f26a2c8ca175acb3baa86fce0a756ff5670a0822bb877/pybase64-1.4.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c302f6ca7465262908131411226e02100f488f531bb5e64cb901aa3f439bccd9", size = 59990, upload-time = "2025-12-06T13:23:01.007Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/a4/85a6142b65b4df8625b337727aa81dc199642de3d09677804141df6ee312/pybase64-1.4.3-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:2f3f439fa4d7fde164ebbbb41968db7d66b064450ab6017c6c95cef0afa2b349", size = 54923, upload-time = "2025-12-06T13:23:02.369Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/00/e40215d25624012bf5b7416ca37f168cb75f6dd15acdb91ea1f2ea4dc4e7/pybase64-1.4.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7a23c6866551043f8b681a5e1e0d59469148b2920a3b4fc42b1275f25ea4217a", size = 58664, upload-time = "2025-12-06T13:23:03.378Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/73/d7e19a63e795c13837f2356268d95dc79d1180e756f57ced742a1e52fdeb/pybase64-1.4.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:56e6526f8565642abc5f84338cc131ce298a8ccab696b19bdf76fa6d7dc592ef", size = 52338, upload-time = "2025-12-06T13:23:04.458Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/32/3c746d7a310b69bdd9df77ffc85c41b80bce00a774717596f869b0d4a20e/pybase64-1.4.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6a792a8b9d866ffa413c9687d9b611553203753987a3a582d68cbc51cf23da45", size = 68993, upload-time = "2025-12-06T13:23:05.526Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/b3/63cec68f9d6f6e4c0b438d14e5f1ef536a5fe63ce14b70733ac5e31d7ab8/pybase64-1.4.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:62ad29a5026bb22cfcd1ca484ec34b0a5ced56ddba38ceecd9359b2818c9c4f9", size = 58055, upload-time = "2025-12-06T13:23:06.931Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/cb/7acf7c3c06f9692093c07f109668725dc37fb9a3df0fa912b50add645195/pybase64-1.4.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:11b9d1d2d32ec358c02214363b8fc3651f6be7dd84d880ecd597a6206a80e121", size = 54430, upload-time = "2025-12-06T13:23:07.936Z" },
+    { url = "https://files.pythonhosted.org/packages/33/39/4eb33ff35d173bfff4002e184ce8907f5d0a42d958d61cd9058ef3570179/pybase64-1.4.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0aebaa7f238caa0a0d373616016e2040c6c879ebce3ba7ab3c59029920f13640", size = 56272, upload-time = "2025-12-06T13:23:09.253Z" },
+    { url = "https://files.pythonhosted.org/packages/19/97/a76d65c375a254e65b730c6f56bf528feca91305da32eceab8bcc08591e6/pybase64-1.4.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e504682b20c63c2b0c000e5f98a80ea867f8d97642e042a5a39818e44ba4d599", size = 70904, upload-time = "2025-12-06T13:23:10.336Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/2c/8338b6d3da3c265002839e92af0a80d6db88385c313c73f103dfb800c857/pybase64-1.4.3-cp311-cp311-win32.whl", hash = "sha256:e9a8b81984e3c6fb1db9e1614341b0a2d98c0033d693d90c726677db1ffa3a4c", size = 33639, upload-time = "2025-12-06T13:23:11.9Z" },
+    { url = "https://files.pythonhosted.org/packages/39/dc/32efdf2f5927e5449cc341c266a1bbc5fecd5319a8807d9c5405f76e6d02/pybase64-1.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:a90a8fa16a901fabf20de824d7acce07586e6127dc2333f1de05f73b1f848319", size = 35797, upload-time = "2025-12-06T13:23:13.174Z" },
+    { url = "https://files.pythonhosted.org/packages/da/59/eda4f9cb0cbce5a45f0cd06131e710674f8123a4d570772c5b9694f88559/pybase64-1.4.3-cp311-cp311-win_arm64.whl", hash = "sha256:61d87de5bc94d143622e94390ec3e11b9c1d4644fe9be3a81068ab0f91056f59", size = 31160, upload-time = "2025-12-06T13:23:15.696Z" },
+    { url = "https://files.pythonhosted.org/packages/86/a7/efcaa564f091a2af7f18a83c1c4875b1437db56ba39540451dc85d56f653/pybase64-1.4.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:18d85e5ab8b986bb32d8446aca6258ed80d1bafe3603c437690b352c648f5967", size = 38167, upload-time = "2025-12-06T13:23:16.821Z" },
+    { url = "https://files.pythonhosted.org/packages/db/c7/c7ad35adff2d272bf2930132db2b3eea8c44bb1b1f64eb9b2b8e57cde7b4/pybase64-1.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3f5791a3491d116d0deaf4d83268f48792998519698f8751efb191eac84320e9", size = 31673, upload-time = "2025-12-06T13:23:17.835Z" },
+    { url = "https://files.pythonhosted.org/packages/43/1b/9a8cab0042b464e9a876d5c65fe5127445a2436da36fda64899b119b1a1b/pybase64-1.4.3-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:f0b3f200c3e06316f6bebabd458b4e4bcd4c2ca26af7c0c766614d91968dee27", size = 68210, upload-time = "2025-12-06T13:23:18.813Z" },
+    { url = "https://files.pythonhosted.org/packages/62/f7/965b79ff391ad208b50e412b5d3205ccce372a2d27b7218ae86d5295b105/pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bb632edfd132b3eaf90c39c89aa314beec4e946e210099b57d40311f704e11d4", size = 71599, upload-time = "2025-12-06T13:23:20.195Z" },
+    { url = "https://files.pythonhosted.org/packages/03/4b/a3b5175130b3810bbb8ccfa1edaadbd3afddb9992d877c8a1e2f274b476e/pybase64-1.4.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:356ef1d74648ce997f5a777cf8f1aefecc1c0b4fe6201e0ef3ec8a08170e1b54", size = 59922, upload-time = "2025-12-06T13:23:21.487Z" },
+    { url = "https://files.pythonhosted.org/packages/da/5d/c38d1572027fc601b62d7a407721688b04b4d065d60ca489912d6893e6cf/pybase64-1.4.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:c48361f90db32bacaa5518419d4eb9066ba558013aaf0c7781620279ecddaeb9", size = 56712, upload-time = "2025-12-06T13:23:22.77Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/d4/4e04472fef485caa8f561d904d4d69210a8f8fc1608ea15ebd9012b92655/pybase64-1.4.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:702bcaa16ae02139d881aeaef5b1c8ffb4a3fae062fe601d1e3835e10310a517", size = 59300, upload-time = "2025-12-06T13:23:24.543Z" },
+    { url = "https://files.pythonhosted.org/packages/86/e7/16e29721b86734b881d09b7e23dfd7c8408ad01a4f4c7525f3b1088e25ec/pybase64-1.4.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:53d0ffe1847b16b647c6413d34d1de08942b7724273dd57e67dcbdb10c574045", size = 60278, upload-time = "2025-12-06T13:23:25.608Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/02/18515f211d7c046be32070709a8efeeef8a0203de4fd7521e6b56404731b/pybase64-1.4.3-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:9a1792e8b830a92736dae58f0c386062eb038dfe8004fb03ba33b6083d89cd43", size = 54817, upload-time = "2025-12-06T13:23:26.633Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/be/14e29d8e1a481dbff151324c96dd7b5d2688194bb65dc8a00ca0e1ad1e86/pybase64-1.4.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1d468b1b1ac5ad84875a46eaa458663c3721e8be5f155ade356406848d3701f6", size = 58611, upload-time = "2025-12-06T13:23:27.684Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/8a/a2588dfe24e1bbd742a554553778ab0d65fdf3d1c9a06d10b77047d142aa/pybase64-1.4.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:e97b7bdbd62e71898cd542a6a9e320d9da754ff3ebd02cb802d69087ee94d468", size = 52404, upload-time = "2025-12-06T13:23:28.714Z" },
+    { url = "https://files.pythonhosted.org/packages/27/fc/afcda7445bebe0cbc38cafdd7813234cdd4fc5573ff067f1abf317bb0cec/pybase64-1.4.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b33aeaa780caaa08ffda87fc584d5eab61e3d3bbb5d86ead02161dc0c20d04bc", size = 68817, upload-time = "2025-12-06T13:23:30.079Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/3a/87c3201e555ed71f73e961a787241a2438c2bbb2ca8809c29ddf938a3157/pybase64-1.4.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c0efcf78f11cf866bed49caa7b97552bc4855a892f9cc2372abcd3ed0056f0d", size = 57854, upload-time = "2025-12-06T13:23:31.17Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/7d/931c2539b31a7b375e7d595b88401eeb5bd6c5ce1059c9123f9b608aaa14/pybase64-1.4.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:66e3791f2ed725a46593f8bd2761ff37d01e2cdad065b1dceb89066f476e50c6", size = 54333, upload-time = "2025-12-06T13:23:32.422Z" },
+    { url = "https://files.pythonhosted.org/packages/de/5e/537601e02cc01f27e9d75f440f1a6095b8df44fc28b1eef2cd739aea8cec/pybase64-1.4.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:72bb0b6bddadab26e1b069bb78e83092711a111a80a0d6b9edcb08199ad7299b", size = 56492, upload-time = "2025-12-06T13:23:33.515Z" },
+    { url = "https://files.pythonhosted.org/packages/96/97/2a2e57acf8f5c9258d22aba52e71f8050e167b29ed2ee1113677c1b600c1/pybase64-1.4.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5b3365dbcbcdb0a294f0f50af0c0a16b27a232eddeeb0bceeefd844ef30d2a23", size = 70974, upload-time = "2025-12-06T13:23:36.27Z" },
+    { url = "https://files.pythonhosted.org/packages/75/2e/a9e28941c6dab6f06e6d3f6783d3373044be9b0f9a9d3492c3d8d2260ac0/pybase64-1.4.3-cp312-cp312-win32.whl", hash = "sha256:7bca1ed3a5df53305c629ca94276966272eda33c0d71f862d2d3d043f1e1b91a", size = 33686, upload-time = "2025-12-06T13:23:37.848Z" },
+    { url = "https://files.pythonhosted.org/packages/83/e3/507ab649d8c3512c258819c51d25c45d6e29d9ca33992593059e7b646a33/pybase64-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:9f2da8f56d9b891b18b4daf463a0640eae45a80af548ce435be86aa6eff3603b", size = 35833, upload-time = "2025-12-06T13:23:38.877Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/8a/6eba66cd549a2fc74bb4425fd61b839ba0ab3022d3c401b8a8dc2cc00c7a/pybase64-1.4.3-cp312-cp312-win_arm64.whl", hash = "sha256:0631d8a2d035de03aa9bded029b9513e1fee8ed80b7ddef6b8e9389ffc445da0", size = 31185, upload-time = "2025-12-06T13:23:39.908Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/7c/545fd4935a0e1ddd7147f557bf8157c73eecec9cffd523382fa7af2557de/pybase64-1.4.3-graalpy311-graalpy242_311_native-macosx_10_9_x86_64.whl", hash = "sha256:d27c1dfdb0c59a5e758e7a98bd78eaca5983c22f4a811a36f4f980d245df4611", size = 38393, upload-time = "2025-12-06T13:26:19.535Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/ca/ae7a96be9ddc96030d4e9dffc43635d4e136b12058b387fd47eb8301b60f/pybase64-1.4.3-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:0f1a0c51d6f159511e3431b73c25db31095ee36c394e26a4349e067c62f434e5", size = 32109, upload-time = "2025-12-06T13:26:20.72Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/44/d4b7adc7bf4fd5b52d8d099121760c450a52c390223806b873f0b6a2d551/pybase64-1.4.3-graalpy311-graalpy242_311_native-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a492518f3078a4e3faaef310697d21df9c6bc71908cebc8c2f6fbfa16d7d6b1f", size = 43227, upload-time = "2025-12-06T13:26:21.845Z" },
+    { url = "https://files.pythonhosted.org/packages/08/86/2ba2d8734ef7939debeb52cf9952e457ba7aa226cae5c0e6dd631f9b851f/pybase64-1.4.3-graalpy311-graalpy242_311_native-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cae1a0f47784fd16df90d8acc32011c8d5fcdd9ab392c9ec49543e5f6a9c43a4", size = 35804, upload-time = "2025-12-06T13:26:23.149Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/5b/19c725dc3aaa6281f2ce3ea4c1628d154a40dd99657d1381995f8096768b/pybase64-1.4.3-graalpy311-graalpy242_311_native-win_amd64.whl", hash = "sha256:03cea70676ffbd39a1ab7930a2d24c625b416cacc9d401599b1d29415a43ab6a", size = 35880, upload-time = "2025-12-06T13:26:24.663Z" },
+    { url = "https://files.pythonhosted.org/packages/17/45/92322aec1b6979e789b5710f73c59f2172bc37c8ce835305434796824b7b/pybase64-1.4.3-graalpy312-graalpy250_312_native-macosx_10_13_x86_64.whl", hash = "sha256:2baaa092f3475f3a9c87ac5198023918ea8b6c125f4c930752ab2cbe3cd1d520", size = 38746, upload-time = "2025-12-06T13:26:25.869Z" },
+    { url = "https://files.pythonhosted.org/packages/11/94/f1a07402870388fdfc2ecec0c718111189732f7d0f2d7fe1386e19e8fad0/pybase64-1.4.3-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:cde13c0764b1af07a631729f26df019070dad759981d6975527b7e8ecb465b6c", size = 32573, upload-time = "2025-12-06T13:26:27.792Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/8f/43c3bb11ca9bacf81cb0b7a71500bb65b2eda6d5fe07433c09b543de97f3/pybase64-1.4.3-graalpy312-graalpy250_312_native-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5c29a582b0ea3936d02bd6fe9bf674ab6059e6e45ab71c78404ab2c913224414", size = 43461, upload-time = "2025-12-06T13:26:28.906Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/4c/2a5258329200be57497d3972b5308558c6de42e3749c6cc2aa1cbe34b25a/pybase64-1.4.3-graalpy312-graalpy250_312_native-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b6b664758c804fa919b4f1257aa8cf68e95db76fc331de5f70bfc3a34655afe1", size = 36058, upload-time = "2025-12-06T13:26:30.092Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/6d/41faa414cde66ec023b0ca8402a8f11cb61731c3dc27c082909cbbd1f929/pybase64-1.4.3-graalpy312-graalpy250_312_native-win_amd64.whl", hash = "sha256:f7537fa22ae56a0bf51e4b0ffc075926ad91c618e1416330939f7ef366b58e3b", size = 36231, upload-time = "2025-12-06T13:26:31.656Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/cf/6e712491bd665ea8633efb0b484121893ea838d8e830e06f39f2aae37e58/pybase64-1.4.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:94cf50c36bb2f8618982ee5a978c4beed9db97d35944fa96e8586dd953c7994a", size = 38007, upload-time = "2025-12-06T13:26:32.804Z" },
+    { url = "https://files.pythonhosted.org/packages/38/c0/9272cae1c49176337dcdbd97511e2843faae1aaf5a5fb48569093c6cd4ce/pybase64-1.4.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:01bc3ff5ca1341685c6d2d945b035f442f7b9c3b068a5c6ee8408a41fda5754e", size = 31538, upload-time = "2025-12-06T13:26:34.001Z" },
+    { url = "https://files.pythonhosted.org/packages/20/f2/17546f97befe429c73f622bbd869ceebb518c40fdb0dec4c4f98312e80a5/pybase64-1.4.3-pp310-pypy310_pp73-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:03d0aa3761a99034960496280c02aa063f856a3cc9b33771bc4eab0e4e72b5c2", size = 40682, upload-time = "2025-12-06T13:26:35.168Z" },
+    { url = "https://files.pythonhosted.org/packages/92/a0/464b36d5dfb61f3da17858afaeaa876a9342d58e9f17803ce7f28b5de9e8/pybase64-1.4.3-pp310-pypy310_pp73-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7ca5b1ce768520acd6440280cdab35235b27ad2faacfcec064bc9c3377066ef1", size = 41306, upload-time = "2025-12-06T13:26:36.351Z" },
+    { url = "https://files.pythonhosted.org/packages/07/c9/a748dfc0969a8d960ecf1e82c8a2a16046ffec22f8e7ece582aa3b1c6cf9/pybase64-1.4.3-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3caa1e2ddad1c50553ffaaa1c86b74b3f9fbd505bea9970326ab88fc68c4c184", size = 35452, upload-time = "2025-12-06T13:26:37.772Z" },
+    { url = "https://files.pythonhosted.org/packages/95/b7/4d37bd3577d1aa6c732dc099087fe027c48873e223de3784b095e5653f8b/pybase64-1.4.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bd47076f736b27a8b0f9b30d93b6bb4f5af01b0dc8971f883ed3b75934f39a99", size = 36125, upload-time = "2025-12-06T13:26:39.78Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/76/160dded493c00d3376d4ad0f38a2119c5345de4a6693419ad39c3565959b/pybase64-1.4.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:277de6e03cc9090fb359365c686a2a3036d23aee6cd20d45d22b8c89d1247f17", size = 37939, upload-time = "2025-12-06T13:26:41.014Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/b8/a0f10be8d648d6f8f26e560d6e6955efa7df0ff1e009155717454d76f601/pybase64-1.4.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ab1dd8b1ed2d1d750260ed58ab40defaa5ba83f76a30e18b9ebd5646f6247ae5", size = 31466, upload-time = "2025-12-06T13:26:42.539Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/22/832a2f9e76cdf39b52e01e40d8feeb6a04cf105494f2c3e3126d0149717f/pybase64-1.4.3-pp311-pypy311_pp73-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:bd4d2293de9fd212e294c136cec85892460b17d24e8c18a6ba18750928037750", size = 40681, upload-time = "2025-12-06T13:26:43.782Z" },
+    { url = "https://files.pythonhosted.org/packages/12/d7/6610f34a8972415fab3bb4704c174a1cc477bffbc3c36e526428d0f3957d/pybase64-1.4.3-pp311-pypy311_pp73-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2af6d0d3a691911cc4c9a625f3ddcd3af720738c21be3d5c72de05629139d393", size = 41294, upload-time = "2025-12-06T13:26:44.936Z" },
+    { url = "https://files.pythonhosted.org/packages/64/25/ed24400948a6c974ab1374a233cb7e8af0a5373cea0dd8a944627d17c34a/pybase64-1.4.3-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5cfc8c49a28322d82242088378f8542ce97459866ba73150b062a7073e82629d", size = 35447, upload-time = "2025-12-06T13:26:46.098Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/2b/e18ee7c5ee508a82897f021c1981533eca2940b5f072fc6ed0906c03a7a7/pybase64-1.4.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:debf737e09b8bf832ba86f5ecc3d3dbd0e3021d6cd86ba4abe962d6a5a77adb3", size = 36134, upload-time = "2025-12-06T13:26:47.35Z" },
+]
+
+[[package]]
+name = "pycountry"
+version = "26.2.16"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/de/1d/061b9e7a48b85cfd69f33c33d2ef784a531c359399ad764243399673c8f5/pycountry-26.2.16.tar.gz", hash = "sha256:5b6027d453fcd6060112b951dd010f01f168b51b4bf8a1f1fc8c95c8d94a0801", size = 7711342, upload-time = "2026-02-17T03:42:52.367Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9c/42/7703bd45b62fecd44cd7d3495423097e2f7d28bc2e99e7c1af68892ab157/pycountry-26.2.16-py3-none-any.whl", hash = "sha256:115c4baf7cceaa30f59a4694d79483c9167dbce7a9de4d3d571c5f3ea77c305a", size = 8044600, upload-time = "2026-02-17T03:42:49.777Z" },
+]
+
 [[package]]
 name = "pycparser"
 version = "2.23"
@@ -3219,6 +4505,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" },
 ]
 
+[package.optional-dependencies]
+email = [
+    { name = "email-validator" },
+]
+
 [[package]]
 name = "pydantic-core"
 version = "2.41.5"
@@ -3295,6 +4586,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" },
 ]
 
+[[package]]
+name = "pydantic-extra-types"
+version = "2.11.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pydantic" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/71/dba38ee2651f84f7842206adbd2233d8bbdb59fb85e9fa14232486a8c471/pydantic_extra_types-2.11.1.tar.gz", hash = "sha256:46792d2307383859e923d8fcefa82108b1a141f8a9c0198982b3832ab5ef1049", size = 172002, upload-time = "2026-03-16T08:08:03.92Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/17/c1/3226e6d7f5a4f736f38ac11a6fbb262d701889802595cdb0f53a885ac2e0/pydantic_extra_types-2.11.1-py3-none-any.whl", hash = "sha256:1722ea2bddae5628ace25f2aa685b69978ef533123e5638cfbddb999e0100ec1", size = 79526, upload-time = "2026-03-16T08:08:02.533Z" },
+]
+
+[package.optional-dependencies]
+pycountry = [
+    { name = "pycountry" },
+]
+
 [[package]]
 name = "pydantic-settings"
 version = "2.12.0"
@@ -3467,6 +4776,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863, upload-time = "2024-01-23T06:32:58.246Z" },
 ]
 
+[[package]]
+name = "python-json-logger"
+version = "4.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f7/ff/3cc9165fd44106973cd7ac9facb674a65ed853494592541d339bdc9a30eb/python_json_logger-4.1.0.tar.gz", hash = "sha256:b396b9e3ed782b09ff9d6e4f1683d46c83ad0d35d2e407c09a9ebbf038f88195", size = 17573, upload-time = "2026-03-29T04:39:56.805Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/27/be/0631a861af4d1c875f096c07d34e9a63639560a717130e7a87cbc82b7e3f/python_json_logger-4.1.0-py3-none-any.whl", hash = "sha256:132994765cf75bf44554be9aa49b06ef2345d23661a96720262716438141b6b2", size = 15021, upload-time = "2026-03-29T04:39:55.266Z" },
+]
+
 [[package]]
 name = "python-multipart"
 version = "0.0.22"
@@ -3588,6 +4906,41 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/01/1b/5dbe84eefc86f48473947e2f41711aded97eecef1231f4558f1f02713c12/pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355", size = 544862, upload-time = "2025-09-08T23:09:56.509Z" },
 ]
 
+[[package]]
+name = "ray"
+version = "2.55.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "filelock" },
+    { name = "jsonschema" },
+    { name = "msgpack" },
+    { name = "packaging" },
+    { name = "protobuf", version = "4.25.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "protobuf", version = "5.29.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pyyaml" },
+    { name = "requests" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/d0/a85097dd53aaca1a44acc4dd0b3d2c0e9233179433e2ee326e4018ab3cf7/ray-2.55.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d5786661e192148719accc959def6cdcabd7a24cd9008005bf3d0e3c8cfd529", size = 65829601, upload-time = "2026-04-22T20:09:10.013Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/d0/413baab5f0bdd1f913bd46538d96df3547a495b1a0de42f776b5c80d821c/ray-2.55.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:baf2ec89df7838cabdef493ff9bdbec1e6a6452f8bc696ad0c1b8a6198721745", size = 72776751, upload-time = "2026-04-22T20:09:17.802Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/64/640f5525bac171282c6f76f3ecc9c4cfef60149ac0d00231afb22018ebe5/ray-2.55.1-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:bb49fbbe53a1d931e1f92d17f9271338f0b738885f8f70b7f531aa33f019d8af", size = 73606971, upload-time = "2026-04-22T20:09:23.912Z" },
+    { url = "https://files.pythonhosted.org/packages/31/9a/917f25438d802e23cee2bd1426f1e36ae19e0d0e41908d50937e0a4b7fd4/ray-2.55.1-cp310-cp310-win_amd64.whl", hash = "sha256:86e618e9ad8c6a24331c788eb599cee9838a62d2e10dfca0227743be06cf551c", size = 27886803, upload-time = "2026-04-22T20:09:28.747Z" },
+    { url = "https://files.pythonhosted.org/packages/88/7d/48ba2f49b40a34b0071ee27c0144a2573d8836094eaca213d59cef12c271/ray-2.55.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:0053fd5b400f7ac56263aa1bbd3d68fb79341b08b8dc697c88782d5aca7b3ed4", size = 65835271, upload-time = "2026-04-22T20:09:34.984Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/a3/d6db3a428e4ea17cc72e79f747cfe11e90e63e36e1705bb8324e45f334b7/ray-2.55.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:0ea2f670a7725833ad2333a8c46ab69865ad06c8e5de9f65695e0f8f35331cec", size = 72879783, upload-time = "2026-04-22T20:09:40.986Z" },
+    { url = "https://files.pythonhosted.org/packages/46/59/41da0e72a59cd3e8978480ccfeb86ef4235ae5ceb9b8928168a764fa930a/ray-2.55.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:d5382da181c03ee2f502ef46cf0ae4bbc30157b5bd9a67d7651f6a272528a85a", size = 73706515, upload-time = "2026-04-22T20:09:47.079Z" },
+    { url = "https://files.pythonhosted.org/packages/65/52/c16bbdc3e31a5178f97be88966ab56db6f7e04882640c5cf2fee5b87757b/ray-2.55.1-cp311-cp311-win_amd64.whl", hash = "sha256:5e56d2e8f304cafe990c198a2b894f5b813de018998cd7212869201f6dc17cff", size = 27882093, upload-time = "2026-04-22T20:09:52.943Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/3a/4d34f471a68b958b7f94c974c19ad6836a61a2dc16393df4294169a2e4b0/ray-2.55.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:137f9006eee28caab8260803cca314f37bbda3fc94fdfa31c770b5d019626ad8", size = 65822379, upload-time = "2026-04-22T20:09:58.064Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/13/0db535102d0256b350ca116d8987588aca1a1f9ebb4638e1e1ff88bbcef8/ray-2.55.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:26541f69bb55607ef8335baac75b2ed12ff2ce02d56313219b29eda003039221", size = 72910802, upload-time = "2026-04-22T20:10:04.382Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/f8/fffadf3f4285eebd460e4d7f2ed1c0cd641ed89613c3f49eb881ee9fa7e2/ray-2.55.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:263705f6bab29e7622a94f82da25fd7f9cead76cdf89a07aab28f79cdf8f9d95", size = 73765203, upload-time = "2026-04-22T20:10:10.495Z" },
+    { url = "https://files.pythonhosted.org/packages/10/f7/5acb86fc9625a0e6bbc40e1c7d42c60770e78585439a921c32738b6d675a/ray-2.55.1-cp312-cp312-win_amd64.whl", hash = "sha256:9ad56704c8bd7e92130162f9c58e4ef473609515637673d5a36e761f95335206", size = 27865547, upload-time = "2026-04-22T20:10:15.364Z" },
+]
+
+[package.optional-dependencies]
+cgraph = [
+    { name = "cupy-cuda12x", marker = "sys_platform != 'darwin'" },
+]
+
 [[package]]
 name = "referencing"
 version = "0.37.0"
@@ -3668,6 +5021,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" },
 ]
 
+[[package]]
+name = "requests-toolbelt"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "requests", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f3/61/d7545dafb7ac2230c70d38d31cbfe4cc64f7144dc41f6e4e4b78ecd9f5bb/requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6", size = 206888, upload-time = "2023-05-01T04:11:33.229Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" },
+]
+
 [[package]]
 name = "rich"
 version = "14.2.0"
@@ -3681,6 +5046,96 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/25/7a/b0178788f8dc6cafce37a212c99565fa1fe7872c70c6c9c1e1a372d9d88f/rich-14.2.0-py3-none-any.whl", hash = "sha256:76bc51fe2e57d2b1be1f96c524b890b816e334ab4c1e45888799bfaab0021edd", size = 243393, upload-time = "2025-10-09T14:16:51.245Z" },
 ]
 
+[[package]]
+name = "rich-toolkit"
+version = "0.20.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "rich" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/29/63/3e427c62f1992945c997d4ec31e2fcb37d26aadbe5aa44ae5b29f7f64d26/rich_toolkit-0.20.1.tar.gz", hash = "sha256:c7336ae281f435c785acecaedc4b71d4b663dc73d9c8079fea96372527e822a4", size = 203473, upload-time = "2026-06-05T08:56:57.679Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/00/88/309f07d08155da2ba1d5ceb42d270fb42fbe34a807684543e3ffc10fe713/rich_toolkit-0.20.1-py3-none-any.whl", hash = "sha256:2a6d5f8e15759b9eba5a9ee63da10b275359ead20e5a0fc92bd5b4dbae8ce4bf", size = 35525, upload-time = "2026-06-05T08:56:58.586Z" },
+]
+
+[[package]]
+name = "rignore"
+version = "0.7.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e5/f5/8bed2310abe4ae04b67a38374a4d311dd85220f5d8da56f47ae9361be0b0/rignore-0.7.6.tar.gz", hash = "sha256:00d3546cd793c30cb17921ce674d2c8f3a4b00501cb0e3dd0e82217dbeba2671", size = 57140, upload-time = "2025-11-05T21:41:21.968Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/86/7a/b970cd0138b0ece72eb28f086e933f9ed75b795716ad3de5ab22994b3b54/rignore-0.7.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f3c74a7e5ee77aea669c95fdb3933f2a6c7549893700082e759128a29cf67e45", size = 884999, upload-time = "2025-11-05T20:42:38.373Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/05/23faca29616d8966ada63fb0e13c214107811fa9a0aba2275e4c7ca63bd5/rignore-0.7.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7202404958f5fe3474bac91f65350f0b1dde1a5e05089f2946549b7e91e79ec", size = 824824, upload-time = "2025-11-05T20:42:22.1Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/2e/05a1e61f04cf2548524224f0b5f21ca19ea58f7273a863bac10846b8ff69/rignore-0.7.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bde7c5835fa3905bfb7e329a4f1d7eccb676de63da7a3f934ddd5c06df20597", size = 899121, upload-time = "2025-11-05T20:40:48.94Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/35/71518847e10bdbf359badad8800e4681757a01f4777b3c5e03dbde8a42d8/rignore-0.7.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:626c3d4ba03af266694d25101bc1d8d16eda49c5feb86cedfec31c614fceca7d", size = 873813, upload-time = "2025-11-05T20:41:04.71Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/c8/32ae405d3e7fd4d9f9b7838f2fcca0a5005bb87fa514b83f83fd81c0df22/rignore-0.7.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0a43841e651e7a05a4274b9026cc408d1912e64016ede8cd4c145dae5d0635be", size = 1168019, upload-time = "2025-11-05T20:41:20.723Z" },
+    { url = "https://files.pythonhosted.org/packages/25/98/013c955982bc5b4719bf9a5bea58be317eea28aa12bfd004025e3cd7c000/rignore-0.7.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7978c498dbf7f74d30cdb8859fe612167d8247f0acd377ae85180e34490725da", size = 942822, upload-time = "2025-11-05T20:41:36.99Z" },
+    { url = "https://files.pythonhosted.org/packages/90/fb/9a3f3156c6ed30bcd597e63690353edac1fcffe9d382ad517722b56ac195/rignore-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d22f72ab695c07d2d96d2a645208daff17084441b5d58c07378c9dd6f9c4c87", size = 959820, upload-time = "2025-11-05T20:42:06.364Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/b2/93bf609633021e9658acaff24cfb055d8cdaf7f5855d10ebb35307900dda/rignore-0.7.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5bd8e1a91ed1a789b2cbe39eeea9204a6719d4f2cf443a9544b521a285a295f", size = 985050, upload-time = "2025-11-05T20:41:51.124Z" },
+    { url = "https://files.pythonhosted.org/packages/69/bc/ec2d040469bdfd7b743df10f2201c5d285009a4263d506edbf7a06a090bb/rignore-0.7.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bc1fc03efad5789365018e94ac4079f851a999bc154d1551c45179f7fcf45322", size = 1079164, upload-time = "2025-11-05T21:40:10.368Z" },
+    { url = "https://files.pythonhosted.org/packages/df/26/4b635f4ea5baf4baa8ba8eee06163f6af6e76dfbe72deb57da34bb24b19d/rignore-0.7.6-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:ce2617fe28c51367fd8abfd4eeea9e61664af63c17d4ea00353d8ef56dfb95fa", size = 1139028, upload-time = "2025-11-05T21:40:27.977Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/54/a3147ebd1e477b06eb24e2c2c56d951ae5faa9045b7b36d7892fec5080d9/rignore-0.7.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:7c4ad2cee85068408e7819a38243043214e2c3047e9bd4c506f8de01c302709e", size = 1119024, upload-time = "2025-11-05T21:40:45.148Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/f4/27475db769a57cff18fe7e7267b36e6cdb5b1281caa185ba544171106cba/rignore-0.7.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:02cd240bfd59ecc3907766f4839cbba20530a2e470abca09eaa82225e4d946fb", size = 1128531, upload-time = "2025-11-05T21:41:02.734Z" },
+    { url = "https://files.pythonhosted.org/packages/97/32/6e782d3b352e4349fa0e90bf75b13cb7f11d8908b36d9e2b262224b65d9a/rignore-0.7.6-cp310-cp310-win32.whl", hash = "sha256:fe2bd8fa1ff555259df54c376abc73855cb02628a474a40d51b358c3a1ddc55b", size = 646817, upload-time = "2025-11-05T21:41:47.51Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/8a/53185c69abb3bb362e8a46b8089999f820bf15655629ff8395107633c8ab/rignore-0.7.6-cp310-cp310-win_amd64.whl", hash = "sha256:d80afd6071c78baf3765ec698841071b19e41c326f994cfa69b5a1df676f5d39", size = 727001, upload-time = "2025-11-05T21:41:32.778Z" },
+    { url = "https://files.pythonhosted.org/packages/25/41/b6e2be3069ef3b7f24e35d2911bd6deb83d20ed5642ad81d5a6d1c015473/rignore-0.7.6-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:40be8226e12d6653abbebaffaea2885f80374c1c8f76fe5ca9e0cadd120a272c", size = 885285, upload-time = "2025-11-05T20:42:39.763Z" },
+    { url = "https://files.pythonhosted.org/packages/52/66/ba7f561b6062402022887706a7f2b2c2e2e2a28f1e3839202b0a2f77e36d/rignore-0.7.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:182f4e5e4064d947c756819446a7d4cdede8e756b8c81cf9e509683fe38778d7", size = 823882, upload-time = "2025-11-05T20:42:23.488Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/81/4087453df35a90b07370647b19017029324950c1b9137d54bf1f33843f17/rignore-0.7.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16b63047648a916a87be1e51bb5c009063f1b8b6f5afe4f04f875525507e63dc", size = 899362, upload-time = "2025-11-05T20:40:51.111Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/c9/390a8fdfabb76d71416be773bd9f162977bd483084f68daf19da1dec88a6/rignore-0.7.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ba5524f5178deca4d7695e936604ebc742acb8958f9395776e1fcb8133f8257a", size = 873633, upload-time = "2025-11-05T20:41:06.193Z" },
+    { url = "https://files.pythonhosted.org/packages/df/c9/79404fcb0faa76edfbc9df0901f8ef18568d1104919ebbbad6d608c888d1/rignore-0.7.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:62020dbb89a1dd4b84ab3d60547b3b2eb2723641d5fb198463643f71eaaed57d", size = 1167633, upload-time = "2025-11-05T20:41:22.491Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/8d/b3466d32d445d158a0aceb80919085baaae495b1f540fb942f91d93b5e5b/rignore-0.7.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b34acd532769d5a6f153a52a98dcb81615c949ab11697ce26b2eb776af2e174d", size = 941434, upload-time = "2025-11-05T20:41:38.151Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/40/9cd949761a7af5bc27022a939c91ff622d29c7a0b66d0c13a863097dde2d/rignore-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c5e53b752f9de44dff7b3be3c98455ce3bf88e69d6dc0cf4f213346c5e3416c", size = 959461, upload-time = "2025-11-05T20:42:08.476Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/87/1e1a145731f73bdb7835e11f80da06f79a00d68b370d9a847de979575e6d/rignore-0.7.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:25b3536d13a5d6409ce85f23936f044576eeebf7b6db1d078051b288410fc049", size = 985323, upload-time = "2025-11-05T20:41:52.735Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/31/1ecff992fc3f59c4fcdcb6c07d5f6c1e6dfb55ccda19c083aca9d86fa1c6/rignore-0.7.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6e01cad2b0b92f6b1993f29fc01f23f2d78caf4bf93b11096d28e9d578eb08ce", size = 1079173, upload-time = "2025-11-05T21:40:12.007Z" },
+    { url = "https://files.pythonhosted.org/packages/17/18/162eedadb4c2282fa4c521700dbf93c9b14b8842e8354f7d72b445b8d593/rignore-0.7.6-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:5991e46ab9b4868334c9e372ab0892b0150f3f586ff2b1e314272caeb38aaedb", size = 1139012, upload-time = "2025-11-05T21:40:29.399Z" },
+    { url = "https://files.pythonhosted.org/packages/78/96/a9ca398a8af74bb143ad66c2a31303c894111977e28b0d0eab03867f1b43/rignore-0.7.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6c8ae562e5d1246cba5eaeb92a47b2a279e7637102828dde41dcbe291f529a3e", size = 1118827, upload-time = "2025-11-05T21:40:46.6Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/22/1c1a65047df864def9a047dbb40bc0b580b8289a4280e62779cd61ae21f2/rignore-0.7.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:aaf938530dcc0b47c4cfa52807aa2e5bfd5ca6d57a621125fe293098692f6345", size = 1128182, upload-time = "2025-11-05T21:41:04.239Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/f4/1526eb01fdc2235aca1fd9d0189bee4021d009a8dcb0161540238c24166e/rignore-0.7.6-cp311-cp311-win32.whl", hash = "sha256:166ebce373105dd485ec213a6a2695986346e60c94ff3d84eb532a237b24a4d5", size = 646547, upload-time = "2025-11-05T21:41:49.439Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/c8/dda0983e1845706beb5826459781549a840fe5a7eb934abc523e8cd17814/rignore-0.7.6-cp311-cp311-win_amd64.whl", hash = "sha256:44f35ee844b1a8cea50d056e6a595190ce9d42d3cccf9f19d280ae5f3058973a", size = 727139, upload-time = "2025-11-05T21:41:34.367Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/47/eb1206b7bf65970d41190b879e1723fc6bbdb2d45e53565f28991a8d9d96/rignore-0.7.6-cp311-cp311-win_arm64.whl", hash = "sha256:14b58f3da4fa3d5c3fa865cab49821675371f5e979281c683e131ae29159a581", size = 657598, upload-time = "2025-11-05T21:41:23.758Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/0e/012556ef3047a2628842b44e753bb15f4dc46806780ff090f1e8fe4bf1eb/rignore-0.7.6-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:03e82348cb7234f8d9b2834f854400ddbbd04c0f8f35495119e66adbd37827a8", size = 883488, upload-time = "2025-11-05T20:42:41.359Z" },
+    { url = "https://files.pythonhosted.org/packages/93/b0/d4f1f3fe9eb3f8e382d45ce5b0547ea01c4b7e0b4b4eb87bcd66a1d2b888/rignore-0.7.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9e624f6be6116ea682e76c5feb71ea91255c67c86cb75befe774365b2931961", size = 820411, upload-time = "2025-11-05T20:42:24.782Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/c8/dea564b36dedac8de21c18e1851789545bc52a0c22ece9843444d5608a6a/rignore-0.7.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bda49950d405aa8d0ebe26af807c4e662dd281d926530f03f29690a2e07d649a", size = 897821, upload-time = "2025-11-05T20:40:52.613Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/2b/ee96db17ac1835e024c5d0742eefb7e46de60020385ac883dd3d1cde2c1f/rignore-0.7.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b5fd5ab3840b8c16851d327ed06e9b8be6459702a53e5ab1fc4073b684b3789e", size = 873963, upload-time = "2025-11-05T20:41:07.49Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/8c/ad5a57bbb9d14d5c7e5960f712a8a0b902472ea3f4a2138cbf70d1777b75/rignore-0.7.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ced2a248352636a5c77504cb755dc02c2eef9a820a44d3f33061ce1bb8a7f2d2", size = 1169216, upload-time = "2025-11-05T20:41:23.73Z" },
+    { url = "https://files.pythonhosted.org/packages/80/e6/5b00bc2a6bc1701e6878fca798cf5d9125eb3113193e33078b6fc0d99123/rignore-0.7.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a04a3b73b75ddc12c9c9b21efcdaab33ca3832941d6f1d67bffd860941cd448a", size = 942942, upload-time = "2025-11-05T20:41:39.393Z" },
+    { url = "https://files.pythonhosted.org/packages/85/e5/7f99bd0cc9818a91d0e8b9acc65b792e35750e3bdccd15a7ee75e64efca4/rignore-0.7.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d24321efac92140b7ec910ac7c53ab0f0c86a41133d2bb4b0e6a7c94967f44dd", size = 959787, upload-time = "2025-11-05T20:42:09.765Z" },
+    { url = "https://files.pythonhosted.org/packages/55/54/2ffea79a7c1eabcede1926347ebc2a81bc6b81f447d05b52af9af14948b9/rignore-0.7.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73c7aa109d41e593785c55fdaa89ad80b10330affa9f9d3e3a51fa695f739b20", size = 984245, upload-time = "2025-11-05T20:41:54.062Z" },
+    { url = "https://files.pythonhosted.org/packages/41/f7/e80f55dfe0f35787fa482aa18689b9c8251e045076c35477deb0007b3277/rignore-0.7.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1734dc49d1e9501b07852ef44421f84d9f378da9fbeda729e77db71f49cac28b", size = 1078647, upload-time = "2025-11-05T21:40:13.463Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/cf/2c64f0b6725149f7c6e7e5a909d14354889b4beaadddaa5fff023ec71084/rignore-0.7.6-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:5719ea14ea2b652c0c0894be5dfde954e1853a80dea27dd2fbaa749618d837f5", size = 1139186, upload-time = "2025-11-05T21:40:31.27Z" },
+    { url = "https://files.pythonhosted.org/packages/75/95/a86c84909ccc24af0d094b50d54697951e576c252a4d9f21b47b52af9598/rignore-0.7.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8e23424fc7ce35726854f639cb7968151a792c0c3d9d082f7f67e0c362cfecca", size = 1117604, upload-time = "2025-11-05T21:40:48.07Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/5e/13b249613fd5d18d58662490ab910a9f0be758981d1797789913adb4e918/rignore-0.7.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3efdcf1dd84d45f3e2bd2f93303d9be103888f56dfa7c3349b5bf4f0657ec696", size = 1127725, upload-time = "2025-11-05T21:41:05.804Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/28/fa5dcd1e2e16982c359128664e3785f202d3eca9b22dd0b2f91c4b3d242f/rignore-0.7.6-cp312-cp312-win32.whl", hash = "sha256:ccca9d1a8b5234c76b71546fc3c134533b013f40495f394a65614a81f7387046", size = 646145, upload-time = "2025-11-05T21:41:51.096Z" },
+    { url = "https://files.pythonhosted.org/packages/26/87/69387fb5dd81a0f771936381431780b8cf66fcd2cfe9495e1aaf41548931/rignore-0.7.6-cp312-cp312-win_amd64.whl", hash = "sha256:c96a285e4a8bfec0652e0bfcf42b1aabcdda1e7625f5006d188e3b1c87fdb543", size = 726090, upload-time = "2025-11-05T21:41:36.485Z" },
+    { url = "https://files.pythonhosted.org/packages/24/5f/e8418108dcda8087fb198a6f81caadbcda9fd115d61154bf0df4d6d3619b/rignore-0.7.6-cp312-cp312-win_arm64.whl", hash = "sha256:a64a750e7a8277a323f01ca50b7784a764845f6cce2fe38831cb93f0508d0051", size = 656317, upload-time = "2025-11-05T21:41:25.305Z" },
+    { url = "https://files.pythonhosted.org/packages/85/12/62d690b4644c330d7ac0f739b7f078190ab4308faa909a60842d0e4af5b2/rignore-0.7.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c3d3a523af1cd4ed2c0cba8d277a32d329b0c96ef9901fb7ca45c8cfaccf31a5", size = 887462, upload-time = "2025-11-05T20:42:50.804Z" },
+    { url = "https://files.pythonhosted.org/packages/05/bc/6528a0e97ed2bd7a7c329183367d1ffbc5b9762ae8348d88dae72cc9d1f5/rignore-0.7.6-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:990853566e65184a506e1e2af2d15045afad3ebaebb8859cb85b882081915110", size = 826918, upload-time = "2025-11-05T20:42:33.689Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/2c/7d7bad116e09a04e9e1688c6f891fa2d4fd33f11b69ac0bd92419ddebeae/rignore-0.7.6-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cab9ff2e436ce7240d7ee301c8ef806ed77c1fd6b8a8239ff65f9bbbcb5b8a3", size = 900922, upload-time = "2025-11-05T20:41:00.361Z" },
+    { url = "https://files.pythonhosted.org/packages/09/ba/e5ea89fbde8e37a90ce456e31c5e9d85512cef5ae38e0f4d2426eb776a19/rignore-0.7.6-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d1a6671b2082c13bfd9a5cf4ce64670f832a6d41470556112c4ab0b6519b2fc4", size = 876987, upload-time = "2025-11-05T20:41:16.219Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/fb/93d14193f0ec0c3d35b763f0a000e9780f63b2031f3d3756442c2152622d/rignore-0.7.6-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2468729b4c5295c199d084ab88a40afcb7c8b974276805105239c07855bbacee", size = 1171110, upload-time = "2025-11-05T20:41:32.631Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/46/08436312ff96ffa29cfa4e1a987efc37e094531db46ba5e9fda9bb792afd/rignore-0.7.6-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:775710777fd71e5fdf54df69cdc249996a1d6f447a2b5bfb86dbf033fddd9cf9", size = 943339, upload-time = "2025-11-05T20:41:47.128Z" },
+    { url = "https://files.pythonhosted.org/packages/34/28/3b3c51328f505cfaf7e53f408f78a1e955d561135d02f9cb0341ea99f69a/rignore-0.7.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4565407f4a77f72cf9d91469e75d15d375f755f0a01236bb8aaa176278cc7085", size = 961680, upload-time = "2025-11-05T20:42:18.061Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/9e/cbff75c8676d4f4a90bd58a1581249d255c7305141b0868f0abc0324836b/rignore-0.7.6-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc44c33f8fb2d5c9da748de7a6e6653a78aa740655e7409895e94a247ffa97c8", size = 987045, upload-time = "2025-11-05T20:42:02.315Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/25/d802d1d369502a7ddb8816059e7c79d2d913e17df975b863418e0aca4d8a/rignore-0.7.6-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:8f32478f05540513c11923e8838afab9efef0131d66dca7f67f0e1bbd118af6a", size = 1080310, upload-time = "2025-11-05T21:40:23.184Z" },
+    { url = "https://files.pythonhosted.org/packages/43/f0/250b785c2e473b1ab763eaf2be820934c2a5409a722e94b279dddac21c7d/rignore-0.7.6-pp310-pypy310_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:1b63a3dd76225ea35b01dd6596aa90b275b5d0f71d6dc28fce6dd295d98614aa", size = 1140998, upload-time = "2025-11-05T21:40:40.603Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/d6/bb42fd2a8bba6aea327962656e20621fd495523259db40cfb4c5f760f05c/rignore-0.7.6-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:fe6c41175c36554a4ef0994cd1b4dbd6d73156fca779066456b781707402048e", size = 1121178, upload-time = "2025-11-05T21:40:57.585Z" },
+    { url = "https://files.pythonhosted.org/packages/97/f4/aeb548374129dce3dc191a4bb598c944d9ed663f467b9af830315d86059c/rignore-0.7.6-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:9a0c6792406ae36f4e7664dc772da909451d46432ff8485774526232d4885063", size = 1130190, upload-time = "2025-11-05T21:41:16.403Z" },
+    { url = "https://files.pythonhosted.org/packages/82/78/a6250ff0c49a3cdb943910ada4116e708118e9b901c878cfae616c80a904/rignore-0.7.6-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a20b6fb61bcced9a83dfcca6599ad45182b06ba720cff7c8d891e5b78db5b65f", size = 886470, upload-time = "2025-11-05T20:42:52.314Z" },
+    { url = "https://files.pythonhosted.org/packages/35/af/c69c0c51b8f9f7914d95c4ea91c29a2ac067572048cae95dd6d2efdbe05d/rignore-0.7.6-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:392dcabfecbe176c9ebbcb40d85a5e86a5989559c4f988c2741da7daf1b5be25", size = 825976, upload-time = "2025-11-05T20:42:35.118Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/d2/1b264f56132264ea609d3213ab603d6a27016b19559a1a1ede1a66a03dcd/rignore-0.7.6-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22baa462abdc36fdd5a5e2dae423107723351b85ff093762f9261148b9d0a04a", size = 899739, upload-time = "2025-11-05T20:41:01.518Z" },
+    { url = "https://files.pythonhosted.org/packages/55/e4/b3c5dfdd8d8a10741dfe7199ef45d19a0e42d0c13aa377c83bd6caf65d90/rignore-0.7.6-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53fb28882d2538cb2d231972146c4927a9d9455e62b209f85d634408c4103538", size = 874843, upload-time = "2025-11-05T20:41:17.687Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/10/d6f3750233881a2a154cefc9a6a0a9b19da526b19f7f08221b552c6f827d/rignore-0.7.6-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87409f7eeb1103d6b77f3472a3a0d9a5953e3ae804a55080bdcb0120ee43995b", size = 1170348, upload-time = "2025-11-05T20:41:34.21Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/10/ad98ca05c9771c15af734cee18114a3c280914b6e34fde9ffea2e61e88aa/rignore-0.7.6-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:684014e42e4341ab3ea23a203551857fcc03a7f8ae96ca3aefb824663f55db32", size = 942315, upload-time = "2025-11-05T20:41:48.508Z" },
+    { url = "https://files.pythonhosted.org/packages/de/00/ab5c0f872acb60d534e687e629c17e0896c62da9b389c66d3aa16b817aa8/rignore-0.7.6-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77356ebb01ba13f8a425c3d30fcad40e57719c0e37670d022d560884a30e4767", size = 961047, upload-time = "2025-11-05T20:42:19.403Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/86/3030fdc363a8f0d1cd155b4c453d6db9bab47a24fcc64d03f61d9d78fe6a/rignore-0.7.6-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6cbd8a48abbd3747a6c830393cd578782fab5d43f4deea48c5f5e344b8fed2b0", size = 986090, upload-time = "2025-11-05T20:42:03.581Z" },
+    { url = "https://files.pythonhosted.org/packages/33/b8/133aa4002cee0ebbb39362f94e4898eec7fbd09cec9fcbce1cd65b355b7f/rignore-0.7.6-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:2673225dcec7f90497e79438c35e34638d0d0391ccea3cbb79bfb9adc0dc5bd7", size = 1079656, upload-time = "2025-11-05T21:40:24.89Z" },
+    { url = "https://files.pythonhosted.org/packages/67/56/36d5d34210e5e7dfcd134eed8335b19e80ae940ee758f493e4f2b344dd70/rignore-0.7.6-pp311-pypy311_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:c081f17290d8a2b96052b79207622aa635686ea39d502b976836384ede3d303c", size = 1139789, upload-time = "2025-11-05T21:40:42.119Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/5b/bb4f9420802bf73678033a4a55ab1bede36ce2e9b41fec5f966d83d932b3/rignore-0.7.6-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:57e8327aacc27f921968cb2a174f9e47b084ce9a7dd0122c8132d22358f6bd79", size = 1120308, upload-time = "2025-11-05T21:40:59.402Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/8b/a1299085b28a2f6135e30370b126e3c5055b61908622f2488ade67641479/rignore-0.7.6-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:d8955b57e42f2a5434670d5aa7b75eaf6e74602ccd8955dddf7045379cd762fb", size = 1129444, upload-time = "2025-11-05T21:41:17.906Z" },
+]
+
 [[package]]
 name = "rpds-py"
 version = "0.30.0"
@@ -3948,6 +5403,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fe/02/c5e3bc518655d714622bec87d83db9cdba1cd0619a4a04e2109751c4f47f/sentencepiece-0.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:daeb5e9e9fcad012324807856113708614d534f596d5008638eb9b40112cd9e4", size = 1033923, upload-time = "2025-08-12T06:59:51.952Z" },
 ]
 
+[[package]]
+name = "sentry-sdk"
+version = "2.63.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ba/c8/b3c970a5b186722d276cd40a05b3254e03bccc0208560aff20f612e018e8/sentry_sdk-2.63.0.tar.gz", hash = "sha256:2a1502bf864769275dbc8c2c9fc7a0f7f5e18358180b615d262d13a31ffba216", size = 912449, upload-time = "2026-06-16T12:45:57.553Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7b/57/cb205f7d93373120f666b9c5736dc0815524d96a9b278e7a728f018dc22a/sentry_sdk-2.63.0-py3-none-any.whl", hash = "sha256:3a9b5ddd403f79eb73bd670f75f04485819db53d28f76ced7bc09041cb0dfd6a", size = 495950, upload-time = "2026-06-16T12:45:55.819Z" },
+]
+
 [[package]]
 name = "seqeval"
 version = "1.2.2"
@@ -3959,6 +5427,50 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz", hash = "sha256:f28e97c3ab96d6fcd32b648f6438ff2e09cfba87f05939da9b3970713ec56e6f", size = 43605, upload-time = "2020-10-24T00:24:54.926Z" }
 
+[[package]]
+name = "setproctitle"
+version = "1.3.7"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8d/48/49393a96a2eef1ab418b17475fb92b8fcfad83d099e678751b05472e69de/setproctitle-1.3.7.tar.gz", hash = "sha256:bc2bc917691c1537d5b9bca1468437176809c7e11e5694ca79a9ca12345dcb9e", size = 27002, upload-time = "2025-09-05T12:51:25.278Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/48/fb401ec8c4953d519d05c87feca816ad668b8258448ff60579ac7a1c1386/setproctitle-1.3.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cf555b6299f10a6eb44e4f96d2f5a3884c70ce25dc5c8796aaa2f7b40e72cb1b", size = 18079, upload-time = "2025-09-05T12:49:07.732Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/a3/c2b0333c2716fb3b4c9a973dd113366ac51b4f8d56b500f4f8f704b4817a/setproctitle-1.3.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:690b4776f9c15aaf1023bb07d7c5b797681a17af98a4a69e76a1d504e41108b7", size = 13099, upload-time = "2025-09-05T12:49:09.222Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/f8/17bda581c517678260e6541b600eeb67745f53596dc077174141ba2f6702/setproctitle-1.3.7-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:00afa6fc507967d8c9d592a887cdc6c1f5742ceac6a4354d111ca0214847732c", size = 31793, upload-time = "2025-09-05T12:49:10.297Z" },
+    { url = "https://files.pythonhosted.org/packages/27/d1/76a33ae80d4e788ecab9eb9b53db03e81cfc95367ec7e3fbf4989962fedd/setproctitle-1.3.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9e02667f6b9fc1238ba753c0f4b0a37ae184ce8f3bbbc38e115d99646b3f4cd3", size = 32779, upload-time = "2025-09-05T12:49:12.157Z" },
+    { url = "https://files.pythonhosted.org/packages/59/27/1a07c38121967061564f5e0884414a5ab11a783260450172d4fc68c15621/setproctitle-1.3.7-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:83fcd271567d133eb9532d3b067c8a75be175b2b3b271e2812921a05303a693f", size = 34578, upload-time = "2025-09-05T12:49:13.393Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/d4/725e6353935962d8bb12cbf7e7abba1d0d738c7f6935f90239d8e1ccf913/setproctitle-1.3.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:13fe37951dda1a45c35d77d06e3da5d90e4f875c4918a7312b3b4556cfa7ff64", size = 32030, upload-time = "2025-09-05T12:49:15.362Z" },
+    { url = "https://files.pythonhosted.org/packages/67/24/e4677ae8e1cb0d549ab558b12db10c175a889be0974c589c428fece5433e/setproctitle-1.3.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a05509cfb2059e5d2ddff701d38e474169e9ce2a298cf1b6fd5f3a213a553fe5", size = 33363, upload-time = "2025-09-05T12:49:16.829Z" },
+    { url = "https://files.pythonhosted.org/packages/55/d4/69ce66e4373a48fdbb37489f3ded476bb393e27f514968c3a69a67343ae0/setproctitle-1.3.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6da835e76ae18574859224a75db6e15c4c2aaa66d300a57efeaa4c97ca4c7381", size = 31508, upload-time = "2025-09-05T12:49:18.032Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/5a/42c1ed0e9665d068146a68326529b5686a1881c8b9197c2664db4baf6aeb/setproctitle-1.3.7-cp310-cp310-win32.whl", hash = "sha256:9e803d1b1e20240a93bac0bc1025363f7f80cb7eab67dfe21efc0686cc59ad7c", size = 12558, upload-time = "2025-09-05T12:49:19.742Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/fe/dd206cc19a25561921456f6cb12b405635319299b6f366e0bebe872abc18/setproctitle-1.3.7-cp310-cp310-win_amd64.whl", hash = "sha256:a97200acc6b64ec4cada52c2ecaf1fba1ef9429ce9c542f8a7db5bcaa9dcbd95", size = 13245, upload-time = "2025-09-05T12:49:21.023Z" },
+    { url = "https://files.pythonhosted.org/packages/04/cd/1b7ba5cad635510720ce19d7122154df96a2387d2a74217be552887c93e5/setproctitle-1.3.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a600eeb4145fb0ee6c287cb82a2884bd4ec5bbb076921e287039dcc7b7cc6dd0", size = 18085, upload-time = "2025-09-05T12:49:22.183Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/1a/b2da0a620490aae355f9d72072ac13e901a9fec809a6a24fc6493a8f3c35/setproctitle-1.3.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:97a090fed480471bb175689859532709e28c085087e344bca45cf318034f70c4", size = 13097, upload-time = "2025-09-05T12:49:23.322Z" },
+    { url = "https://files.pythonhosted.org/packages/18/2e/bd03ff02432a181c1787f6fc2a678f53b7dacdd5ded69c318fe1619556e8/setproctitle-1.3.7-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1607b963e7b53e24ec8a2cb4e0ab3ae591d7c6bf0a160feef0551da63452b37f", size = 32191, upload-time = "2025-09-05T12:49:24.567Z" },
+    { url = "https://files.pythonhosted.org/packages/28/78/1e62fc0937a8549f2220445ed2175daacee9b6764c7963b16148119b016d/setproctitle-1.3.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a20fb1a3974e2dab857870cf874b325b8705605cb7e7e8bcbb915bca896f52a9", size = 33203, upload-time = "2025-09-05T12:49:25.871Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/3c/65edc65db3fa3df400cf13b05e9d41a3c77517b4839ce873aa6b4043184f/setproctitle-1.3.7-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f8d961bba676e07d77665204f36cffaa260f526e7b32d07ab3df6a2c1dfb44ba", size = 34963, upload-time = "2025-09-05T12:49:27.044Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/32/89157e3de997973e306e44152522385f428e16f92f3cf113461489e1e2ee/setproctitle-1.3.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:db0fd964fbd3a9f8999b502f65bd2e20883fdb5b1fae3a424e66db9a793ed307", size = 32398, upload-time = "2025-09-05T12:49:28.909Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/18/77a765a339ddf046844cb4513353d8e9dcd8183da9cdba6e078713e6b0b2/setproctitle-1.3.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:db116850fcf7cca19492030f8d3b4b6e231278e8fe097a043957d22ce1bdf3ee", size = 33657, upload-time = "2025-09-05T12:49:30.323Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/63/f0b6205c64d74d2a24a58644a38ec77bdbaa6afc13747e75973bf8904932/setproctitle-1.3.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:316664d8b24a5c91ee244460bdaf7a74a707adaa9e14fbe0dc0a53168bb9aba1", size = 31836, upload-time = "2025-09-05T12:49:32.309Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/51/e1277f9ba302f1a250bbd3eedbbee747a244b3cc682eb58fb9733968f6d8/setproctitle-1.3.7-cp311-cp311-win32.whl", hash = "sha256:b74774ca471c86c09b9d5037c8451fff06bb82cd320d26ae5a01c758088c0d5d", size = 12556, upload-time = "2025-09-05T12:49:33.529Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/7b/822a23f17e9003dfdee92cd72758441ca2a3680388da813a371b716fb07f/setproctitle-1.3.7-cp311-cp311-win_amd64.whl", hash = "sha256:acb9097213a8dd3410ed9f0dc147840e45ca9797785272928d4be3f0e69e3be4", size = 13243, upload-time = "2025-09-05T12:49:34.553Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/f0/2dc88e842077719d7384d86cc47403e5102810492b33680e7dadcee64cd8/setproctitle-1.3.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2dc99aec591ab6126e636b11035a70991bc1ab7a261da428491a40b84376654e", size = 18049, upload-time = "2025-09-05T12:49:36.241Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/b4/50940504466689cda65680c9e9a1e518e5750c10490639fa687489ac7013/setproctitle-1.3.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cdd8aa571b7aa39840fdbea620e308a19691ff595c3a10231e9ee830339dd798", size = 13079, upload-time = "2025-09-05T12:49:38.088Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/99/71630546b9395b095f4082be41165d1078204d1696c2d9baade3de3202d0/setproctitle-1.3.7-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2906b6c7959cdb75f46159bf0acd8cc9906cf1361c9e1ded0d065fe8f9039629", size = 32932, upload-time = "2025-09-05T12:49:39.271Z" },
+    { url = "https://files.pythonhosted.org/packages/50/22/cee06af4ffcfb0e8aba047bd44f5262e644199ae7527ae2c1f672b86495c/setproctitle-1.3.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6915964a6dda07920a1159321dcd6d94fc7fc526f815ca08a8063aeca3c204f1", size = 33736, upload-time = "2025-09-05T12:49:40.565Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/00/a5949a8bb06ef5e7df214fc393bb2fb6aedf0479b17214e57750dfdd0f24/setproctitle-1.3.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cff72899861c765bd4021d1ff1c68d60edc129711a2fdba77f9cb69ef726a8b6", size = 35605, upload-time = "2025-09-05T12:49:42.362Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/3a/50caca532a9343828e3bf5778c7a84d6c737a249b1796d50dd680290594d/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b7cb05bd446687ff816a3aaaf831047fc4c364feff7ada94a66024f1367b448c", size = 33143, upload-time = "2025-09-05T12:49:43.515Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/14/b843a251296ce55e2e17c017d6b9f11ce0d3d070e9265de4ecad948b913d/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3a57b9a00de8cae7e2a1f7b9f0c2ac7b69372159e16a7708aa2f38f9e5cc987a", size = 34434, upload-time = "2025-09-05T12:49:45.31Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/b7/06145c238c0a6d2c4bc881f8be230bb9f36d2bf51aff7bddcb796d5eed67/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d8828b356114f6b308b04afe398ed93803d7fca4a955dd3abe84430e28d33739", size = 32795, upload-time = "2025-09-05T12:49:46.419Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/dc/ef76a81fac9bf27b84ed23df19c1f67391a753eed6e3c2254ebcb5133f56/setproctitle-1.3.7-cp312-cp312-win32.whl", hash = "sha256:b0304f905efc845829ac2bc791ddebb976db2885f6171f4a3de678d7ee3f7c9f", size = 12552, upload-time = "2025-09-05T12:49:47.635Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/5b/a9fe517912cd6e28cf43a212b80cb679ff179a91b623138a99796d7d18a0/setproctitle-1.3.7-cp312-cp312-win_amd64.whl", hash = "sha256:9888ceb4faea3116cf02a920ff00bfbc8cc899743e4b4ac914b03625bdc3c300", size = 13247, upload-time = "2025-09-05T12:49:49.16Z" },
+    { url = "https://files.pythonhosted.org/packages/34/8a/aff5506ce89bc3168cb492b18ba45573158d528184e8a9759a05a09088a9/setproctitle-1.3.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:eb440c5644a448e6203935ed60466ec8d0df7278cd22dc6cf782d07911bcbea6", size = 12654, upload-time = "2025-09-05T12:51:17.141Z" },
+    { url = "https://files.pythonhosted.org/packages/41/89/5b6f2faedd6ced3d3c085a5efbd91380fb1f61f4c12bc42acad37932f4e9/setproctitle-1.3.7-pp310-pypy310_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:502b902a0e4c69031b87870ff4986c290ebbb12d6038a70639f09c331b18efb2", size = 14284, upload-time = "2025-09-05T12:51:18.393Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/c0/4312fed3ca393a29589603fd48f17937b4ed0638b923bac75a728382e730/setproctitle-1.3.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f6f268caeabb37ccd824d749e7ce0ec6337c4ed954adba33ec0d90cc46b0ab78", size = 13282, upload-time = "2025-09-05T12:51:19.703Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/5b/5e1c117ac84e3cefcf8d7a7f6b2461795a87e20869da065a5c087149060b/setproctitle-1.3.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:b1cac6a4b0252b8811d60b6d8d0f157c0fdfed379ac89c25a914e6346cf355a1", size = 12587, upload-time = "2025-09-05T12:51:21.195Z" },
+    { url = "https://files.pythonhosted.org/packages/73/02/b9eadc226195dcfa90eed37afe56b5dd6fa2f0e5220ab8b7867b8862b926/setproctitle-1.3.7-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f1704c9e041f2b1dc38f5be4552e141e1432fba3dd52c72eeffd5bc2db04dc65", size = 14286, upload-time = "2025-09-05T12:51:22.61Z" },
+    { url = "https://files.pythonhosted.org/packages/28/26/1be1d2a53c2a91ec48fa2ff4a409b395f836798adf194d99de9c059419ea/setproctitle-1.3.7-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b08b61976ffa548bd5349ce54404bf6b2d51bd74d4f1b241ed1b0f25bce09c3a", size = 13282, upload-time = "2025-09-05T12:51:24.094Z" },
+]
+
 [[package]]
 name = "setuptools"
 version = "80.9.0"
@@ -4333,14 +5845,24 @@ wheels = [
 
 [[package]]
 name = "starlette"
-version = "0.41.3"
+version = "0.49.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
+    { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/1a/4c/9b5764bd22eec91c4039ef4c55334e9187085da2d8a2df7bd570869aae18/starlette-0.41.3.tar.gz", hash = "sha256:0e4ab3d16522a255be6b28260b938eae2482f98ce5cc934cb08dce8dc3ba5835", size = 2574159, upload-time = "2024-11-18T19:45:04.283Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/de/1a/608df0b10b53b0beb96a37854ee05864d182ddd4b1156a22f1ad3860425a/starlette-0.49.3.tar.gz", hash = "sha256:1c14546f299b5901a1ea0e34410575bc33bbd741377a10484a54445588d00284", size = 2655031, upload-time = "2025-11-01T15:12:26.13Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a3/e0/021c772d6a662f43b63044ab481dc6ac7592447605b5b35a957785363122/starlette-0.49.3-py3-none-any.whl", hash = "sha256:b579b99715fdc2980cf88c8ec96d3bf1ce16f5a8051a7c2b84ef9b1cdecaea2f", size = 74340, upload-time = "2025-11-01T15:12:24.387Z" },
+]
+
+[[package]]
+name = "supervisor"
+version = "4.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a9/b5/37e7a3706de436a8a2d75334711dad1afb4ddffab09f25e31d89e467542f/supervisor-4.3.0.tar.gz", hash = "sha256:4a2bf149adf42997e1bb44b70c43b613275ec9852c3edacca86a9166b27e945e", size = 468912, upload-time = "2025-08-23T18:25:02.418Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/96/00/2b325970b3060c7cecebab6d295afe763365822b1306a12eeab198f74323/starlette-0.41.3-py3-none-any.whl", hash = "sha256:44cedb2b7c77a9de33a8b74b2b90e9f50d11fcf25d8270ea525ad71a25374ff7", size = 73225, upload-time = "2024-11-18T19:45:02.027Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/65/5e726c372da8a5e35022a94388b12252710aad0c2351699c3d76ae8dba78/supervisor-4.3.0-py2.py3-none-any.whl", hash = "sha256:0bcb763fddafba410f35cbde226aa7f8514b9fb82eb05a0c85f6588d1c13f8db", size = 320736, upload-time = "2025-08-23T18:25:00.767Z" },
 ]
 
 [[package]]
@@ -4355,6 +5877,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
 ]
 
+[[package]]
+name = "tabulate"
+version = "0.10.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/46/58/8c37dea7bbf769b20d58e7ace7e5edfe65b849442b00ffcdd56be88697c6/tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d", size = 91754, upload-time = "2026-03-04T18:55:34.402Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/99/55/db07de81b5c630da5cbf5c7df646580ca26dfaefa593667fc6f2fe016d2e/tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3", size = 39814, upload-time = "2026-03-04T18:55:31.284Z" },
+]
+
+[[package]]
+name = "tenacity"
+version = "9.1.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" },
+]
+
 [[package]]
 name = "thinc"
 version = "8.3.10"
@@ -4409,6 +5949,39 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" },
 ]
 
+[[package]]
+name = "tiktoken"
+version = "0.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "regex" },
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e4/e5/5f3cb2159769d0f4324c0e9e87f9de3c4b1cd45848a96b2eb3566ad5ca77/tiktoken-0.13.0.tar.gz", hash = "sha256:c9435714c3a84c2319499de9a300c0e604449dd0799ff246458b3bb6a7f433c1", size = 38986, upload-time = "2026-05-15T04:51:27.153Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/38/e3/03c90dadcf5b3f82b83cee9adee60ef666b329c654f58c066af44eae0287/tiktoken-0.13.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:47b1df8d73390a24f94980c75158cdd5c56d256f16d55f30cb49c230caba9ba4", size = 1036627, upload-time = "2026-05-15T04:50:11.229Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/30/760463e5b2e8ad2bc229ae0a17ecb06727b6cbc094f08d8f65844315632e/tiktoken-0.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7d40c6c5aab171dcd6eb8455bc567bde404bb9def60cdb8c1299cc782b242bb9", size = 984699, upload-time = "2026-05-15T04:50:12.874Z" },
+    { url = "https://files.pythonhosted.org/packages/de/8a/8895f342a6b6aabd1a358e672f6f077b3ae51d0c63ca605d142db3bcd8ab/tiktoken-0.13.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:9b842981fa91accdffd48ff6408a977b7a91c3fbda55d353c3c68114d5c9d69e", size = 1118690, upload-time = "2026-05-15T04:50:14.234Z" },
+    { url = "https://files.pythonhosted.org/packages/51/e0/92557768fb0801f0d9dd9243cb9b6d342900b05e4b1006d4771f49ce233e/tiktoken-0.13.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ed5a30027cb4d8c7ca8b273d4766f3db3cf58fad9e9f3b1a68a351ffb54873d5", size = 1138423, upload-time = "2026-05-15T04:50:15.668Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/b9/a3d99feeedb032ffd09cd6652077f86bdee9a70dd0b990b2b272b445d4c3/tiktoken-0.13.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7ab10f4a21c2999846940113f6dbd72e0fa06a24119feddd74cc47e85818e06d", size = 1185077, upload-time = "2026-05-15T04:50:17.19Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/93/bab868277d475dc6d2aaacd34cdd239c282f4908dcc8702e0a3311a8e032/tiktoken-0.13.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a2937ad042d49d50eac6e1ba07c5661d4bd3942a5b1e0c0d08475c4df83676e1", size = 1241702, upload-time = "2026-05-15T04:50:18.772Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/16/27e9f7e0ed76e501cfefc9fb2112df4c7bf70ca96945b15ecb7615aac860/tiktoken-0.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:44733b99bfd72b590cd0936b1c01b3b4dd73122db2d544bc1ceeb18a7678c910", size = 876565, upload-time = "2026-05-15T04:50:20.268Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/4c/1bc81f4cd53e827c4ee67ca951b5935724716049452d8dfa09b8b82372bb/tiktoken-0.13.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:7bfe1849caa65d1e1d9871817170ec497bbb7984e182012e1bdce72f66608cdb", size = 1036353, upload-time = "2026-05-15T04:50:21.757Z" },
+    { url = "https://files.pythonhosted.org/packages/75/91/10b9c7076bc02c246c853201fdbbe300a4b8c5ed7b84c25f7403f4e32655/tiktoken-0.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:91c180fe255bd5a86d8316210d2833a1d4d33d026cd86a67812f4773743c8d26", size = 984644, upload-time = "2026-05-15T04:50:23.256Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/e4/fceae98015fab47fcd49b8bd7f46145bcd187a47e0add1e5378ed67ef980/tiktoken-0.13.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:059c8ecf554eb5b41e6e054ba467b871b03277d267dee7244380aca4359747d4", size = 1119261, upload-time = "2026-05-15T04:50:24.348Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/39/fe42ad00de01a8c4a49ad8649a2c8a316835a9cad5961b11d21eac0020a5/tiktoken-0.13.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:36217497eaffc158607a3b26f065300db2aefd43b115263f3b9688ce38146173", size = 1138253, upload-time = "2026-05-15T04:50:25.505Z" },
+    { url = "https://files.pythonhosted.org/packages/03/c4/ccee1ecccca107e9a16efcecdeeb964c325305038554d466ece65b42338f/tiktoken-0.13.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:303f7d91b4fce3baddbcde05c139091d4caa5026ac7214c1dc7ff7a71ee429ff", size = 1185747, upload-time = "2026-05-15T04:50:27.02Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/03/cd0cba295522b91eb55c6b2704f1df895f8226cfe60ab10d4d51d0cc9e69/tiktoken-0.13.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5d48843bee149630eb735a99e1f4a85b47308d21868ea63163f6e87768d3cfed", size = 1241265, upload-time = "2026-05-15T04:50:28.815Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/25/a10efd564402d82c2ff50d12057353ace447aa8007deceaa48641f63d35c/tiktoken-0.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:fc1c44cd37b43fc46bae593129164f4f281e82ea116b57a85aa81bda57eafc94", size = 876509, upload-time = "2026-05-15T04:50:30.026Z" },
+    { url = "https://files.pythonhosted.org/packages/85/8e/144bde4e01df66b34bb865557c7cd754ed08b036217ebd79c9db5e9048a9/tiktoken-0.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:32ac870a806cfb260a02d0cb70426aef02e038297f8ad50df5040bb5af360791", size = 1034888, upload-time = "2026-05-15T04:50:31.579Z" },
+    { url = "https://files.pythonhosted.org/packages/36/18/d4ac9d20956cdebca04841316660ed584c2fecdc2b81722a28bc7ad3b1e4/tiktoken-0.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d9980f11429ed2d737c463bb1fb78cf330caa026adf002f714aced7849a687b", size = 982970, upload-time = "2026-05-15T04:50:32.961Z" },
+    { url = "https://files.pythonhosted.org/packages/74/ed/6bb8d05b9f731f749fee5c6f5ca63e981143c826a5985877330507bd13b7/tiktoken-0.13.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3f277ebea5edd7b8bf03c6f9431e1d67d517530115572b2dc1d465326e8f88c7", size = 1115741, upload-time = "2026-05-15T04:50:34.475Z" },
+    { url = "https://files.pythonhosted.org/packages/34/de/2ca96b07a82d972b74fe4b46de055b79c904e45c7eab699354a0bfa697dc/tiktoken-0.13.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a116178fa7e1b4065bff05214360373a65cac22f965be7b3f73d00a0dbfe7649", size = 1136523, upload-time = "2026-05-15T04:50:35.782Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/dc/9dafec002c2d4424378563cf4cf5c7fb93631d2a55013c8b87554ee4012c/tiktoken-0.13.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2c397ddda233208345b01bd30f2fca79ff730e55731d0108a603f9bc57f6af3b", size = 1181954, upload-time = "2026-05-15T04:50:36.99Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/d0/1f8578c45b2f24759b46f0b50d31878c63c73e6bf0f2227e10ec5c5408dc/tiktoken-0.13.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:95097e4f89b06403976e498abf61a0ee73a7497e73fb599cb211d8197a054d91", size = 1240069, upload-time = "2026-05-15T04:50:38.221Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/90/28d7f154888610aa9237e541986beb62b479df29d193a5a0617dbb1514d0/tiktoken-0.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:8f2d16e7a7c783ad81f36e457d046d1f1c8af70b22aec8a13238efe531977c41", size = 874748, upload-time = "2026-05-15T04:50:39.587Z" },
+]
+
 [[package]]
 name = "tokenizers"
 version = "0.22.2"
@@ -4518,6 +6091,53 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/dd/5f/b85bd8c05312d71de9402bf5868d217c38827cfd09d8f8514e5be128a52b/torch-2.9.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:33f58e9a102a91259af289d50525c30323b5c9ae1d31322b6447c0814da68695", size = 74478983, upload-time = "2025-10-15T15:46:39.406Z" },
 ]
 
+[[package]]
+name = "torchaudio"
+version = "2.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "torch" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/aa/7fce684dc0e21f8ea3ecf4a9f37253f8fa0b51aa0973202b58f33b9dc031/torchaudio-2.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:214d2e8bec2b204ac3f552f3dceae51550e06a91c5863d5dc341d81691ef655e", size = 806922, upload-time = "2025-10-15T15:51:53.069Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/c2/212181b1df762487462b3a092f6a9ae6ba87df02df71bb2121c100b13b8d/torchaudio-2.9.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:1e84e45f74bf5b208b5ce59b36f26ec1e5f63596542c3ebee6edeadf85e73563", size = 473802, upload-time = "2025-10-15T15:51:55.626Z" },
+    { url = "https://files.pythonhosted.org/packages/39/27/75184741da9aa1e94ec136319781e1275a560d1c311a293cc22aba747863/torchaudio-2.9.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:905f2c916e392b6dde375c002abe98f6fc64705fdf1192c90a6df2de235305f3", size = 2055464, upload-time = "2025-10-15T15:51:57.996Z" },
+    { url = "https://files.pythonhosted.org/packages/43/af/f12349d7cb325b9b36452192953eb8c4ca9a6c28c8335c2d2f5e576be7f3/torchaudio-2.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:4ed556da9de16f69ccbe804df510ae8fefdf995cbdc2fcf26ea7532d25463326", size = 663878, upload-time = "2025-10-15T15:52:07.274Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/a2/7696b9579ad0c40b78ce2774fb24875c43257f3d0d24540e1cfa946c13b4/torchaudio-2.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:662eb49ab25e1a2b7367bb072a8ad05c8a4b650ebbe7090a5af1a1eb1d40767c", size = 808368, upload-time = "2025-10-15T15:51:56.56Z" },
+    { url = "https://files.pythonhosted.org/packages/55/1a/48d528cae6050b9a5f07c1c942b547143237e9f080f4a2ccb80ba88486df/torchaudio-2.9.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:914f1408142bdeda1ca9f834dd04967625fccc75893bd1504a018a13a04f1b66", size = 475720, upload-time = "2025-10-15T15:51:59.111Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/41/7aba77bc89d06df993c1519b66b7e0b09661d297d0eb8c044ab2c5af665f/torchaudio-2.9.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:86b15ce1d74814d5ca14bfac0d3b33f325c8cac4a6f09dcc5b82748133a96792", size = 2058688, upload-time = "2025-10-15T15:52:01.885Z" },
+    { url = "https://files.pythonhosted.org/packages/96/64/93944c24d7ec76dff3315f9aaf382e86d09fa2c865942c3d6b48666e5b1d/torchaudio-2.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:840487d748128ded45bd65b213b55db701ad047544e77ae3c57ea48f55623a77", size = 664692, upload-time = "2025-10-15T15:52:02.908Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/63/3c0ede3aa3d19a8a6698ddd107fa88660549360b51bf8ce2717cd498d800/torchaudio-2.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab4cbcccfd873b0fb41fcb39c9869e59ef84bb95b093f6f58e2d05172a7500d2", size = 809116, upload-time = "2025-10-15T15:52:00.911Z" },
+    { url = "https://files.pythonhosted.org/packages/be/d5/25e58745defe9d05893d3cba5c0e1a76aeaac503ac5ec4d9f83c871df71c/torchaudio-2.9.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:7f93388b6e536c14d6015b6f75277a8b45efc532f61b35adc1ed06c98a86003e", size = 476020, upload-time = "2025-10-15T15:51:59.967Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/9c/58b8b49dfba2ae85e41ca86b0c52de45bbbea01987490de219c99c523a58/torchaudio-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:508318a2130b40ad51378f90caf8727a4bd3ac2b296f2b90c900b44e6068a940", size = 2059901, upload-time = "2025-10-15T15:51:54.634Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/eb/58b05f75d12f69ccc460893a20c999da082e063082120ed06e05cca3a053/torchaudio-2.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:82117e3a605f2959dc09b4cd8a11178d6e92727d5f85e5d4f9fe47502f84ee96", size = 665350, upload-time = "2025-10-15T15:52:08.384Z" },
+]
+
+[[package]]
+name = "torchvision"
+version = "0.24.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "pillow", version = "11.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pillow", version = "12.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "torch" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/63/5b/1404eeab00819df71a30e916c2081654366741f7838fcc4fff86b7bd9e7e/torchvision-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5e8d5e667deff87bd66d26df6d225f46224bb0782d4f3f8f5d2f3068b5fd4492", size = 1891723, upload-time = "2025-10-15T15:51:08.5Z" },
+    { url = "https://files.pythonhosted.org/packages/88/e3/1b003ecd52bd721f8304aeb66691edfbc2002747ec83d36188ad6abab506/torchvision-0.24.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a110a51c75e89807a8382b0d8034f5e180fb9319570be3389ffd3d4ac4fd57a9", size = 2418988, upload-time = "2025-10-15T15:51:25.195Z" },
+    { url = "https://files.pythonhosted.org/packages/56/2e/3c19a35e62da0f606baf8f6e2ceeab1eb66aaa2f84c6528538b06b416d54/torchvision-0.24.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:81d5b12a6df1bb2cc8bdbad837b637d6ea446f2866e6d94f1b5d478856331be3", size = 8046769, upload-time = "2025-10-15T15:51:15.221Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/1d/e7ab614a1ace820a2366eab1532679fbe81bd9501ffd6a1b7be14936366d/torchvision-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:0839dbb305d34671f5a64f558782095134b04bbeff8b90f11eb80515d7d50092", size = 3686529, upload-time = "2025-10-15T15:51:20.982Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/17/54ed2ec6944ea972b461a86424c8c7f98835982c90cbc45bf59bd962863a/torchvision-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f771cf918351ad509a28488be475f3e9cc71a750d6b1467842bfb64863a5e986", size = 1891719, upload-time = "2025-10-15T15:51:10.384Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/07/0cd6776eee784742ad3cb2bfd3295383d84cb2f9e87386119333d1587f0f/torchvision-0.24.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbd63bf4ebff84c48c50123eba90526cc9f794fe45bc9f5dd07cec19e8c62bce", size = 2420513, upload-time = "2025-10-15T15:51:18.087Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/f4/6026c08011ddcefcbc14161c5aa9dce55c35c6b045e04ef0952e88bf4594/torchvision-0.24.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:78fe414b3bb6dbf7e6f6da6f733ba96881f6b29a9b997228de7c5f603e5ed940", size = 8048018, upload-time = "2025-10-15T15:51:13.579Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/b4/362b4e67ed87cee0fb4f8f0363a852eaeef527968bf62c07ed56f764d729/torchvision-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:629584b94e52f32a6278f2a35d85eeaae95fcc38730fcb765064f26c3c96df5d", size = 4027686, upload-time = "2025-10-15T15:51:19.189Z" },
+    { url = "https://files.pythonhosted.org/packages/47/ef/81e4e69e02e2c4650b30e8c11c8974f946682a30e0ab7e9803a831beff76/torchvision-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c61d40bcd2e2451e932902a702ad495ba1ec6f279e90b1e15cef2bb55dc911e2", size = 1891726, upload-time = "2025-10-15T15:51:16.977Z" },
+    { url = "https://files.pythonhosted.org/packages/00/7b/e3809b3302caea9a12c13f3adebe4fef127188438e719fd6c8dc93db1da6/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b0531d1483fc322d7da0d83be52f0df860a75114ab87dbeeb9de765feaeda843", size = 2419495, upload-time = "2025-10-15T15:51:11.885Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/e6/7324ead6793075a8c75c56abeed1236d1750de16a5613cfe2ddad164a92a/torchvision-0.24.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:26b9dd9c083f8e5f7ac827de6d5b88c615d9c582dc87666770fbdf16887e4c25", size = 8050480, upload-time = "2025-10-15T15:51:24.012Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/ad/3c56fcd2a0d6e8afa80e115b5ade4302232ec99655220a51d05709819523/torchvision-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:060b7c50ed4b3fb0316b08e2e31bfd874ec2f63ef5ae02f81e54341ca4e88703", size = 4292225, upload-time = "2025-10-15T15:51:27.699Z" },
+]
+
 [[package]]
 name = "tqdm"
 version = "4.67.1"
@@ -4681,11 +6301,68 @@ wheels = [
 
 [[package]]
 name = "urllib3"
-version = "2.6.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
+version = "2.6.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1e/24/a2a2ed9addd907787d7aa0355ba36a6cadf1768b934c652ea78acbd59dcd/urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797", size = 432930, upload-time = "2025-12-11T15:56:40.252Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6d/b9/4095b668ea3678bf6a0af005527f39de12fb026516fb3df17495a733b7f8/urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd", size = 131182, upload-time = "2025-12-11T15:56:38.584Z" },
+]
+
+[[package]]
+name = "uuid-utils"
+version = "0.16.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f8/5a/5da7ae85b38e3eddba0be3e8e4328f90882fe92989728e6fb552963d4c42/uuid_utils-0.16.2.tar.gz", hash = "sha256:fa637e4f314ad5b59ff6d8e809d506443d68bef30bfaecdfcfe02cce689abb2f", size = 42962, upload-time = "2026-06-18T13:36:48.735Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/41/6f/783ba3792d91b89af608c120e40285cafed6dafd9354042f7a0ea32fca5b/uuid_utils-0.16.2-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:89a627f74cb55aa508809592ab9149806649e4ee37f4bc91b60c7ec10929f0eb", size = 568338, upload-time = "2026-06-18T13:34:46.259Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/39/2def0fb3b15604afacd95aee1f65fb60bfa33293792aaaec7d9722a8eeec/uuid_utils-0.16.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e92875a315f3cc4fe7a2324c17b3c7ac5e3fd0e24b14fc4deb28370431fe6a2b", size = 289111, upload-time = "2026-06-18T13:34:47.602Z" },
+    { url = "https://files.pythonhosted.org/packages/78/42/4d6202d9f7136c99fae003c73317928c1ca31d9bfae19f8d8f5d7791efad/uuid_utils-0.16.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:945e1819dde4cce6828683ce11311977e73e6d46c6cc18e5fb9fcab2051b94bb", size = 327875, upload-time = "2026-06-18T13:34:48.787Z" },
+    { url = "https://files.pythonhosted.org/packages/09/b3/843d002eafcfbc7a1812fd76d6d0633936b63b392ae728084a87e62feeea/uuid_utils-0.16.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2cd4612085e6bbf6a00b9890779023ea97fe1ee8dd1758381022f7588a06e123", size = 334553, upload-time = "2026-06-18T13:34:50.239Z" },
+    { url = "https://files.pythonhosted.org/packages/34/61/2ad178d48f7a81772bdf56f699d7b9f2727b38531b28313f730bf19a39a9/uuid_utils-0.16.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:887efa34701d197239ec3b0e89993ee8c0cea1746483b606e54746ea81c966f4", size = 450493, upload-time = "2026-06-18T13:34:51.395Z" },
+    { url = "https://files.pythonhosted.org/packages/30/17/8e8ae0f16eb4d183f43dbec12487795ad98ba875d30ffa4a97ce0b291a3e/uuid_utils-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22446af2ae47d1054562b159bcd65714a022713e56697eef77cb5f291dc5ef13", size = 327342, upload-time = "2026-06-18T13:34:52.735Z" },
+    { url = "https://files.pythonhosted.org/packages/40/3a/21b94228241cbf9f834ac4274b0cf9d9ea169f0bfa6f4b332b215a043352/uuid_utils-0.16.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:41165aa4059e3b03605c1c8c48df6c887a16f8f6a1fc4cb2155360a61aad8666", size = 354058, upload-time = "2026-06-18T13:34:53.998Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/45/6211782355f89ec540f5bc38a50e526fd4954a92d24d6b89e2adcdc31ded/uuid_utils-0.16.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2ef60e0a91675cfd9850e8aefd0d899fe09c4afb572bbe0ac2de4f8848d7663d", size = 504750, upload-time = "2026-06-18T13:34:55.279Z" },
+    { url = "https://files.pythonhosted.org/packages/df/aa/023bac4d6eb1148b7546bfdec17328b89ff6b32fb54ed7aaaaae61a96ff9/uuid_utils-0.16.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:6fe3fb4bcecef69cacf3a11e182e204ce778998bd439152a173bdd2e9e8e9cfa", size = 610111, upload-time = "2026-06-18T13:34:56.549Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/7a/8fc55abdc1bf8e2fab855f323d6259d6ca64dde91bfbddad3e7f8c8d2537/uuid_utils-0.16.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fcc212dec731aeba110953643c214982e667cab9802f7d99d066e03ba0c44c90", size = 569768, upload-time = "2026-06-18T13:34:57.797Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/c2/26cbc525269fcea4cd2e8067ddff0cd7e74fe654414f1429d3d2279b6aac/uuid_utils-0.16.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3aa5c2ebc843e85a078ec27c1ad677871c44065b3dd58748166783a3c454859f", size = 533168, upload-time = "2026-06-18T13:34:59.101Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/c8/e015346ed76c9e5ea866ade786d328dee9d96241a905a190c408550586ba/uuid_utils-0.16.2-cp310-cp310-win32.whl", hash = "sha256:dcf20151d2aa451013f2b3c2cce06958f43b7573b5f616adb91786c7b777715b", size = 169892, upload-time = "2026-06-18T13:35:00.608Z" },
+    { url = "https://files.pythonhosted.org/packages/93/72/1fbd0db90f9fea4b0c54dc20dc2f26460f3f71d4273773d672bbff5c2fdf/uuid_utils-0.16.2-cp310-cp310-win_amd64.whl", hash = "sha256:f709169579a356132f224d525ed589f88d466bbb922b9d752d8d86b1fb57ad46", size = 176414, upload-time = "2026-06-18T13:35:01.688Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/96/4023966d42fd9bbf9e2a8ce2b25930113688128b569f68bc4697cb18181d/uuid_utils-0.16.2-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:fadd23eee409237fb8637a35796a6e108873c28b40f7de89a36685f18ca055ad", size = 567776, upload-time = "2026-06-18T13:35:02.902Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/30/764d2a76e8e7688abd5577e6024787c13692095eb1230fd1936f27205cd9/uuid_utils-0.16.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:79c5a3bd4301257b9a524efd16baf61ea65cd0d1b60b47d80f20b151fd65a09f", size = 288938, upload-time = "2026-06-18T13:35:04.285Z" },
+    { url = "https://files.pythonhosted.org/packages/45/ef/58077250fe04eda4a3f9fba8c35be8d0937b7d3e02302ac1a6d942b77dae/uuid_utils-0.16.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90903ab7fcdfb0300390c15f5a68cb91f15139d9a1a93f134c783d7a973fa269", size = 327387, upload-time = "2026-06-18T13:35:05.406Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/fd/a9172970fa07ce0b9148ccc679a99540375c7bb32f8fbd72cf1e6cca43ef/uuid_utils-0.16.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f7a44f8250ec178c0af703c3f1b6e81865a771272ae735ca403f27c95c62f132", size = 334212, upload-time = "2026-06-18T13:35:06.611Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/58/d8fb393b44ad0b719d96a5b7809d0ee727f7e266d9e88a4da235cbfbf9f8/uuid_utils-0.16.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e97ab941660f781a8e45f15aba9ee01b40dbb96adb5c43617c1671a4604b25b", size = 450379, upload-time = "2026-06-18T13:35:07.97Z" },
+    { url = "https://files.pythonhosted.org/packages/00/70/b3cf708e8942e6494742404a66f1586195a20c8fd235bdc79f385db383f1/uuid_utils-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a30b6a5790acb854e4b65fae7875e5d3c6f8076fa9c91dac43ff9e28380bc52", size = 327231, upload-time = "2026-06-18T13:35:09.327Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/9c/4c5b16e752a2402259a3a9d1371227025e5b85182024c82a446cbe3ed6ea/uuid_utils-0.16.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5dfc3e9e75139a84898771d31958ece6cdee8e8f127700aa8aa26a4f1a348d57", size = 353455, upload-time = "2026-06-18T13:35:10.67Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/0f/3b14c47fab1544bcfb92d28bc468160a4fc6ff342d0e6defa8ff40d5e4bd/uuid_utils-0.16.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0529b1ef0788f663e1211d221b59a38ec67f9b084f1ea5342ba84358b3d87e98", size = 504028, upload-time = "2026-06-18T13:35:12.006Z" },
+    { url = "https://files.pythonhosted.org/packages/39/79/1a133214626eb0e18c51ef196946b1263d65b578ffee432ad1b7afffa5d3/uuid_utils-0.16.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:cae08df8695f4b01fce2a8ab50e9e310971276d85dfc7103e977bed52d365094", size = 609803, upload-time = "2026-06-18T13:35:13.286Z" },
+    { url = "https://files.pythonhosted.org/packages/87/49/22bff932af63764b4cad9c01299ed64c60d101962988efc13964b4165345/uuid_utils-0.16.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f69658c42411540cf58be958a47e317fd2302cc0b613ea5cff1e60d87be2846d", size = 569512, upload-time = "2026-06-18T13:35:14.661Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/98/371cc1f332f7463b9cfac0a66f984af00f4e3ada4a196b20879e35404e8b/uuid_utils-0.16.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:503f020acc7dbeb39c47fa33cf2971cf5960fa11f8394513fac461762a90c556", size = 532855, upload-time = "2026-06-18T13:35:15.99Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/06/f7ef7f6dabf68021eb6e961c09d16d517ed7587cedfff18969ba7f61798c/uuid_utils-0.16.2-cp311-cp311-win32.whl", hash = "sha256:aab7cdf28a3e2859ca4f40a3e3bf53eb35895039c80d4d8d8c5e15b90346c55c", size = 169971, upload-time = "2026-06-18T13:35:17.294Z" },
+    { url = "https://files.pythonhosted.org/packages/75/8b/1e4b51c075eaadd23828b708249374db0bc40146f7b673027942d3383f45/uuid_utils-0.16.2-cp311-cp311-win_amd64.whl", hash = "sha256:71192a59d473f3f638e2a238905046e2942006ad90ac5ec10d578e58ff9a08ce", size = 176464, upload-time = "2026-06-18T13:35:18.459Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/71/18a43b6e632adf3cb3cf5db777ea03f9d3b2b259de65de5e41419004c2a1/uuid_utils-0.16.2-cp311-cp311-win_arm64.whl", hash = "sha256:ea175649789f1e93edbf1a0440cab18c9838977703917221777691d8d988d7bf", size = 176056, upload-time = "2026-06-18T13:35:19.826Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/07/294b72a572218bf6e92355203b832b3356c58a7e1e0b92a034497d15bef9/uuid_utils-0.16.2-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:6f064dc54c6abecb09eb104d953bfb079f3c395e0d6b18899979f852d1083549", size = 560726, upload-time = "2026-06-18T13:35:21.053Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/3c/1095b6ab574a7fa69136d47bab5a43f320a8f00a0ecb96059fd49b1747b2/uuid_utils-0.16.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:dd7aa18db5cc826d482d876a826fee445839701f81f78567e7c74b4458d57a84", size = 288065, upload-time = "2026-06-18T13:35:22.547Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/9d/6404d48fe71def0733c9568d96043b2e1945e2e4205c4eb525db3da42ba3/uuid_utils-0.16.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc25ad320c9b44c2d3ed33aff4f85b0b277bef4ff79b12c01ee58b52ea44be1d", size = 322946, upload-time = "2026-06-18T13:35:23.648Z" },
+    { url = "https://files.pythonhosted.org/packages/74/00/8a009762015a134aa04b5451400e0ec9832ccd598ed4845f9aecb0be6299/uuid_utils-0.16.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d0ca752d51d1004caff65fccffd44b32a26cb099b546e0512cfa09facb683d6c", size = 330186, upload-time = "2026-06-18T13:35:24.757Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/b0/1613bb98ac11234145aa5bc1de618be536818fef05dec595efb3e2b37097/uuid_utils-0.16.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8323136bb02355c1b973492ab98b0722206dfdedfb148e4115c35fcdf3889bad", size = 444583, upload-time = "2026-06-18T13:35:25.999Z" },
+    { url = "https://files.pythonhosted.org/packages/93/66/83e62c7a152bbbb8b30ac58eaad81f3860ba2fba91a334c50f223f9ce878/uuid_utils-0.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bf8bfdffb22f620635580b17fd178272f30a9841b824b19b935c8db64bf09b6", size = 323064, upload-time = "2026-06-18T13:35:27.356Z" },
+    { url = "https://files.pythonhosted.org/packages/15/37/c1b2faaf3a9d7952f321a9fee3ad74e05b25878bd9b7cd6b0398fe77f279/uuid_utils-0.16.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:61454f2139424a6cff14eca7849c28b3350f261453b74075aa20fe99592dbb16", size = 347967, upload-time = "2026-06-18T13:35:28.538Z" },
+    { url = "https://files.pythonhosted.org/packages/24/d8/cdf79b242e41ae47b7cd617ac5d48f15ce44e81da8000379c757091ae5f8/uuid_utils-0.16.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:725110434a1d482a639a9ac467a24f1cb531d84ab52e454a13fe145b10b42cae", size = 499187, upload-time = "2026-06-18T13:35:30.042Z" },
+    { url = "https://files.pythonhosted.org/packages/be/10/978d5ad82bc0fe7ff02d5be6f1eb83b090849f0a95bf8438593565273b7a/uuid_utils-0.16.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:8197870739a3094990743a80f075fa0b17beafd6c187e5f360e021d90a12a6d1", size = 605696, upload-time = "2026-06-18T13:35:31.289Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/28/e382ee44a592e35b80397b493bf3fbbdb8e30a64eaaefc7dabc246aeb253/uuid_utils-0.16.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e10a02b3a31ed44c7c9a96abde335f5fa222735e73f3081d693414377eb3b016", size = 564975, upload-time = "2026-06-18T13:35:32.419Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/d0/f6011dbe4e5d751a8494715e014019cb5b242d8cd6dbec1cfec3d3fb2e81/uuid_utils-0.16.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fd32dbca0792b9683160151dc07fad11b915020eed7c82b43faf0862c2ff06a0", size = 528462, upload-time = "2026-06-18T13:35:33.685Z" },
+    { url = "https://files.pythonhosted.org/packages/42/7f/279e6159c37f43feb9dd70218b49a26696cefddaef1db7f4b79895eaf5d5/uuid_utils-0.16.2-cp312-cp312-win32.whl", hash = "sha256:dcdfcab60562d12dd43c1a6f495b1d089e41f0e10fac37d94db285d72b678c23", size = 167047, upload-time = "2026-06-18T13:35:34.862Z" },
+    { url = "https://files.pythonhosted.org/packages/47/38/f72f7bed062601448ec2db47351e6c1faccd78fd693bbc6e067299d1fa11/uuid_utils-0.16.2-cp312-cp312-win_amd64.whl", hash = "sha256:97ee6f5e803ea571f5f6da42efc97d8c5a13f121043680177f8470529b94e855", size = 173821, upload-time = "2026-06-18T13:35:36.117Z" },
+    { url = "https://files.pythonhosted.org/packages/37/61/8a025284a31c85b7c0c5319e96868c2c09dea3fc5f676c979a4cd4baf2e7/uuid_utils-0.16.2-cp312-cp312-win_arm64.whl", hash = "sha256:72cfd9ff1e8a7c371a044687e77eb873721c4a9f4814e453439bfba595b84303", size = 172206, upload-time = "2026-06-18T13:35:37.339Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/48/8c9fee7d75571f2f4b2386eac798fe5f826155d13797f7c86d45eb3fdc23/uuid_utils-0.16.2-pp311-pypy311_pp73-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:8b8e325e61f918caf74ca540e3384b81e6e22aea782e57f615d15fc9773b96c8", size = 571003, upload-time = "2026-06-18T13:36:36.42Z" },
+    { url = "https://files.pythonhosted.org/packages/de/78/754eaaa49509be6fdb705de61d1e3889de32002132d5f00e8c1e5d212da3/uuid_utils-0.16.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9282677ebf2ea5b437c20d16e75bcd7629bdc205018f95557b33b76868d8bb5b", size = 290244, upload-time = "2026-06-18T13:36:38.066Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/e2/bfcbcf7eb9dfb17701104c569ed771eb359737bc70b7309e439610d089ef/uuid_utils-0.16.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e9ca7f5e215373cc9c147172170a0b1a4ab0dee9cc62fe446d9b075f31e3241", size = 328551, upload-time = "2026-06-18T13:36:39.605Z" },
+    { url = "https://files.pythonhosted.org/packages/72/bf/bbdbc39d1421953edcee0bad13a1893521a636eccf381580f53e530a4feb/uuid_utils-0.16.2-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:43cc72a92694d08ade8faadacf928857d9cceb84b449473246ae4e4f263d7d22", size = 335468, upload-time = "2026-06-18T13:36:41Z" },
+    { url = "https://files.pythonhosted.org/packages/04/2a/e8d4e6f1f2d2e567cf6e3202d125afe7da52ad7680bba048b106c09f01b9/uuid_utils-0.16.2-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:511b5fde12d29c37a9badd399af62105bb2f4696aa10eb18be74e7b9ca84413a", size = 450984, upload-time = "2026-06-18T13:36:42.635Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/da/ddb1dcf0fe9bfcb0dfcddec8ae52c8f95e7088e44719f58477f5fb2c5586/uuid_utils-0.16.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:585d3adf73afa60348bf2bd529491c640a692350e76d8ff3974455e273aadfe7", size = 327940, upload-time = "2026-06-18T13:36:44.138Z" },
+    { url = "https://files.pythonhosted.org/packages/37/fb/39305fbfffee1fdaccdb88fc0499ac9dcb7289a77ebc31938dcdd933cf95/uuid_utils-0.16.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ae5fa2007fd26d26f7b09e76259d5ca99bec191616207ca929f8dca12da08129", size = 355368, upload-time = "2026-06-18T13:36:45.682Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/70/b708edc3b776d7624b4354f43d443f14d951d3ac4d7d8867d94f2e59c3ae/uuid_utils-0.16.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:9b4520521aa46a2582fe1829c535fe60b78999b89257db998df3816eb895bdf3", size = 178221, upload-time = "2026-06-18T13:36:47.291Z" },
 ]
 
 [[package]]
@@ -4702,6 +6379,117 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3c/55/37407280931038a3f21fa0245d60edeaa76f18419581aa3f4397761c78df/uvicorn-0.31.1-py3-none-any.whl", hash = "sha256:adc42d9cac80cf3e51af97c1851648066841e7cfb6993a4ca8de29ac1548ed41", size = 63666, upload-time = "2024-10-09T19:44:18.734Z" },
 ]
 
+[package.optional-dependencies]
+standard = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "httptools" },
+    { name = "python-dotenv" },
+    { name = "pyyaml" },
+    { name = "uvloop", marker = "platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'" },
+    { name = "watchfiles" },
+    { name = "websockets" },
+]
+
+[[package]]
+name = "uvloop"
+version = "0.22.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/06/f0/18d39dbd1971d6d62c4629cc7fa67f74821b0dc1f5a77af43719de7936a7/uvloop-0.22.1.tar.gz", hash = "sha256:6c84bae345b9147082b17371e3dd5d42775bddce91f885499017f4607fdaf39f", size = 2443250, upload-time = "2025-10-16T22:17:19.342Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/eb/14/ecceb239b65adaaf7fde510aa8bd534075695d1e5f8dadfa32b5723d9cfb/uvloop-0.22.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ef6f0d4cc8a9fa1f6a910230cd53545d9a14479311e87e3cb225495952eb672c", size = 1343335, upload-time = "2025-10-16T22:16:11.43Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/ae/6f6f9af7f590b319c94532b9567409ba11f4fa71af1148cab1bf48a07048/uvloop-0.22.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7cd375a12b71d33d46af85a3343b35d98e8116134ba404bd657b3b1d15988792", size = 742903, upload-time = "2025-10-16T22:16:12.979Z" },
+    { url = "https://files.pythonhosted.org/packages/09/bd/3667151ad0702282a1f4d5d29288fce8a13c8b6858bf0978c219cd52b231/uvloop-0.22.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac33ed96229b7790eb729702751c0e93ac5bc3bcf52ae9eccbff30da09194b86", size = 3648499, upload-time = "2025-10-16T22:16:14.451Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/f6/21657bb3beb5f8c57ce8be3b83f653dd7933c2fd00545ed1b092d464799a/uvloop-0.22.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:481c990a7abe2c6f4fc3d98781cc9426ebd7f03a9aaa7eb03d3bfc68ac2a46bd", size = 3700133, upload-time = "2025-10-16T22:16:16.272Z" },
+    { url = "https://files.pythonhosted.org/packages/09/e0/604f61d004ded805f24974c87ddd8374ef675644f476f01f1df90e4cdf72/uvloop-0.22.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a592b043a47ad17911add5fbd087c76716d7c9ccc1d64ec9249ceafd735f03c2", size = 3512681, upload-time = "2025-10-16T22:16:18.07Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/ce/8491fd370b0230deb5eac69c7aae35b3be527e25a911c0acdffb922dc1cd/uvloop-0.22.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1489cf791aa7b6e8c8be1c5a080bae3a672791fcb4e9e12249b05862a2ca9cec", size = 3615261, upload-time = "2025-10-16T22:16:19.596Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/d5/69900f7883235562f1f50d8184bb7dd84a2fb61e9ec63f3782546fdbd057/uvloop-0.22.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c60ebcd36f7b240b30788554b6f0782454826a0ed765d8430652621b5de674b9", size = 1352420, upload-time = "2025-10-16T22:16:21.187Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/73/c4e271b3bce59724e291465cc936c37758886a4868787da0278b3b56b905/uvloop-0.22.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b7f102bf3cb1995cfeaee9321105e8f5da76fdb104cdad8986f85461a1b7b77", size = 748677, upload-time = "2025-10-16T22:16:22.558Z" },
+    { url = "https://files.pythonhosted.org/packages/86/94/9fb7fad2f824d25f8ecac0d70b94d0d48107ad5ece03769a9c543444f78a/uvloop-0.22.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53c85520781d84a4b8b230e24a5af5b0778efdb39142b424990ff1ef7c48ba21", size = 3753819, upload-time = "2025-10-16T22:16:23.903Z" },
+    { url = "https://files.pythonhosted.org/packages/74/4f/256aca690709e9b008b7108bc85fba619a2bc37c6d80743d18abad16ee09/uvloop-0.22.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56a2d1fae65fd82197cb8c53c367310b3eabe1bbb9fb5a04d28e3e3520e4f702", size = 3804529, upload-time = "2025-10-16T22:16:25.246Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/74/03c05ae4737e871923d21a76fe28b6aad57f5c03b6e6bfcfa5ad616013e4/uvloop-0.22.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40631b049d5972c6755b06d0bfe8233b1bd9a8a6392d9d1c45c10b6f9e9b2733", size = 3621267, upload-time = "2025-10-16T22:16:26.819Z" },
+    { url = "https://files.pythonhosted.org/packages/75/be/f8e590fe61d18b4a92070905497aec4c0e64ae1761498cad09023f3f4b3e/uvloop-0.22.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:535cc37b3a04f6cd2c1ef65fa1d370c9a35b6695df735fcff5427323f2cd5473", size = 3723105, upload-time = "2025-10-16T22:16:28.252Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/ff/7f72e8170be527b4977b033239a83a68d5c881cc4775fca255c677f7ac5d/uvloop-0.22.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fe94b4564e865d968414598eea1a6de60adba0c040ba4ed05ac1300de402cd42", size = 1359936, upload-time = "2025-10-16T22:16:29.436Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/c6/e5d433f88fd54d81ef4be58b2b7b0cea13c442454a1db703a1eea0db1a59/uvloop-0.22.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:51eb9bd88391483410daad430813d982010f9c9c89512321f5b60e2cddbdddd6", size = 752769, upload-time = "2025-10-16T22:16:30.493Z" },
+    { url = "https://files.pythonhosted.org/packages/24/68/a6ac446820273e71aa762fa21cdcc09861edd3536ff47c5cd3b7afb10eeb/uvloop-0.22.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:700e674a166ca5778255e0e1dc4e9d79ab2acc57b9171b79e65feba7184b3370", size = 4317413, upload-time = "2025-10-16T22:16:31.644Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/6f/e62b4dfc7ad6518e7eff2516f680d02a0f6eb62c0c212e152ca708a0085e/uvloop-0.22.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b5b1ac819a3f946d3b2ee07f09149578ae76066d70b44df3fa990add49a82e4", size = 4426307, upload-time = "2025-10-16T22:16:32.917Z" },
+    { url = "https://files.pythonhosted.org/packages/90/60/97362554ac21e20e81bcef1150cb2a7e4ffdaf8ea1e5b2e8bf7a053caa18/uvloop-0.22.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e047cc068570bac9866237739607d1313b9253c3051ad84738cbb095be0537b2", size = 4131970, upload-time = "2025-10-16T22:16:34.015Z" },
+    { url = "https://files.pythonhosted.org/packages/99/39/6b3f7d234ba3964c428a6e40006340f53ba37993f46ed6e111c6e9141d18/uvloop-0.22.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:512fec6815e2dd45161054592441ef76c830eddaad55c8aa30952e6fe1ed07c0", size = 4296343, upload-time = "2025-10-16T22:16:35.149Z" },
+]
+
+[[package]]
+name = "vllm"
+version = "0.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohttp" },
+    { name = "anthropic" },
+    { name = "blake3" },
+    { name = "cachetools" },
+    { name = "cbor2" },
+    { name = "cloudpickle" },
+    { name = "compressed-tensors" },
+    { name = "depyf" },
+    { name = "diskcache" },
+    { name = "einops" },
+    { name = "fastapi", extra = ["standard"] },
+    { name = "filelock" },
+    { name = "flashinfer-python" },
+    { name = "gguf" },
+    { name = "ijson" },
+    { name = "lark" },
+    { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'" },
+    { name = "lm-format-enforcer" },
+    { name = "mcp" },
+    { name = "mistral-common", extra = ["image"] },
+    { name = "model-hosting-container-standards" },
+    { name = "msgspec" },
+    { name = "ninja" },
+    { name = "numba" },
+    { name = "numpy" },
+    { name = "openai", version = "2.13.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "openai", version = "2.43.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "openai-harmony" },
+    { name = "opencv-python-headless" },
+    { name = "outlines-core" },
+    { name = "partial-json-parser" },
+    { name = "pillow", version = "11.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pillow", version = "12.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "prometheus-client" },
+    { name = "prometheus-fastapi-instrumentator" },
+    { name = "protobuf", version = "4.25.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "protobuf", version = "5.29.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "psutil" },
+    { name = "py-cpuinfo" },
+    { name = "pybase64" },
+    { name = "pydantic" },
+    { name = "python-json-logger" },
+    { name = "pyyaml" },
+    { name = "pyzmq" },
+    { name = "ray", extra = ["cgraph"] },
+    { name = "regex" },
+    { name = "requests" },
+    { name = "scipy" },
+    { name = "sentencepiece" },
+    { name = "setproctitle" },
+    { name = "setuptools", marker = "python_full_version >= '3.12'" },
+    { name = "six", marker = "python_full_version >= '3.12'" },
+    { name = "tiktoken" },
+    { name = "tokenizers" },
+    { name = "torch" },
+    { name = "torchaudio" },
+    { name = "torchvision" },
+    { name = "tqdm" },
+    { name = "transformers" },
+    { name = "typing-extensions" },
+    { name = "watchfiles" },
+    { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/11/12/b922f96778d07df1c28dfa9a81fbc9706c13c5d0a4e8d154060818a79705/vllm-0.13.0.tar.gz", hash = "sha256:4ad43db45fef37114b550d03a4f423fb3fa3a31d8bc09ee810ef8b9cdcd4b5fe", size = 17828199, upload-time = "2025-12-19T03:30:32.741Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/42/82/e6194ac86862c50e9ff3f58ab3eb63d71604f96723bead2fcc610821197f/vllm-0.13.0-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:464b722c5c5d67a39593ada4a228f7558e860a732cb74a3bfa61c1b442b57581", size = 442031402, upload-time = "2025-12-19T03:31:07.026Z" },
+    { url = "https://files.pythonhosted.org/packages/46/ae/36f87f514811c1389ff1a16e4e5b0b55f25ce782eb0eff2d7eaa92ff7deb/vllm-0.13.0-cp38-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:12b3d0a3b91c32a0091349de64b464f1c3d499a5b3a5d0ec387fef94ed5df6ee", size = 474942618, upload-time = "2025-12-19T03:31:35.593Z" },
+]
+
 [[package]]
 name = "waitress"
 version = "3.0.2"
@@ -4723,6 +6511,62 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/06/7c/34330a89da55610daa5f245ddce5aab81244321101614751e7537f125133/wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c", size = 27880, upload-time = "2024-05-31T16:56:16.699Z" },
 ]
 
+[[package]]
+name = "watchfiles"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cd/41/5e1a4bb12aac5f1493fa1bdc11154eca3b258ca4eba65d39c473fe19d8e9/watchfiles-1.2.0.tar.gz", hash = "sha256:c995fba777f1ea992f090f9236e9284cf7a5d1a0130dd5a3d82c598cacd76838", size = 108252, upload-time = "2026-05-18T04:32:04.251Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0d/5a/2bf22ecb24916983bf1cc0095e7dea2741d14d6553b0d6a2ac8bc96eca93/watchfiles-1.2.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:bb68bf4df85abebe5efddc53cf2075520f243a59868d9b3973278b23e76962a9", size = 400471, upload-time = "2026-05-18T04:31:08.908Z" },
+    { url = "https://files.pythonhosted.org/packages/55/70/dea1f6a0e76607841a60fb51af150e70124864673f61704abb62b90cdcc7/watchfiles-1.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c16cb06dd17d43b9d185094268459eac92c9538356f050e55b54e82cf700e1d4", size = 394599, upload-time = "2026-05-18T04:30:19.845Z" },
+    { url = "https://files.pythonhosted.org/packages/18/52/752dcc7dc817baef5e89518732925795ce52e36a683a9a3c9fb68b21504e/watchfiles-1.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a0feab9af4c021c581f695258c642b3d10c5fd4c676e33a0d8606425d82631", size = 455458, upload-time = "2026-05-18T04:30:29.126Z" },
+    { url = "https://files.pythonhosted.org/packages/12/48/366ebbb22fcc504c2f72b45f0b7e72f40a18795cc01752c16066d597b67a/watchfiles-1.2.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a16ffe19bf5cf9f5edaa1ad1dd830c5a816e8feec430c522302ab55483a4b994", size = 460513, upload-time = "2026-05-18T04:31:40.85Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/44/1f9e1b15e7a729062e0d0c3d0d7225ea4ab98b2267ef87287153be2495fc/watchfiles-1.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:204f299afcbd65918ab78dbc52626b0ae45e9d8cef403fdbf33ecf9e40eac66e", size = 493616, upload-time = "2026-05-18T04:30:58.47Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/55/8b1086dcc8a1d6a697a62767bd7ea368e74c61c6fd171683cfe24a3fe5d2/watchfiles-1.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11743adfa510bfffebe97659fb280182b5c9b238708f667e866f308c3430dc19", size = 573154, upload-time = "2026-05-18T04:30:37.903Z" },
+    { url = "https://files.pythonhosted.org/packages/14/7a/242f400cc77fafa7b18d53d19d9cb64fc6a6f61f28c55913bae7c674d92a/watchfiles-1.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eb72919d93e3a16fc451d3aa3d4b1698423daca1b382d3d959c9ac51297c12a8", size = 467046, upload-time = "2026-05-18T04:30:41.869Z" },
+    { url = "https://files.pythonhosted.org/packages/02/c8/79eee650c62d2c186598489814468e389b5def0ebe755399ff645b35b1b2/watchfiles-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62f042afde2dde21ec1d2c1a74361e804673df86f51e418a999c9acfe671b07", size = 457100, upload-time = "2026-05-18T04:31:13.064Z" },
+    { url = "https://files.pythonhosted.org/packages/81/36/519f6dbb7a95e4fe7c1513ed25b1520295ef9905a27f1f2226a73892bfb7/watchfiles-1.2.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:027ae72bfdfd254862065d8b3e2a815c6ab9b1853ce41e6648ece84afd34a551", size = 467038, upload-time = "2026-05-18T04:30:32.915Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/12/951af6b9f89097e02511122258402cb3578443021930b70cf968d6310dc0/watchfiles-1.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e1cfd51e97e13ff3bd047c140764d277fc9b95b7cb5da59e46a47d167adab310", size = 632563, upload-time = "2026-05-18T04:30:11.539Z" },
+    { url = "https://files.pythonhosted.org/packages/28/cc/0cba1f0a6117b7ec117271bdc3cb3a5a252005959755a2c09a745e0942cc/watchfiles-1.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:24b2405c0a46738dd9e1cf7135aa5dbdb9d42d024628651b3b13d5117e99f8df", size = 660851, upload-time = "2026-05-18T04:31:53.186Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/f2/26347558cc8bf6877845e66b315f644d03c173906aa09e233a3f4fd23928/watchfiles-1.2.0-cp310-cp310-win32.whl", hash = "sha256:8c520725602756229f045b032a1ff33d7ef0f7404189d62f6c2438cb6d8ef6a1", size = 277023, upload-time = "2026-05-18T04:30:18.825Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/68/a5e67b6b68e94f4c1511d61c46c55eba0737583620b6febf194c7b9cc23f/watchfiles-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:03b14855c6f35539e2d95c442ae9530a75762f1e26567152b9ed05f96534a74d", size = 290107, upload-time = "2026-05-18T04:32:09.677Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/3d/8024c801df84d1587740d0359e7fdd80afeae3d159011f3d5376dd82f18e/watchfiles-1.2.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:704fd259e332e01f9b9c178f4bce9e49027e5587cc2600eeeaf8e76e1c846201", size = 400242, upload-time = "2026-05-18T04:31:19.014Z" },
+    { url = "https://files.pythonhosted.org/packages/87/5b/f4dfd45323e949984a3a7f9dc31d1cbb049921e7d98253488dda72ccdaa9/watchfiles-1.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6543cf55d170003296d185c0af981f3e1311564907e1f4e08671fc7693a890a5", size = 394562, upload-time = "2026-05-18T04:30:08.46Z" },
+    { url = "https://files.pythonhosted.org/packages/98/d8/19483ef075d601c409bce8bcbb5c0f81a10876fff870400568f08ce484a1/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89d8c2394a065ca86f5d2910ff263ae67c127e1376ccc4f9fc35c71db879f80a", size = 456611, upload-time = "2026-05-18T04:30:45.723Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/6a/cc81fbe7ee42f2f22e661a6e12def7807e01b14b2f39e0ff83fd373fd307/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:772b80df316480d894a0e3165fdd19cf77f5d17f9a787f94029465ad0e3529d1", size = 461379, upload-time = "2026-05-18T04:31:29.292Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/57/7e669002082c0a0f4fb5113bb70125f7110124b846b0a11bc5ae8e90eac1/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d158cd89df6053823533e06fb1d73c549133bff5f0396170c0e53d9559340717", size = 493556, upload-time = "2026-05-18T04:30:05.44Z" },
+    { url = "https://files.pythonhosted.org/packages/45/7d/f60a2b19807b21fe8281f3a8da4f59eef0d5f96825ac4680ba2d4f2ebf91/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d516b3283a758e087841aedb8031549fb41ced08f3db10aa6d2bf32dc042525b", size = 575255, upload-time = "2026-05-18T04:30:40.568Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/49/77f5b5e6efbcd57482f74948ebb1b97e5c0046d6b61475042d830c84b3ff/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:53b2290c92e0506d102cd448fbc610d87079553f86caa39d67440856a8b8bba5", size = 467052, upload-time = "2026-05-18T04:31:17.942Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/5a/73e2959af1b97fd5d556f9a8bdba017be23ceeef731869d5eaa0a753d5a3/watchfiles-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a711b51aec4370d0dcda5b6c09463206f133a5759341d7744b953a7b62e1100e", size = 456858, upload-time = "2026-05-18T04:30:30.182Z" },
+    { url = "https://files.pythonhosted.org/packages/50/57/1bc8c27fad7e6c19bddee15d276dbb6ab72480ec01c127afff1673aee417/watchfiles-1.2.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:e2ca07fa7d89195ec0865d3d285666286740bfa83d83e5cee204043a31ecc165", size = 467579, upload-time = "2026-05-18T04:32:15.897Z" },
+    { url = "https://files.pythonhosted.org/packages/09/6c/3c2e44edba3553c5e3c3b8c8a2a6dee6b9e12ae2cf4bd2378bebf9dc3038/watchfiles-1.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e0618518f282c4ebff60f5e5b1247b6d91bb8b9f4476947563a1e74acc66f3c6", size = 633253, upload-time = "2026-05-18T04:31:37.123Z" },
+    { url = "https://files.pythonhosted.org/packages/30/c2/d8c84a882ab39bbefcc4915ab3e91830b7a7e990c5570b0b69075aba3faf/watchfiles-1.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0d191c054d0715c3c95c99df9b8dbf6fd096d8c1e021e8f212e1bd8bc444ccb5", size = 660713, upload-time = "2026-05-18T04:31:24.62Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/07/f97736a5fc605364fe67b25e9fa4a6965dfd4840d50c406ada507e9d735f/watchfiles-1.2.0-cp311-cp311-win32.whl", hash = "sha256:9342472aff9b093c5acd4f6d8f70ae0937964ab56542502bcf5579782da69ae8", size = 277222, upload-time = "2026-05-18T04:31:21.131Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/99/2b04981977fc2608afd60360d928c6aecf6b950292ca221d98f4005f6694/watchfiles-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:dbd6c97045dad81227c8d040173da044c1de08de64a5ea8b555da4aee1d5fa22", size = 290274, upload-time = "2026-05-18T04:31:45.966Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/74/f7f58a7075ee9cf612b0cfcddb78b8cd8234f0742d6f0075cf0da2dde1c6/watchfiles-1.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:57a2d9fa4fb4c2ecae57b13dfff2c7ab53e21a2ba674fe9f05506680fcdcc0d7", size = 283460, upload-time = "2026-05-18T04:31:39.126Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/2f/e42c992d2afda3108ea1c02acecc991b9f31d05c14adc2a7cee9ee211fc4/watchfiles-1.2.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:bc13eb17538be00c874699dc0abe4ee2bc8d50bb1166a6b9e175ef3fd7eb8f26", size = 400115, upload-time = "2026-05-18T04:32:02.06Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/8f/6af2ea19065c91d8b0ea3516fdfc8c0d349f407e8e9fbf4e5a17360de8ad/watchfiles-1.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2d95ddc1eb6914154253d239089900813f6a767e174b8e6a50e7fdacb7e4236c", size = 393659, upload-time = "2026-05-18T04:30:50.951Z" },
+    { url = "https://files.pythonhosted.org/packages/13/01/b32a967c56fb3e3e5be3db52c3d3b87fa4513aa367d8ed1ad96d42952e5f/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f70d8b291ef6e88d19b1f297a6905ddb978888d9272b0d05e6f53309856bcfc", size = 453207, upload-time = "2026-05-18T04:31:04.231Z" },
+    { url = "https://files.pythonhosted.org/packages/04/98/97557a812180338cb1abd32e1cffcc4588f59b5f23e0cb006b2ba95ba64a/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:56d8641cf834c2836922899105bd3ce3d0dfc69291d52edf0b4d0436829b34c0", size = 459273, upload-time = "2026-05-18T04:31:50.377Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/a8/b4b08dcb7653b8087c6586f7ce649505900e866bbcfe40dc9587af02e686/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2581a94056e55d7d0a31a823ea92bf73749c489ca2285bfdc0fbe6b2bb49d50c", size = 489927, upload-time = "2026-05-18T04:31:42.485Z" },
+    { url = "https://files.pythonhosted.org/packages/50/94/3dceea03545d2e5ddfd839f0ddd5e1cecbf1697b5a428d5ba11cef6af95d/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:41bc1199f7523b3f82843c88cbb979180c949caef0342cf90968f178e5d49b01", size = 570476, upload-time = "2026-05-18T04:31:03.071Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/f2/d39a5450c3532092b91f81d274360e613c2371bc874a89c7a1a3c5e8d138/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7571e4464cb6e434958f867f7f730b8ab0b75e3f8e5eac0499168486ab3c33a8", size = 465650, upload-time = "2026-05-18T04:30:12.701Z" },
+    { url = "https://files.pythonhosted.org/packages/22/24/ed72f68cbc1333ca9b9f2200aa048bb6658ae41709bc1caad4310f4bdffd/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e53a384f76b631c3ae5334ce6a52f0baa3a911eb94a4eac7f160079868b716d5", size = 456398, upload-time = "2026-05-18T04:30:13.784Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/64/982ef4a4e5bab5b6e5b6becc8cd5e732f6130a78b855f0abec6439a9a135/watchfiles-1.2.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:d20029a60a71a052a24c4db7673bc4de39ab89adbaccbfb5d67987c5d73f424d", size = 465140, upload-time = "2026-05-18T04:31:52.111Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/0c/95282abf4ed680b6096010bcfc30c5fa7a041fc5aa5a2ad17a2cc6c75bba/watchfiles-1.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2cb93af48550faf1cea04c303107c8b75833de7013e57ce27d3b8d21d8d0f58c", size = 630259, upload-time = "2026-05-18T04:31:25.676Z" },
+    { url = "https://files.pythonhosted.org/packages/30/45/607c1de1530c4bdcf2cf1d1ecc2505ddba5d96bd43ba9f2b0e79876f850f/watchfiles-1.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2995c176de7692b86a2e4c58d9ec718f753150a979cb4a754e2b4ffa38e70906", size = 659859, upload-time = "2026-05-18T04:30:24.333Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/08/d9e2e0f9e8e6791d33aefc694ad7eefa7f901f63caff84a81ded38692f9c/watchfiles-1.2.0-cp312-cp312-win32.whl", hash = "sha256:7a2cffd17d27d2ecbb310c2b1d8174f222a5495b1a721894afa88ec11e25b898", size = 275480, upload-time = "2026-05-18T04:30:31.307Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/e6/9d42569c0102645cc8cea5d8c7d8a1e9d4ada2cb7f05f75e554b8aa2202a/watchfiles-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:f155b3a1b2a5fc89cdc70d47ee5d54e3b75e88efa34982028a35daef9ba00379", size = 288718, upload-time = "2026-05-18T04:32:10.745Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/26/88e0dc6ee3898169d7fa22bb6a69cabf2502d2ee25cb8c876d1262d204f8/watchfiles-1.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:8fa585ede612ee9f9e91b18bebf9ba11b9ae29a4e3a0d0cf6fca3e382133f0d5", size = 281026, upload-time = "2026-05-18T04:30:22.23Z" },
+    { url = "https://files.pythonhosted.org/packages/23/f4/7513ef1e85fc4c6331b59479d6d72661fc391fbe543678052ac72c8b6c19/watchfiles-1.2.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:4674d49eb94706dfe666c069fc0a1b646ffcf920473492e209f6d5f60d3f0cc2", size = 403050, upload-time = "2026-05-18T04:30:36.753Z" },
+    { url = "https://files.pythonhosted.org/packages/27/0b/a54103cfd732bb703c7a749222011a0483ef3705948dae3b203158601119/watchfiles-1.2.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:094b9b70103d4e963499bdea001ee3c2697b144cd9ae6218a62c0f89ec9e31db", size = 396629, upload-time = "2026-05-18T04:32:03.268Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/2c/73f31a3b893886206c3f54d73e8ad8dee58cdb2f69ad2622e0a8a9e07f4e/watchfiles-1.2.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0ef001f8c25ad0fa9529f914c1600647ecd0f542d11c19b7894768c67b6acb7", size = 457318, upload-time = "2026-05-18T04:31:01.932Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/f9/45d021e4a5cc7b9dd567f7cbb06d3b75f751a690063fb6cc7ec60f4e46b7/watchfiles-1.2.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a88fc94e647bc4eec523f1caa540258eb71d14278b9daf72fa1e2658a98df0f0", size = 457771, upload-time = "2026-05-18T04:30:56.331Z" },
+]
+
 [[package]]
 name = "weasel"
 version = "0.4.3"
@@ -4856,6 +6700,39 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/15/d1/b51471c11592ff9c012bd3e2f7334a6ff2f42a7aed2caffcf0bdddc9cb89/wrapt-2.0.1-py3-none-any.whl", hash = "sha256:4d2ce1bf1a48c5277d7969259232b57645aae5686dba1eaeade39442277afbca", size = 44046, upload-time = "2025-11-07T00:45:32.116Z" },
 ]
 
+[[package]]
+name = "xgrammar"
+version = "0.1.27"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mlx-lm", marker = "platform_machine == 'arm64' and sys_platform == 'darwin'" },
+    { name = "ninja" },
+    { name = "numpy" },
+    { name = "pydantic" },
+    { name = "torch" },
+    { name = "transformers" },
+    { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/62/e1/b522b1e50fddd773d368c2945ef5ed628aa90c0c972027f9aa5a51d6d4f9/xgrammar-0.1.27.tar.gz", hash = "sha256:40af7bb2891f1633ec7f660723c74a92a963307d283aca9e3b4e53a0feaf1d46", size = 2303435, upload-time = "2025-11-04T03:11:53.512Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b0/55/03a548a116fa5cc716ce70e15240ca61ddaae046ed34c711e63d3d91d047/xgrammar-0.1.27-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:6e3ea7cd74a7d4188744f90878507637ce9ac5f671cb9d1bda9d53305a46889e", size = 664256, upload-time = "2025-11-04T03:11:08.471Z" },
+    { url = "https://files.pythonhosted.org/packages/84/a5/45a430a7fb44f70303742c59e7d792ca6d4b7960e9252ec5238f1112bbcd/xgrammar-0.1.27-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:73ca9ec86e81a7f936c5668b7e6dda6929c078d1748b7615c8da504584b6c24a", size = 637358, upload-time = "2025-11-04T03:11:10.975Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/2b/3867379f76b97fb7cb03bc0fa1d0f81f19d13df3c313bd22878b636b0f50/xgrammar-0.1.27-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa8b7cc167737a9b4c3e1012faa7b488cc5b451ea8403c4d77ec1d53b58e9266", size = 8687578, upload-time = "2025-11-04T03:11:13.304Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/35/fe718ec90c210ab892a845af6d4e6e876a3d3c7dcc1bacaa98abfec42c0f/xgrammar-0.1.27-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e0c5899b59c8e45ba3a6f3b9e7fb2ef23243f09b164f724d59c7734173bb3db", size = 8869161, upload-time = "2025-11-04T03:11:15.572Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/44/674724714407e0265d088ad40a17d367c00d72d206f2b15d559a644a36dc/xgrammar-0.1.27-cp310-cp310-win_amd64.whl", hash = "sha256:1ce2558992b0ffda65f46772bae94b051d139f0036968853078904bc167d4a8d", size = 709212, upload-time = "2025-11-04T03:11:17.572Z" },
+    { url = "https://files.pythonhosted.org/packages/93/bb/e6d30457c99a0ce11247154ecb1f3f9fab5960192a0564c2862ba9b98897/xgrammar-0.1.27-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c995c71ea94b153eac0e08c36eb82a898d7d71e4b77ce93f3b9fe648bd2d3a04", size = 664112, upload-time = "2025-11-04T03:11:18.932Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/81/caab5c46d314c1b005e36c9ec8aef124f7c52619d980f2bbd2d4cf4cd491/xgrammar-0.1.27-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:456f2f74135a414f44413599d90a382f5b22e6b515e4ae7e8938a28f7efacbaa", size = 637181, upload-time = "2025-11-04T03:11:20.29Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/29/7f78ed69b5f221206af0b68b0517335f9c09459def5d63065827a79fec74/xgrammar-0.1.27-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed23e6960218e791ecaccbbbb66d7caa5c0ed8636aca85807d81b89ba87a7f33", size = 8674617, upload-time = "2025-11-04T03:11:22.255Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/a2/afcce6a59b83644ffe19ffebe8107355febb15d8084ce5316eccd457e3c8/xgrammar-0.1.27-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02fe3b137d041649b8f7a180a0aa7f3466d47579ce4e9fbdb77208b59621b2ab", size = 8869958, upload-time = "2025-11-04T03:11:24.751Z" },
+    { url = "https://files.pythonhosted.org/packages/76/fb/a4a3254041174013ff09e99c298f2bc6c03f34891df458839de7cbb53e4b/xgrammar-0.1.27-cp311-cp311-win_amd64.whl", hash = "sha256:db0c74f7cc4fb2b5d566eee873e4d18920ed5ee0fe500178b412408d0dad3686", size = 709137, upload-time = "2025-11-04T03:11:26.672Z" },
+    { url = "https://files.pythonhosted.org/packages/39/b6/09b43e2adff45d30ebcf9110d0ff753f4c96b368adaa2d166df3dee88d5f/xgrammar-0.1.27-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:6404a7714440eb86ab0379d749f33591274eeef04787dc00d61f22069f3ed51d", size = 663319, upload-time = "2025-11-04T03:11:28.682Z" },
+    { url = "https://files.pythonhosted.org/packages/88/8b/53eb5c6d0df8df9f6350f182516a5b8c7b8b11d62650300d2c04af2bc4ea/xgrammar-0.1.27-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d01fa9894bc44a7f6a70b0301b59f3e310c0e0e7b7ea4cf5ce190b12d8220dd8", size = 636168, upload-time = "2025-11-04T03:11:30.373Z" },
+    { url = "https://files.pythonhosted.org/packages/08/1b/53d30395bb973f13255d3e3a72961f95fdfb4083877c3f93bb626e3d1522/xgrammar-0.1.27-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:906c0601bac9170e1bab77ca985259035ff9c386c347efcb191555eab86e984e", size = 8676340, upload-time = "2025-11-04T03:11:32.203Z" },
+    { url = "https://files.pythonhosted.org/packages/48/74/70cfac0171d9f309cfe18c5384330e3edc9466c436b258495fd30ecf29a3/xgrammar-0.1.27-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb68988a122f544301c496f2cac8ee82960ca7f5b3a42a952b2a00c0a55e6ca5", size = 8870650, upload-time = "2025-11-04T03:11:34.322Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/a1/0392aa9c7669c56f7f88e4423b246476a74a72c3bb9db944e1bfc029985e/xgrammar-0.1.27-cp312-cp312-win_amd64.whl", hash = "sha256:3aac335ea052afc8f8dc34b9f2afcb9462a68189423aed9f60b0941db6cfc310", size = 708811, upload-time = "2025-11-04T03:11:36.214Z" },
+]
+
 [[package]]
 name = "xxhash"
 version = "3.6.0"
@@ -5019,3 +6896,61 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/96/fe/7d0b5c0692b283901b34847f2b2f50d805bfff4b31de4021ac9dfb516d2a/zope_interface-8.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eee6f93b2512ec9466cf30c37548fd3ed7bc4436ab29cd5943d7a0b561f14f0f", size = 264281, upload-time = "2025-11-15T08:36:58.968Z" },
     { url = "https://files.pythonhosted.org/packages/2b/2c/a7cebede1cf2757be158bcb151fe533fa951038cfc5007c7597f9f86804b/zope_interface-8.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:80edee6116d569883c58ff8efcecac3b737733d646802036dc337aa839a5f06b", size = 212327, upload-time = "2025-11-15T08:37:00.4Z" },
 ]
+
+[[package]]
+name = "zstandard"
+version = "0.25.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/56/7a/28efd1d371f1acd037ac64ed1c5e2b41514a6cc937dd6ab6a13ab9f0702f/zstandard-0.25.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e59fdc271772f6686e01e1b3b74537259800f57e24280be3f29c8a0deb1904dd", size = 795256, upload-time = "2025-09-14T22:15:56.415Z" },
+    { url = "https://files.pythonhosted.org/packages/96/34/ef34ef77f1ee38fc8e4f9775217a613b452916e633c4f1d98f31db52c4a5/zstandard-0.25.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4d441506e9b372386a5271c64125f72d5df6d2a8e8a2a45a0ae09b03cb781ef7", size = 640565, upload-time = "2025-09-14T22:15:58.177Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/1b/4fdb2c12eb58f31f28c4d28e8dc36611dd7205df8452e63f52fb6261d13e/zstandard-0.25.0-cp310-cp310-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:ab85470ab54c2cb96e176f40342d9ed41e58ca5733be6a893b730e7af9c40550", size = 5345306, upload-time = "2025-09-14T22:16:00.165Z" },
+    { url = "https://files.pythonhosted.org/packages/73/28/a44bdece01bca027b079f0e00be3b6bd89a4df180071da59a3dd7381665b/zstandard-0.25.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e05ab82ea7753354bb054b92e2f288afb750e6b439ff6ca78af52939ebbc476d", size = 5055561, upload-time = "2025-09-14T22:16:02.22Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/74/68341185a4f32b274e0fc3410d5ad0750497e1acc20bd0f5b5f64ce17785/zstandard-0.25.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:78228d8a6a1c177a96b94f7e2e8d012c55f9c760761980da16ae7546a15a8e9b", size = 5402214, upload-time = "2025-09-14T22:16:04.109Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/67/f92e64e748fd6aaffe01e2b75a083c0c4fd27abe1c8747fee4555fcee7dd/zstandard-0.25.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:2b6bd67528ee8b5c5f10255735abc21aa106931f0dbaf297c7be0c886353c3d0", size = 5449703, upload-time = "2025-09-14T22:16:06.312Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/e5/6d36f92a197c3c17729a2125e29c169f460538a7d939a27eaaa6dcfcba8e/zstandard-0.25.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4b6d83057e713ff235a12e73916b6d356e3084fd3d14ced499d84240f3eecee0", size = 5556583, upload-time = "2025-09-14T22:16:08.457Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/83/41939e60d8d7ebfe2b747be022d0806953799140a702b90ffe214d557638/zstandard-0.25.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9174f4ed06f790a6869b41cba05b43eeb9a35f8993c4422ab853b705e8112bbd", size = 5045332, upload-time = "2025-09-14T22:16:10.444Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/87/d3ee185e3d1aa0133399893697ae91f221fda79deb61adbe998a7235c43f/zstandard-0.25.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:25f8f3cd45087d089aef5ba3848cd9efe3ad41163d3400862fb42f81a3a46701", size = 5572283, upload-time = "2025-09-14T22:16:12.128Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/1d/58635ae6104df96671076ac7d4ae7816838ce7debd94aecf83e30b7121b0/zstandard-0.25.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3756b3e9da9b83da1796f8809dd57cb024f838b9eeafde28f3cb472012797ac1", size = 4959754, upload-time = "2025-09-14T22:16:14.225Z" },
+    { url = "https://files.pythonhosted.org/packages/75/d6/57e9cb0a9983e9a229dd8fd2e6e96593ef2aa82a3907188436f22b111ccd/zstandard-0.25.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:81dad8d145d8fd981b2962b686b2241d3a1ea07733e76a2f15435dfb7fb60150", size = 5266477, upload-time = "2025-09-14T22:16:16.343Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/a9/ee891e5edf33a6ebce0a028726f0bbd8567effe20fe3d5808c42323e8542/zstandard-0.25.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a5a419712cf88862a45a23def0ae063686db3d324cec7edbe40509d1a79a0aab", size = 5440914, upload-time = "2025-09-14T22:16:18.453Z" },
+    { url = "https://files.pythonhosted.org/packages/58/08/a8522c28c08031a9521f27abc6f78dbdee7312a7463dd2cfc658b813323b/zstandard-0.25.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e7360eae90809efd19b886e59a09dad07da4ca9ba096752e61a2e03c8aca188e", size = 5819847, upload-time = "2025-09-14T22:16:20.559Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/11/4c91411805c3f7b6f31c60e78ce347ca48f6f16d552fc659af6ec3b73202/zstandard-0.25.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:75ffc32a569fb049499e63ce68c743155477610532da1eb38e7f24bf7cd29e74", size = 5363131, upload-time = "2025-09-14T22:16:22.206Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/d6/8c4bd38a3b24c4c7676a7a3d8de85d6ee7a983602a734b9f9cdefb04a5d6/zstandard-0.25.0-cp310-cp310-win32.whl", hash = "sha256:106281ae350e494f4ac8a80470e66d1fe27e497052c8d9c3b95dc4cf1ade81aa", size = 436469, upload-time = "2025-09-14T22:16:25.002Z" },
+    { url = "https://files.pythonhosted.org/packages/93/90/96d50ad417a8ace5f841b3228e93d1bb13e6ad356737f42e2dde30d8bd68/zstandard-0.25.0-cp310-cp310-win_amd64.whl", hash = "sha256:ea9d54cc3d8064260114a0bbf3479fc4a98b21dffc89b3459edd506b69262f6e", size = 506100, upload-time = "2025-09-14T22:16:23.569Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/83/c3ca27c363d104980f1c9cee1101cc8ba724ac8c28a033ede6aab89585b1/zstandard-0.25.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:933b65d7680ea337180733cf9e87293cc5500cc0eb3fc8769f4d3c88d724ec5c", size = 795254, upload-time = "2025-09-14T22:16:26.137Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/4d/e66465c5411a7cf4866aeadc7d108081d8ceba9bc7abe6b14aa21c671ec3/zstandard-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3f79487c687b1fc69f19e487cd949bf3aae653d181dfb5fde3bf6d18894706f", size = 640559, upload-time = "2025-09-14T22:16:27.973Z" },
+    { url = "https://files.pythonhosted.org/packages/12/56/354fe655905f290d3b147b33fe946b0f27e791e4b50a5f004c802cb3eb7b/zstandard-0.25.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:0bbc9a0c65ce0eea3c34a691e3c4b6889f5f3909ba4822ab385fab9057099431", size = 5348020, upload-time = "2025-09-14T22:16:29.523Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/13/2b7ed68bd85e69a2069bcc72141d378f22cae5a0f3b353a2c8f50ef30c1b/zstandard-0.25.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01582723b3ccd6939ab7b3a78622c573799d5d8737b534b86d0e06ac18dbde4a", size = 5058126, upload-time = "2025-09-14T22:16:31.811Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/dd/fdaf0674f4b10d92cb120ccff58bbb6626bf8368f00ebfd2a41ba4a0dc99/zstandard-0.25.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5f1ad7bf88535edcf30038f6919abe087f606f62c00a87d7e33e7fc57cb69fcc", size = 5405390, upload-time = "2025-09-14T22:16:33.486Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/67/354d1555575bc2490435f90d67ca4dd65238ff2f119f30f72d5cde09c2ad/zstandard-0.25.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:06acb75eebeedb77b69048031282737717a63e71e4ae3f77cc0c3b9508320df6", size = 5452914, upload-time = "2025-09-14T22:16:35.277Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/1f/e9cfd801a3f9190bf3e759c422bbfd2247db9d7f3d54a56ecde70137791a/zstandard-0.25.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9300d02ea7c6506f00e627e287e0492a5eb0371ec1670ae852fefffa6164b072", size = 5559635, upload-time = "2025-09-14T22:16:37.141Z" },
+    { url = "https://files.pythonhosted.org/packages/21/88/5ba550f797ca953a52d708c8e4f380959e7e3280af029e38fbf47b55916e/zstandard-0.25.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfd06b1c5584b657a2892a6014c2f4c20e0db0208c159148fa78c65f7e0b0277", size = 5048277, upload-time = "2025-09-14T22:16:38.807Z" },
+    { url = "https://files.pythonhosted.org/packages/46/c0/ca3e533b4fa03112facbe7fbe7779cb1ebec215688e5df576fe5429172e0/zstandard-0.25.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f373da2c1757bb7f1acaf09369cdc1d51d84131e50d5fa9863982fd626466313", size = 5574377, upload-time = "2025-09-14T22:16:40.523Z" },
+    { url = "https://files.pythonhosted.org/packages/12/9b/3fb626390113f272abd0799fd677ea33d5fc3ec185e62e6be534493c4b60/zstandard-0.25.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c0e5a65158a7946e7a7affa6418878ef97ab66636f13353b8502d7ea03c8097", size = 4961493, upload-time = "2025-09-14T22:16:43.3Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/d3/23094a6b6a4b1343b27ae68249daa17ae0651fcfec9ed4de09d14b940285/zstandard-0.25.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c8e167d5adf59476fa3e37bee730890e389410c354771a62e3c076c86f9f7778", size = 5269018, upload-time = "2025-09-14T22:16:45.292Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/a7/bb5a0c1c0f3f4b5e9d5b55198e39de91e04ba7c205cc46fcb0f95f0383c1/zstandard-0.25.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:98750a309eb2f020da61e727de7d7ba3c57c97cf6213f6f6277bb7fb42a8e065", size = 5443672, upload-time = "2025-09-14T22:16:47.076Z" },
+    { url = "https://files.pythonhosted.org/packages/27/22/503347aa08d073993f25109c36c8d9f029c7d5949198050962cb568dfa5e/zstandard-0.25.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:22a086cff1b6ceca18a8dd6096ec631e430e93a8e70a9ca5efa7561a00f826fa", size = 5822753, upload-time = "2025-09-14T22:16:49.316Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/be/94267dc6ee64f0f8ba2b2ae7c7a2df934a816baaa7291db9e1aa77394c3c/zstandard-0.25.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72d35d7aa0bba323965da807a462b0966c91608ef3a48ba761678cb20ce5d8b7", size = 5366047, upload-time = "2025-09-14T22:16:51.328Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/a3/732893eab0a3a7aecff8b99052fecf9f605cf0fb5fb6d0290e36beee47a4/zstandard-0.25.0-cp311-cp311-win32.whl", hash = "sha256:f5aeea11ded7320a84dcdd62a3d95b5186834224a9e55b92ccae35d21a8b63d4", size = 436484, upload-time = "2025-09-14T22:16:55.005Z" },
+    { url = "https://files.pythonhosted.org/packages/43/a3/c6155f5c1cce691cb80dfd38627046e50af3ee9ddc5d0b45b9b063bfb8c9/zstandard-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:daab68faadb847063d0c56f361a289c4f268706b598afbf9ad113cbe5c38b6b2", size = 506183, upload-time = "2025-09-14T22:16:52.753Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/3e/8945ab86a0820cc0e0cdbf38086a92868a9172020fdab8a03ac19662b0e5/zstandard-0.25.0-cp311-cp311-win_arm64.whl", hash = "sha256:22a06c5df3751bb7dc67406f5374734ccee8ed37fc5981bf1ad7041831fa1137", size = 462533, upload-time = "2025-09-14T22:16:53.878Z" },
+    { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738, upload-time = "2025-09-14T22:16:56.237Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436, upload-time = "2025-09-14T22:16:57.774Z" },
+    { url = "https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019, upload-time = "2025-09-14T22:16:59.302Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012, upload-time = "2025-09-14T22:17:01.156Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148, upload-time = "2025-09-14T22:17:03.091Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/79/2bf870b3abeb5c070fe2d670a5a8d1057a8270f125ef7676d29ea900f496/zstandard-0.25.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:6a573a35693e03cf1d67799fd01b50ff578515a8aeadd4595d2a7fa9f3ec002a", size = 5451652, upload-time = "2025-09-14T22:17:04.979Z" },
+    { url = "https://files.pythonhosted.org/packages/53/60/7be26e610767316c028a2cbedb9a3beabdbe33e2182c373f71a1c0b88f36/zstandard-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5a56ba0db2d244117ed744dfa8f6f5b366e14148e00de44723413b2f3938a902", size = 5546993, upload-time = "2025-09-14T22:17:06.781Z" },
+    { url = "https://files.pythonhosted.org/packages/85/c7/3483ad9ff0662623f3648479b0380d2de5510abf00990468c286c6b04017/zstandard-0.25.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:10ef2a79ab8e2974e2075fb984e5b9806c64134810fac21576f0668e7ea19f8f", size = 5046806, upload-time = "2025-09-14T22:17:08.415Z" },
+    { url = "https://files.pythonhosted.org/packages/08/b3/206883dd25b8d1591a1caa44b54c2aad84badccf2f1de9e2d60a446f9a25/zstandard-0.25.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aaf21ba8fb76d102b696781bddaa0954b782536446083ae3fdaa6f16b25a1c4b", size = 5576659, upload-time = "2025-09-14T22:17:10.164Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/31/76c0779101453e6c117b0ff22565865c54f48f8bd807df2b00c2c404b8e0/zstandard-0.25.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1869da9571d5e94a85a5e8d57e4e8807b175c9e4a6294e3b66fa4efb074d90f6", size = 4953933, upload-time = "2025-09-14T22:17:11.857Z" },
+    { url = "https://files.pythonhosted.org/packages/18/e1/97680c664a1bf9a247a280a053d98e251424af51f1b196c6d52f117c9720/zstandard-0.25.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:809c5bcb2c67cd0ed81e9229d227d4ca28f82d0f778fc5fea624a9def3963f91", size = 5268008, upload-time = "2025-09-14T22:17:13.627Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517, upload-time = "2025-09-14T22:17:16.103Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292, upload-time = "2025-09-14T22:17:17.827Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237, upload-time = "2025-09-14T22:17:19.954Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922, upload-time = "2025-09-14T22:17:24.398Z" },
+    { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276, upload-time = "2025-09-14T22:17:21.429Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679, upload-time = "2025-09-14T22:17:23.147Z" },
+]