diff --git a/src/graphn/_generated/api/custom_models/get_supported_architectures.py b/src/graphn/_generated/api/custom_models/get_supported_architectures.py new file mode 100644 index 0000000..3d05d62 --- /dev/null +++ b/src/graphn/_generated/api/custom_models/get_supported_architectures.py @@ -0,0 +1,202 @@ +from http import HTTPStatus +from typing import Any +from urllib.parse import quote + +import httpx + +from ... import errors +from ...client import AuthenticatedClient, Client +from ...models.error import Error +from ...models.supported_architectures import SupportedArchitectures +from ...types import Response + + +def _get_kwargs( + workspace_id: str, +) -> dict[str, Any]: + + _kwargs: dict[str, Any] = { + "method": "get", + "url": "/v1/{workspace_id}/custom-models/supported-architectures".format( + workspace_id=quote(str(workspace_id), safe=""), + ), + } + + return _kwargs + + +def _parse_response( + *, client: AuthenticatedClient | Client, response: httpx.Response +) -> Error | SupportedArchitectures | None: + if response.status_code == 200: + response_200 = SupportedArchitectures.from_dict(response.json()) + + return response_200 + + if response.status_code == 401: + response_401 = Error.from_dict(response.json()) + + return response_401 + + if response.status_code == 403: + response_403 = Error.from_dict(response.json()) + + return response_403 + + if client.raise_on_unexpected_status: + raise errors.UnexpectedStatus(response.status_code, response.content) + else: + return None + + +def _build_response( + *, client: AuthenticatedClient | Client, response: httpx.Response +) -> Response[Error | SupportedArchitectures]: + return Response( + status_code=HTTPStatus(response.status_code), + content=response.content, + headers=response.headers, + parsed=_parse_response(client=client, response=response), + ) + + +def sync_detailed( + workspace_id: str, + *, + client: AuthenticatedClient | Client, +) -> Response[Error | SupportedArchitectures]: + """List model 
architectures supported for custom-model import + + Returns the static list of HuggingFace model architectures the + platform's serving runtimes can deploy, alongside the capability + tags (`tool_calling`, `vision`, etc.) each architecture exposes. + Use this to drive the UI's architecture/capability filters before + calling `validateCustomModel`. + + The list is updated alongside platform runtime upgrades; clients + should not cache it for more than a build cycle. + + Args: + workspace_id (str): + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + Response[Error | SupportedArchitectures] + """ + + kwargs = _get_kwargs( + workspace_id=workspace_id, + ) + + response = client.get_httpx_client().request( + **kwargs, + ) + + return _build_response(client=client, response=response) + + +def sync( + workspace_id: str, + *, + client: AuthenticatedClient | Client, +) -> Error | SupportedArchitectures | None: + """List model architectures supported for custom-model import + + Returns the static list of HuggingFace model architectures the + platform's serving runtimes can deploy, alongside the capability + tags (`tool_calling`, `vision`, etc.) each architecture exposes. + Use this to drive the UI's architecture/capability filters before + calling `validateCustomModel`. + + The list is updated alongside platform runtime upgrades; clients + should not cache it for more than a build cycle. + + Args: + workspace_id (str): + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. 
+ + Returns: + Error | SupportedArchitectures | None + """ + + return sync_detailed( + workspace_id=workspace_id, + client=client, + ).parsed + + +async def asyncio_detailed( + workspace_id: str, + *, + client: AuthenticatedClient | Client, +) -> Response[Error | SupportedArchitectures]: + """List model architectures supported for custom-model import + + Returns the static list of HuggingFace model architectures the + platform's serving runtimes can deploy, alongside the capability + tags (`tool_calling`, `vision`, etc.) each architecture exposes. + Use this to drive the UI's architecture/capability filters before + calling `validateCustomModel`. + + The list is updated alongside platform runtime upgrades; clients + should not cache it for more than a build cycle. + + Args: + workspace_id (str): + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + Response[Error | SupportedArchitectures] + """ + + kwargs = _get_kwargs( + workspace_id=workspace_id, + ) + + response = await client.get_async_httpx_client().request(**kwargs) + + return _build_response(client=client, response=response) + + +async def asyncio( + workspace_id: str, + *, + client: AuthenticatedClient | Client, +) -> Error | SupportedArchitectures | None: + """List model architectures supported for custom-model import + + Returns the static list of HuggingFace model architectures the + platform's serving runtimes can deploy, alongside the capability + tags (`tool_calling`, `vision`, etc.) each architecture exposes. + Use this to drive the UI's architecture/capability filters before + calling `validateCustomModel`. + + The list is updated alongside platform runtime upgrades; clients + should not cache it for more than a build cycle. 
+ + Args: + workspace_id (str): + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + Error | SupportedArchitectures | None + """ + + return ( + await asyncio_detailed( + workspace_id=workspace_id, + client=client, + ) + ).parsed diff --git a/src/graphn/_generated/api/custom_models/update_custom_model.py b/src/graphn/_generated/api/custom_models/update_custom_model.py new file mode 100644 index 0000000..d56a354 --- /dev/null +++ b/src/graphn/_generated/api/custom_models/update_custom_model.py @@ -0,0 +1,311 @@ +from http import HTTPStatus +from typing import Any +from urllib.parse import quote + +import httpx + +from ... import errors +from ...client import AuthenticatedClient, Client +from ...models.custom_model import CustomModel +from ...models.custom_model_update import CustomModelUpdate +from ...models.error import Error +from ...types import Response + + +def _get_kwargs( + workspace_id: str, + model_id: str, + *, + body: CustomModelUpdate, +) -> dict[str, Any]: + headers: dict[str, Any] = {} + + _kwargs: dict[str, Any] = { + "method": "patch", + "url": "/v1/{workspace_id}/custom-models/{model_id}".format( + workspace_id=quote(str(workspace_id), safe=""), + model_id=quote(str(model_id), safe=""), + ), + } + + _kwargs["json"] = body.to_dict() + + headers["Content-Type"] = "application/json" + + _kwargs["headers"] = headers + return _kwargs + + +def _parse_response( + *, client: AuthenticatedClient | Client, response: httpx.Response +) -> CustomModel | Error | None: + if response.status_code == 200: + response_200 = CustomModel.from_dict(response.json()) + + return response_200 + + if response.status_code == 400: + response_400 = Error.from_dict(response.json()) + + return response_400 + + if response.status_code == 401: + response_401 = Error.from_dict(response.json()) + + return response_401 + + 
if response.status_code == 403: + response_403 = Error.from_dict(response.json()) + + return response_403 + + if response.status_code == 404: + response_404 = Error.from_dict(response.json()) + + return response_404 + + if client.raise_on_unexpected_status: + raise errors.UnexpectedStatus(response.status_code, response.content) + else: + return None + + +def _build_response( + *, client: AuthenticatedClient | Client, response: httpx.Response +) -> Response[CustomModel | Error]: + return Response( + status_code=HTTPStatus(response.status_code), + content=response.content, + headers=response.headers, + parsed=_parse_response(client=client, response=response), + ) + + +def sync_detailed( + workspace_id: str, + model_id: str, + *, + client: AuthenticatedClient | Client, + body: CustomModelUpdate, +) -> Response[CustomModel | Error]: + """Update mutable fields on a custom model + + Update the model record (cp-side) and propagate the change to the + live deployment when applicable. Only a small set of fields are + mutable post-create; immutable fields (`huggingface_model_id`, + `weight_source`, GPU topology, …) are intentionally not exposed + here — change them by deleting and re-creating the model. + + Today the only fields that take effect immediately on the live + deployment are `min_replicas` and `max_replicas`: cp updates the + DynamoDB record, then issues an AF-side in-place patch of the + underlying InferenceService's predictor scale bounds (no rolling + restart, no downtime). KServe propagates `max_replicas` through + to the KEDA `ScaledObject`'s `maxReplicaCount` on its next + reconcile. + + Other fields update the model record only and take effect on the + next deployment. + + Args: + workspace_id (str): + model_id (str): + body (CustomModelUpdate): Partial-update payload for `PATCH + /v1/{workspaceId}/custom-models/{modelId}`. + All fields are independently optional; omitted fields are left + unchanged. At least one field MUST be supplied. 
+ + Only a small, vetted set of fields are mutable post-create. + Immutable fields (`huggingface_model_id`, `weight_source`, + GPU topology, …) are not exposed here — change them by + deleting and re-creating the model. + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + Response[CustomModel | Error] + """ + + kwargs = _get_kwargs( + workspace_id=workspace_id, + model_id=model_id, + body=body, + ) + + response = client.get_httpx_client().request( + **kwargs, + ) + + return _build_response(client=client, response=response) + + +def sync( + workspace_id: str, + model_id: str, + *, + client: AuthenticatedClient | Client, + body: CustomModelUpdate, +) -> CustomModel | Error | None: + """Update mutable fields on a custom model + + Update the model record (cp-side) and propagate the change to the + live deployment when applicable. Only a small set of fields are + mutable post-create; immutable fields (`huggingface_model_id`, + `weight_source`, GPU topology, …) are intentionally not exposed + here — change them by deleting and re-creating the model. + + Today the only fields that take effect immediately on the live + deployment are `min_replicas` and `max_replicas`: cp updates the + DynamoDB record, then issues an AF-side in-place patch of the + underlying InferenceService's predictor scale bounds (no rolling + restart, no downtime). KServe propagates `max_replicas` through + to the KEDA `ScaledObject`'s `maxReplicaCount` on its next + reconcile. + + Other fields update the model record only and take effect on the + next deployment. + + Args: + workspace_id (str): + model_id (str): + body (CustomModelUpdate): Partial-update payload for `PATCH + /v1/{workspaceId}/custom-models/{modelId}`. + All fields are independently optional; omitted fields are left + unchanged. At least one field MUST be supplied. 
+ + Only a small, vetted set of fields are mutable post-create. + Immutable fields (`huggingface_model_id`, `weight_source`, + GPU topology, …) are not exposed here — change them by + deleting and re-creating the model. + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + CustomModel | Error | None + """ + + return sync_detailed( + workspace_id=workspace_id, + model_id=model_id, + client=client, + body=body, + ).parsed + + +async def asyncio_detailed( + workspace_id: str, + model_id: str, + *, + client: AuthenticatedClient | Client, + body: CustomModelUpdate, +) -> Response[CustomModel | Error]: + """Update mutable fields on a custom model + + Update the model record (cp-side) and propagate the change to the + live deployment when applicable. Only a small set of fields are + mutable post-create; immutable fields (`huggingface_model_id`, + `weight_source`, GPU topology, …) are intentionally not exposed + here — change them by deleting and re-creating the model. + + Today the only fields that take effect immediately on the live + deployment are `min_replicas` and `max_replicas`: cp updates the + DynamoDB record, then issues an AF-side in-place patch of the + underlying InferenceService's predictor scale bounds (no rolling + restart, no downtime). KServe propagates `max_replicas` through + to the KEDA `ScaledObject`'s `maxReplicaCount` on its next + reconcile. + + Other fields update the model record only and take effect on the + next deployment. + + Args: + workspace_id (str): + model_id (str): + body (CustomModelUpdate): Partial-update payload for `PATCH + /v1/{workspaceId}/custom-models/{modelId}`. + All fields are independently optional; omitted fields are left + unchanged. At least one field MUST be supplied. + + Only a small, vetted set of fields are mutable post-create. 
+ Immutable fields (`huggingface_model_id`, `weight_source`, + GPU topology, …) are not exposed here — change them by + deleting and re-creating the model. + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + Response[CustomModel | Error] + """ + + kwargs = _get_kwargs( + workspace_id=workspace_id, + model_id=model_id, + body=body, + ) + + response = await client.get_async_httpx_client().request(**kwargs) + + return _build_response(client=client, response=response) + + +async def asyncio( + workspace_id: str, + model_id: str, + *, + client: AuthenticatedClient | Client, + body: CustomModelUpdate, +) -> CustomModel | Error | None: + """Update mutable fields on a custom model + + Update the model record (cp-side) and propagate the change to the + live deployment when applicable. Only a small set of fields are + mutable post-create; immutable fields (`huggingface_model_id`, + `weight_source`, GPU topology, …) are intentionally not exposed + here — change them by deleting and re-creating the model. + + Today the only fields that take effect immediately on the live + deployment are `min_replicas` and `max_replicas`: cp updates the + DynamoDB record, then issues an AF-side in-place patch of the + underlying InferenceService's predictor scale bounds (no rolling + restart, no downtime). KServe propagates `max_replicas` through + to the KEDA `ScaledObject`'s `maxReplicaCount` on its next + reconcile. + + Other fields update the model record only and take effect on the + next deployment. + + Args: + workspace_id (str): + model_id (str): + body (CustomModelUpdate): Partial-update payload for `PATCH + /v1/{workspaceId}/custom-models/{modelId}`. + All fields are independently optional; omitted fields are left + unchanged. At least one field MUST be supplied. 
+ + Only a small, vetted set of fields are mutable post-create. + Immutable fields (`huggingface_model_id`, `weight_source`, + GPU topology, …) are not exposed here — change them by + deleting and re-creating the model. + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + CustomModel | Error | None + """ + + return ( + await asyncio_detailed( + workspace_id=workspace_id, + model_id=model_id, + client=client, + body=body, + ) + ).parsed diff --git a/src/graphn/_generated/models/__init__.py b/src/graphn/_generated/models/__init__.py index 8629c74..8bdfe18 100644 --- a/src/graphn/_generated/models/__init__.py +++ b/src/graphn/_generated/models/__init__.py @@ -1,5 +1,6 @@ """Contains all the data models used in inputs/outputs""" +from .architecture_info import ArchitectureInfo from .capability import Capability from .chat_completion_request import ChatCompletionRequest from .chat_completion_request_response_format import ChatCompletionRequestResponseFormat @@ -16,11 +17,13 @@ from .chat_message_tool_calls_item import ChatMessageToolCallsItem from .custom_model import CustomModel from .custom_model_access import CustomModelAccess +from .custom_model_artifact_type import CustomModelArtifactType from .custom_model_create import CustomModelCreate from .custom_model_create_quantization import CustomModelCreateQuantization from .custom_model_list import CustomModelList from .custom_model_quantization import CustomModelQuantization from .custom_model_status import CustomModelStatus +from .custom_model_update import CustomModelUpdate from .discover_imported_models_request import DiscoverImportedModelsRequest from .discover_imported_models_response import DiscoverImportedModelsResponse from .discovered_imported_model import DiscoveredImportedModel @@ -38,6 +41,7 @@ from .secret_create import SecretCreate from 
.secret_list import SecretList from .secret_update import SecretUpdate +from .supported_architectures import SupportedArchitectures from .test_connection_request import TestConnectionRequest from .test_connection_response import TestConnectionResponse from .test_connection_response_usage import TestConnectionResponseUsage @@ -46,9 +50,11 @@ from .validate_model_request import ValidateModelRequest from .validate_model_request_quantization import ValidateModelRequestQuantization from .validate_model_response import ValidateModelResponse +from .validate_model_response_artifact_type import ValidateModelResponseArtifactType from .weight_source import WeightSource __all__ = ( + "ArchitectureInfo", "Capability", "ChatCompletionRequest", "ChatCompletionRequestResponseFormat", @@ -63,11 +69,13 @@ "ChatMessageToolCallsItem", "CustomModel", "CustomModelAccess", + "CustomModelArtifactType", "CustomModelCreate", "CustomModelCreateQuantization", "CustomModelList", "CustomModelQuantization", "CustomModelStatus", + "CustomModelUpdate", "DiscoveredImportedModel", "DiscoverImportedModelsRequest", "DiscoverImportedModelsResponse", @@ -85,6 +93,7 @@ "SecretCreate", "SecretList", "SecretUpdate", + "SupportedArchitectures", "TestConnectionRequest", "TestConnectionResponse", "TestConnectionResponseUsage", @@ -93,5 +102,6 @@ "ValidateModelRequest", "ValidateModelRequestQuantization", "ValidateModelResponse", + "ValidateModelResponseArtifactType", "WeightSource", ) diff --git a/src/graphn/_generated/models/architecture_info.py b/src/graphn/_generated/models/architecture_info.py new file mode 100644 index 0000000..168c72d --- /dev/null +++ b/src/graphn/_generated/models/architecture_info.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any, TypeVar, cast + +from attrs import define as _attrs_define + +T = TypeVar("T", bound="ArchitectureInfo") + + +@_attrs_define +class ArchitectureInfo: + """ + Attributes: + name (str): 
HuggingFace `architectures[0]` value (e.g. `LlamaForCausalLM`, + `Qwen3VLMoeForConditionalGeneration`). + capabilities (list[str]): Capability tags this architecture exposes — `tool_calling`, + `vision`, `image_input`, `video_input`, `streaming`, `json_mode`. + Drives the UI capability filters and AF's per-feature gating. + """ + + name: str + capabilities: list[str] + + def to_dict(self) -> dict[str, Any]: + name = self.name + + capabilities = self.capabilities + + field_dict: dict[str, Any] = {} + + field_dict.update( + { + "name": name, + "capabilities": capabilities, + } + ) + + return field_dict + + @classmethod + def from_dict(cls: type[T], src_dict: Mapping[str, Any]) -> T: + d = dict(src_dict) + name = d.pop("name") + + capabilities = cast(list[str], d.pop("capabilities")) + + architecture_info = cls( + name=name, + capabilities=capabilities, + ) + + return architecture_info diff --git a/src/graphn/_generated/models/custom_model.py b/src/graphn/_generated/models/custom_model.py index 60e86f0..b39c286 100644 --- a/src/graphn/_generated/models/custom_model.py +++ b/src/graphn/_generated/models/custom_model.py @@ -9,6 +9,7 @@ from dateutil.parser import isoparse from ..models.capability import Capability +from ..models.custom_model_artifact_type import CustomModelArtifactType from ..models.custom_model_quantization import CustomModelQuantization from ..models.custom_model_status import CustomModelStatus from ..models.weight_source import WeightSource @@ -50,6 +51,29 @@ class CustomModel: huggingface_model_id (str | Unset): Set when `weight_source` is `huggingface`. s3_url (None | str | Unset): Set when `weight_source` is `s3_presigned` or `s3_assume_role`. s3_role_arn (None | str | Unset): Set when `weight_source` is `s3_assume_role`. + artifact_type (CustomModelArtifactType | Unset): Whether this import is a full base checkpoint or a LoRA + adapter on top of an allowlisted base. 
Set eagerly at + create-time: + + * **`huggingface`** imports are classified by probing + `adapter_config.json` on the upstream repo. + * **`s3_*`** imports are classified as `lora` iff + `base_model_id` is supplied on the create request; + otherwise the bundle is treated as `base`. If the + downloaded S3 bundle later turns out to be a LoRA + adapter (caller forgot the hint), the model deploys to + `failed` with an actionable `error_message`. + base_model_id (None | str | Unset): Populated when `artifact_type=lora`. The base model id the + adapter loads on top of. For HuggingFace imports this is + either `adapter_config.json::base_model_name_or_path` or + the caller's `base_model_id` override on + `CustomModelCreate` (the override wins on disagreement). + For S3 imports it is the caller-supplied + `base_model_id` from `CustomModelCreate`. + lora_adapter_name (None | str | Unset): vLLM routing name the LoRA adapter is served under. + Defaults to the model's short name; clients address it + via `model=` in chat completions. + lora_rank (int | None | Unset): `r` value from `adapter_config.json` when `artifact_type=lora`. max_model_len (int | None | Unset): Maximum context length in tokens. quantization (CustomModelQuantization | Unset): Weight quantization scheme, if any. replicas_available (int | None | Unset): Currently serving replicas (live status). 
@@ -78,6 +102,10 @@ class CustomModel: huggingface_model_id: str | Unset = UNSET s3_url: None | str | Unset = UNSET s3_role_arn: None | str | Unset = UNSET + artifact_type: CustomModelArtifactType | Unset = UNSET + base_model_id: None | str | Unset = UNSET + lora_adapter_name: None | str | Unset = UNSET + lora_rank: int | None | Unset = UNSET max_model_len: int | None | Unset = UNSET quantization: CustomModelQuantization | Unset = UNSET replicas_available: int | None | Unset = UNSET @@ -134,6 +162,28 @@ def to_dict(self) -> dict[str, Any]: else: s3_role_arn = self.s3_role_arn + artifact_type: str | Unset = UNSET + if not isinstance(self.artifact_type, Unset): + artifact_type = self.artifact_type.value + + base_model_id: None | str | Unset + if isinstance(self.base_model_id, Unset): + base_model_id = UNSET + else: + base_model_id = self.base_model_id + + lora_adapter_name: None | str | Unset + if isinstance(self.lora_adapter_name, Unset): + lora_adapter_name = UNSET + else: + lora_adapter_name = self.lora_adapter_name + + lora_rank: int | None | Unset + if isinstance(self.lora_rank, Unset): + lora_rank = UNSET + else: + lora_rank = self.lora_rank + max_model_len: int | None | Unset if isinstance(self.max_model_len, Unset): max_model_len = UNSET @@ -210,6 +260,14 @@ def to_dict(self) -> dict[str, Any]: field_dict["s3_url"] = s3_url if s3_role_arn is not UNSET: field_dict["s3_role_arn"] = s3_role_arn + if artifact_type is not UNSET: + field_dict["artifact_type"] = artifact_type + if base_model_id is not UNSET: + field_dict["base_model_id"] = base_model_id + if lora_adapter_name is not UNSET: + field_dict["lora_adapter_name"] = lora_adapter_name + if lora_rank is not UNSET: + field_dict["lora_rank"] = lora_rank if max_model_len is not UNSET: field_dict["max_model_len"] = max_model_len if quantization is not UNSET: @@ -285,6 +343,40 @@ def _parse_s3_role_arn(data: object) -> None | str | Unset: s3_role_arn = _parse_s3_role_arn(d.pop("s3_role_arn", UNSET)) + 
_artifact_type = d.pop("artifact_type", UNSET) + artifact_type: CustomModelArtifactType | Unset + if isinstance(_artifact_type, Unset): + artifact_type = UNSET + else: + artifact_type = CustomModelArtifactType(_artifact_type) + + def _parse_base_model_id(data: object) -> None | str | Unset: + if data is None: + return data + if isinstance(data, Unset): + return data + return cast(None | str | Unset, data) + + base_model_id = _parse_base_model_id(d.pop("base_model_id", UNSET)) + + def _parse_lora_adapter_name(data: object) -> None | str | Unset: + if data is None: + return data + if isinstance(data, Unset): + return data + return cast(None | str | Unset, data) + + lora_adapter_name = _parse_lora_adapter_name(d.pop("lora_adapter_name", UNSET)) + + def _parse_lora_rank(data: object) -> int | None | Unset: + if data is None: + return data + if isinstance(data, Unset): + return data + return cast(int | None | Unset, data) + + lora_rank = _parse_lora_rank(d.pop("lora_rank", UNSET)) + def _parse_max_model_len(data: object) -> int | None | Unset: if data is None: return data @@ -385,6 +477,10 @@ def _parse_architectures(data: object) -> list[str] | None | Unset: huggingface_model_id=huggingface_model_id, s3_url=s3_url, s3_role_arn=s3_role_arn, + artifact_type=artifact_type, + base_model_id=base_model_id, + lora_adapter_name=lora_adapter_name, + lora_rank=lora_rank, max_model_len=max_model_len, quantization=quantization, replicas_available=replicas_available, diff --git a/src/graphn/_generated/models/custom_model_artifact_type.py b/src/graphn/_generated/models/custom_model_artifact_type.py new file mode 100644 index 0000000..f2adb23 --- /dev/null +++ b/src/graphn/_generated/models/custom_model_artifact_type.py @@ -0,0 +1,9 @@ +from enum import Enum + + +class CustomModelArtifactType(str, Enum): + BASE = "base" + LORA = "lora" + + def __str__(self) -> str: + return str(self.value) diff --git a/src/graphn/_generated/models/custom_model_create.py 
b/src/graphn/_generated/models/custom_model_create.py index 6eeb477..49f1ac9 100644 --- a/src/graphn/_generated/models/custom_model_create.py +++ b/src/graphn/_generated/models/custom_model_create.py @@ -36,9 +36,14 @@ class CustomModelCreate: `s3_assume_role`. Conditional requirement is enforced by the server (returns 422); not encoded as a JSON Schema keyword for OAS-3.0-tooling compatibility. - s3_role_arn (str | Unset): Required when `weight_source` is `s3_assume_role`. - Conditional requirement is enforced by the server (returns - 422); not encoded as a JSON Schema keyword. + s3_role_arn (str | Unset): Required when `weight_source` is `s3_assume_role`. The role + name (the segment after `:role/`) must start with + `graphn-byom-`; GraphN's platform IAM policy is scoped to + that prefix as a defense-in-depth boundary, and the + customer-facing CloudFormation template enforces the same + constraint at stack-create time. Conditional requirement + (s3_assume_role only) is enforced by the server (returns + 422); the format itself is checked against this pattern. hf_token_secret_id (str | Unset): ID of a workspace secret holding a HuggingFace access token. Required for gated HuggingFace models. gpu_count (int | Unset): Default: 1. @@ -49,6 +54,27 @@ class CustomModelCreate: min_replicas (int | Unset): Default: 0. max_replicas (int | Unset): Default: 1. cooldown_seconds (int | Unset): Default: 600. + base_model_id (str | Unset): Override / hint for LoRA imports. Must be one of the + platform's allowlisted base models (see + `GET /v1/{workspaceId}/custom-models/supported-architectures`). + + * **`weight_source=s3_*`**: this is the **only** way to + classify the bundle as a LoRA adapter at create-time -- + omitting it routes the import through the base path, + and a bundle that later turns out to be a LoRA adapter + will deploy to `failed` with an actionable error + ("re-create with `base_model_id` set"). 
+ * **`weight_source=huggingface`**: the field **overrides** + `adapter_config.json::base_model_name_or_path` from the + adapter repo. Useful for adapters trained against a + local filesystem path (e.g. `C:/users/.../base`) whose + recorded base id isn't a valid HF id. When the override + disagrees with the adapter's declared base the caller's + value wins; the disagreement is logged server-side for + debuggability. + + Ignored when the resolved artifact type is `base`. + Example: Qwen/Qwen3.5-4B. """ name: str @@ -66,6 +92,7 @@ class CustomModelCreate: min_replicas: int | Unset = 0 max_replicas: int | Unset = 1 cooldown_seconds: int | Unset = 600 + base_model_id: str | Unset = UNSET def to_dict(self) -> dict[str, Any]: name = self.name @@ -107,6 +134,8 @@ def to_dict(self) -> dict[str, Any]: cooldown_seconds = self.cooldown_seconds + base_model_id = self.base_model_id + field_dict: dict[str, Any] = {} field_dict.update( @@ -141,6 +170,8 @@ def to_dict(self) -> dict[str, Any]: field_dict["max_replicas"] = max_replicas if cooldown_seconds is not UNSET: field_dict["cooldown_seconds"] = cooldown_seconds + if base_model_id is not UNSET: + field_dict["base_model_id"] = base_model_id return field_dict @@ -194,6 +225,8 @@ def from_dict(cls: type[T], src_dict: Mapping[str, Any]) -> T: cooldown_seconds = d.pop("cooldown_seconds", UNSET) + base_model_id = d.pop("base_model_id", UNSET) + custom_model_create = cls( name=name, huggingface_model_id=huggingface_model_id, @@ -210,6 +243,7 @@ def from_dict(cls: type[T], src_dict: Mapping[str, Any]) -> T: min_replicas=min_replicas, max_replicas=max_replicas, cooldown_seconds=cooldown_seconds, + base_model_id=base_model_id, ) return custom_model_create diff --git a/src/graphn/_generated/models/custom_model_update.py b/src/graphn/_generated/models/custom_model_update.py new file mode 100644 index 0000000..da74ce8 --- /dev/null +++ b/src/graphn/_generated/models/custom_model_update.py @@ -0,0 +1,83 @@ +from __future__ import 
annotations + +from collections.abc import Mapping +from typing import Any, TypeVar + +from attrs import define as _attrs_define + +from ..types import UNSET, Unset + +T = TypeVar("T", bound="CustomModelUpdate") + + +@_attrs_define +class CustomModelUpdate: + """Partial-update payload for `PATCH /v1/{workspaceId}/custom-models/{modelId}`. + All fields are independently optional; omitted fields are left + unchanged. At least one field MUST be supplied. + + Only a small, vetted set of fields are mutable post-create. + Immutable fields (`huggingface_model_id`, `weight_source`, + GPU topology, …) are not exposed here — change them by + deleting and re-creating the model. + + Attributes: + name (str | Unset): Display name. Persisted to the model record only. + min_replicas (int | Unset): New floor for the autoscaler. `0` re-enables scale-to-zero; + any value `>= 1` keeps the model warm. Applied to the live + deployment in place — no rolling restart, no downtime. + max_replicas (int | Unset): New ceiling for the autoscaler. KServe propagates the value + to the underlying KEDA `ScaledObject`'s `maxReplicaCount` on + its next reconcile. Applied to the live deployment in place. + cooldown_seconds (int | Unset): Idle period (in seconds) the controller waits before scaling + an idle replica back to zero. Persisted to the model record; + the controller picks it up on the next reconcile. 
+ """ + + name: str | Unset = UNSET + min_replicas: int | Unset = UNSET + max_replicas: int | Unset = UNSET + cooldown_seconds: int | Unset = UNSET + + def to_dict(self) -> dict[str, Any]: + name = self.name + + min_replicas = self.min_replicas + + max_replicas = self.max_replicas + + cooldown_seconds = self.cooldown_seconds + + field_dict: dict[str, Any] = {} + + field_dict.update({}) + if name is not UNSET: + field_dict["name"] = name + if min_replicas is not UNSET: + field_dict["min_replicas"] = min_replicas + if max_replicas is not UNSET: + field_dict["max_replicas"] = max_replicas + if cooldown_seconds is not UNSET: + field_dict["cooldown_seconds"] = cooldown_seconds + + return field_dict + + @classmethod + def from_dict(cls: type[T], src_dict: Mapping[str, Any]) -> T: + d = dict(src_dict) + name = d.pop("name", UNSET) + + min_replicas = d.pop("min_replicas", UNSET) + + max_replicas = d.pop("max_replicas", UNSET) + + cooldown_seconds = d.pop("cooldown_seconds", UNSET) + + custom_model_update = cls( + name=name, + min_replicas=min_replicas, + max_replicas=max_replicas, + cooldown_seconds=cooldown_seconds, + ) + + return custom_model_update diff --git a/src/graphn/_generated/models/supported_architectures.py b/src/graphn/_generated/models/supported_architectures.py new file mode 100644 index 0000000..6133402 --- /dev/null +++ b/src/graphn/_generated/models/supported_architectures.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, TypeVar + +from attrs import define as _attrs_define + +if TYPE_CHECKING: + from ..models.architecture_info import ArchitectureInfo + + +T = TypeVar("T", bound="SupportedArchitectures") + + +@_attrs_define +class SupportedArchitectures: + """ + Attributes: + architectures (list[ArchitectureInfo]): Sorted (by `name`) list of architectures the platform's serving + runtimes can deploy. 
+ """ + + architectures: list[ArchitectureInfo] + + def to_dict(self) -> dict[str, Any]: + architectures = [] + for architectures_item_data in self.architectures: + architectures_item = architectures_item_data.to_dict() + architectures.append(architectures_item) + + field_dict: dict[str, Any] = {} + + field_dict.update( + { + "architectures": architectures, + } + ) + + return field_dict + + @classmethod + def from_dict(cls: type[T], src_dict: Mapping[str, Any]) -> T: + from ..models.architecture_info import ArchitectureInfo + + d = dict(src_dict) + architectures = [] + _architectures = d.pop("architectures") + for architectures_item_data in _architectures: + architectures_item = ArchitectureInfo.from_dict(architectures_item_data) + + architectures.append(architectures_item) + + supported_architectures = cls( + architectures=architectures, + ) + + return supported_architectures diff --git a/src/graphn/_generated/models/validate_model_request.py b/src/graphn/_generated/models/validate_model_request.py index acf74bb..197b62f 100644 --- a/src/graphn/_generated/models/validate_model_request.py +++ b/src/graphn/_generated/models/validate_model_request.py @@ -21,12 +21,18 @@ class ValidateModelRequest: hf_token_secret_id (None | str | Unset): ID of a workspace secret holding a HuggingFace token. quantization (ValidateModelRequestQuantization | Unset): gpu_memory_utilization (float | Unset): Default: 0.9. + model_size_gb (int | None | Unset): Optional caller-supplied estimate of the on-disk weights size, + in GiB. When provided, the platform sizes the model-weights PVC + from this hint instead of waiting for a HuggingFace head-bytes + probe; useful for very large models where the probe would + otherwise stall the validate response. 
""" huggingface_model_id: str hf_token_secret_id: None | str | Unset = UNSET quantization: ValidateModelRequestQuantization | Unset = UNSET gpu_memory_utilization: float | Unset = 0.9 + model_size_gb: int | None | Unset = UNSET def to_dict(self) -> dict[str, Any]: huggingface_model_id = self.huggingface_model_id @@ -43,6 +49,12 @@ def to_dict(self) -> dict[str, Any]: gpu_memory_utilization = self.gpu_memory_utilization + model_size_gb: int | None | Unset + if isinstance(self.model_size_gb, Unset): + model_size_gb = UNSET + else: + model_size_gb = self.model_size_gb + field_dict: dict[str, Any] = {} field_dict.update( @@ -56,6 +68,8 @@ def to_dict(self) -> dict[str, Any]: field_dict["quantization"] = quantization if gpu_memory_utilization is not UNSET: field_dict["gpu_memory_utilization"] = gpu_memory_utilization + if model_size_gb is not UNSET: + field_dict["model_size_gb"] = model_size_gb return field_dict @@ -84,11 +98,21 @@ def _parse_hf_token_secret_id(data: object) -> None | str | Unset: gpu_memory_utilization = d.pop("gpu_memory_utilization", UNSET) + def _parse_model_size_gb(data: object) -> int | None | Unset: + if data is None: + return data + if isinstance(data, Unset): + return data + return cast(int | None | Unset, data) + + model_size_gb = _parse_model_size_gb(d.pop("model_size_gb", UNSET)) + validate_model_request = cls( huggingface_model_id=huggingface_model_id, hf_token_secret_id=hf_token_secret_id, quantization=quantization, gpu_memory_utilization=gpu_memory_utilization, + model_size_gb=model_size_gb, ) return validate_model_request diff --git a/src/graphn/_generated/models/validate_model_response.py b/src/graphn/_generated/models/validate_model_response.py index 35ba902..b2519ba 100644 --- a/src/graphn/_generated/models/validate_model_response.py +++ b/src/graphn/_generated/models/validate_model_response.py @@ -6,6 +6,9 @@ from attrs import define as _attrs_define from attrs import field as _attrs_field +from 
..models.validate_model_response_artifact_type import ( + ValidateModelResponseArtifactType, +) from ..types import UNSET, Unset T = TypeVar("T", bound="ValidateModelResponse") @@ -22,6 +25,24 @@ class ValidateModelResponse: num_params (int | None | Unset): estimated_memory_gb (float | None | Unset): max_context_length (int | None | Unset): + artifact_type (ValidateModelResponseArtifactType | Unset): `lora` when AF detected an `adapter_config.json` in + the HuggingFace + repo at validate time; `base` otherwise (the default — what every + existing caller saw before the LoRA auto-detect work landed). Use + this to branch in client code without keeping track of two + different `weight_source` enum values for the HF case. + + When `artifact_type=lora`, the `architectures`, `num_params`, + `estimated_memory_gb`, and `max_context_length` fields describe + the **base** model (resolved from `adapter_config.json`), not + the adapter itself. + Default: ValidateModelResponseArtifactType.BASE. + detected_base_model_id (None | str | Unset): Populated only when `artifact_type=lora`. The base model id read + from `adapter_config.json::base_model_name_or_path`. Use to pin + the base on subsequent deploy calls or to surface a "detected as + LoRA adapter for X" affordance in your UI. + lora_rank (int | None | Unset): Populated only when `artifact_type=lora`. The `r` value from + `adapter_config.json` (LoRA rank). 
""" valid: bool @@ -31,6 +52,11 @@ class ValidateModelResponse: num_params: int | None | Unset = UNSET estimated_memory_gb: float | None | Unset = UNSET max_context_length: int | None | Unset = UNSET + artifact_type: ValidateModelResponseArtifactType | Unset = ( + ValidateModelResponseArtifactType.BASE + ) + detected_base_model_id: None | str | Unset = UNSET + lora_rank: int | None | Unset = UNSET additional_properties: dict[str, Any] = _attrs_field(init=False, factory=dict) def to_dict(self) -> dict[str, Any]: @@ -66,6 +92,22 @@ def to_dict(self) -> dict[str, Any]: else: max_context_length = self.max_context_length + artifact_type: str | Unset = UNSET + if not isinstance(self.artifact_type, Unset): + artifact_type = self.artifact_type.value + + detected_base_model_id: None | str | Unset + if isinstance(self.detected_base_model_id, Unset): + detected_base_model_id = UNSET + else: + detected_base_model_id = self.detected_base_model_id + + lora_rank: int | None | Unset + if isinstance(self.lora_rank, Unset): + lora_rank = UNSET + else: + lora_rank = self.lora_rank + field_dict: dict[str, Any] = {} field_dict.update(self.additional_properties) field_dict.update( @@ -85,6 +127,12 @@ def to_dict(self) -> dict[str, Any]: field_dict["estimated_memory_gb"] = estimated_memory_gb if max_context_length is not UNSET: field_dict["max_context_length"] = max_context_length + if artifact_type is not UNSET: + field_dict["artifact_type"] = artifact_type + if detected_base_model_id is not UNSET: + field_dict["detected_base_model_id"] = detected_base_model_id + if lora_rank is not UNSET: + field_dict["lora_rank"] = lora_rank return field_dict @@ -137,6 +185,33 @@ def _parse_max_context_length(data: object) -> int | None | Unset: d.pop("max_context_length", UNSET) ) + _artifact_type = d.pop("artifact_type", UNSET) + artifact_type: ValidateModelResponseArtifactType | Unset + if isinstance(_artifact_type, Unset): + artifact_type = UNSET + else: + artifact_type = 
ValidateModelResponseArtifactType(_artifact_type) + + def _parse_detected_base_model_id(data: object) -> None | str | Unset: + if data is None: + return data + if isinstance(data, Unset): + return data + return cast(None | str | Unset, data) + + detected_base_model_id = _parse_detected_base_model_id( + d.pop("detected_base_model_id", UNSET) + ) + + def _parse_lora_rank(data: object) -> int | None | Unset: + if data is None: + return data + if isinstance(data, Unset): + return data + return cast(int | None | Unset, data) + + lora_rank = _parse_lora_rank(d.pop("lora_rank", UNSET)) + validate_model_response = cls( valid=valid, error=error, @@ -145,6 +220,9 @@ def _parse_max_context_length(data: object) -> int | None | Unset: num_params=num_params, estimated_memory_gb=estimated_memory_gb, max_context_length=max_context_length, + artifact_type=artifact_type, + detected_base_model_id=detected_base_model_id, + lora_rank=lora_rank, ) validate_model_response.additional_properties = d diff --git a/src/graphn/_generated/models/validate_model_response_artifact_type.py b/src/graphn/_generated/models/validate_model_response_artifact_type.py new file mode 100644 index 0000000..07adb92 --- /dev/null +++ b/src/graphn/_generated/models/validate_model_response_artifact_type.py @@ -0,0 +1,9 @@ +from enum import Enum + + +class ValidateModelResponseArtifactType(str, Enum): + BASE = "base" + LORA = "lora" + + def __str__(self) -> str: + return str(self.value)