From 0d0e7ab6a5f57ab59bbf027d76345f12ddad6590 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Dec 2025 07:18:33 +0000 Subject: [PATCH 01/10] Add Clusters API wrapper Implemented comprehensive support for the Clusters API: - Created ClustersService with methods for cluster management (create, get, delete, scale) - Added Cluster and ClusterNode dataclasses for type safety - Integrated clusters service into VerdaClient - Added ClusterStatus constants for cluster lifecycle management - Created comprehensive unit tests (13 tests covering all API operations) - Added detailed example demonstrating cluster operations - All tests pass (125/125) --- examples/clusters_example.py | 159 ++++++++++ tests/unit_tests/clusters/__init__.py | 0 tests/unit_tests/clusters/test_clusters.py | 353 +++++++++++++++++++++ verda/_verda.py | 4 + verda/clusters/__init__.py | 5 + verda/clusters/_clusters.py | 234 ++++++++++++++ verda/constants.py | 19 ++ 7 files changed, 774 insertions(+) create mode 100644 examples/clusters_example.py create mode 100644 tests/unit_tests/clusters/__init__.py create mode 100644 tests/unit_tests/clusters/test_clusters.py create mode 100644 verda/clusters/__init__.py create mode 100644 verda/clusters/_clusters.py diff --git a/examples/clusters_example.py b/examples/clusters_example.py new file mode 100644 index 0000000..3ac6e3f --- /dev/null +++ b/examples/clusters_example.py @@ -0,0 +1,159 @@ +""" +Example demonstrating how to use the Clusters API. 
+ +This example shows how to: +- Create a new compute cluster +- List all clusters +- Get a specific cluster by ID +- Get cluster nodes +- Scale a cluster +- Delete a cluster +""" + +import os + +from verda import VerdaClient +from verda.constants import Locations + +# Get credentials from environment variables +CLIENT_ID = os.environ.get('VERDA_CLIENT_ID') +CLIENT_SECRET = os.environ.get('VERDA_CLIENT_SECRET') + +# Create client +verda = VerdaClient(CLIENT_ID, CLIENT_SECRET) + + +def create_cluster_example(): + """Create a new compute cluster.""" + # Get SSH keys + ssh_keys = [key.id for key in verda.ssh_keys.get()] + + # Create a cluster with 3 nodes + cluster = verda.clusters.create( + name='my-compute-cluster', + instance_type='8V100.48V', + node_count=3, + image='ubuntu-24.04-cuda-12.8-open-docker', + description='Example compute cluster for distributed training', + ssh_key_ids=ssh_keys, + location=Locations.FIN_03, + ) + + print(f'Created cluster: {cluster.id}') + print(f'Cluster name: {cluster.name}') + print(f'Cluster status: {cluster.status}') + print(f'Number of nodes: {cluster.node_count}') + print(f'Instance type: {cluster.instance_type}') + print(f'Location: {cluster.location}') + + return cluster + + +def list_clusters_example(): + """List all clusters.""" + # Get all clusters + clusters = verda.clusters.get() + + print(f'\nFound {len(clusters)} cluster(s):') + for cluster in clusters: + print(f' - {cluster.name} ({cluster.id}): {cluster.status} - {cluster.node_count} nodes') + + # Get clusters with specific status + running_clusters = verda.clusters.get(status=verda.constants.cluster_status.RUNNING) + print(f'\nFound {len(running_clusters)} running cluster(s)') + + return clusters + + +def get_cluster_by_id_example(cluster_id: str): + """Get a specific cluster by ID.""" + cluster = verda.clusters.get_by_id(cluster_id) + + print('\nCluster details:') + print(f' ID: {cluster.id}') + print(f' Name: {cluster.name}') + print(f' Description: 
{cluster.description}') + print(f' Status: {cluster.status}') + print(f' Instance type: {cluster.instance_type}') + print(f' Node count: {cluster.node_count}') + print(f' Created at: {cluster.created_at}') + if cluster.master_ip: + print(f' Master IP: {cluster.master_ip}') + if cluster.endpoint: + print(f' Endpoint: {cluster.endpoint}') + + return cluster + + +def get_cluster_nodes_example(cluster_id: str): + """Get all nodes in a cluster.""" + nodes = verda.clusters.get_nodes(cluster_id) + + print(f'\nCluster has {len(nodes)} node(s):') + for i, node in enumerate(nodes, 1): + print(f'\n Node {i}:') + print(f' ID: {node.id}') + print(f' Hostname: {node.hostname}') + print(f' Status: {node.status}') + print(f' IP: {node.ip}') + print(f' Instance type: {node.instance_type}') + + return nodes + + +def scale_cluster_example(cluster_id: str, new_node_count: int): + """Scale a cluster to a new number of nodes.""" + print(f'\nScaling cluster {cluster_id} to {new_node_count} nodes...') + + cluster = verda.clusters.scale(cluster_id, new_node_count) + + print('Cluster scaled successfully') + print(f'Current node count: {cluster.node_count}') + print(f'Cluster status: {cluster.status}') + + return cluster + + +def delete_cluster_example(cluster_id: str): + """Delete a cluster.""" + print(f'\nDeleting cluster {cluster_id}...') + + verda.clusters.delete(cluster_id) + + print('Cluster deleted successfully') + + +def main(): + """Run all cluster examples.""" + print('=== Clusters API Example ===\n') + + # Create a new cluster + print('1. Creating a new cluster...') + cluster = create_cluster_example() + cluster_id = cluster.id + + # List all clusters + print('\n2. Listing all clusters...') + list_clusters_example() + + # Get cluster by ID + print('\n3. Getting cluster details...') + get_cluster_by_id_example(cluster_id) + + # Get cluster nodes + print('\n4. Getting cluster nodes...') + get_cluster_nodes_example(cluster_id) + + # Scale the cluster + print('\n5. 
Scaling the cluster...') + scale_cluster_example(cluster_id, 5) + + # Delete the cluster + print('\n6. Deleting the cluster...') + delete_cluster_example(cluster_id) + + print('\n=== Example completed successfully ===') + + +if __name__ == '__main__': + main() diff --git a/tests/unit_tests/clusters/__init__.py b/tests/unit_tests/clusters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/clusters/test_clusters.py b/tests/unit_tests/clusters/test_clusters.py new file mode 100644 index 0000000..9211fc4 --- /dev/null +++ b/tests/unit_tests/clusters/test_clusters.py @@ -0,0 +1,353 @@ +import pytest +import responses # https://github.com/getsentry/responses + +from verda.clusters import Cluster, ClusterNode, ClustersService +from verda.constants import ErrorCodes, Locations +from verda.exceptions import APIException + +INVALID_REQUEST = ErrorCodes.INVALID_REQUEST +INVALID_REQUEST_MESSAGE = 'Invalid cluster request' + +CLUSTER_ID = 'deadc0de-a5d2-4972-ae4e-d429115d055b' +SSH_KEY_ID = '12345dc1-a5d2-4972-ae4e-d429115d055b' + +CLUSTER_NAME = 'test-cluster' +CLUSTER_DESCRIPTION = 'Test compute cluster' +CLUSTER_STATUS = 'running' +CLUSTER_INSTANCE_TYPE = '8V100.48V' +CLUSTER_NODE_COUNT = 3 +CLUSTER_LOCATION = Locations.FIN_01 +CLUSTER_IMAGE = 'ubuntu-24.04-cuda-12.8-open-docker' +CLUSTER_CREATED_AT = '2024-01-01T00:00:00Z' +CLUSTER_MASTER_IP = '10.0.0.1' +CLUSTER_ENDPOINT = 'cluster-endpoint.verda.com' + +NODE_1_ID = 'node1-c0de-a5d2-4972-ae4e-d429115d055b' +NODE_2_ID = 'node2-c0de-a5d2-4972-ae4e-d429115d055b' +NODE_3_ID = 'node3-c0de-a5d2-4972-ae4e-d429115d055b' + +NODES_PAYLOAD = [ + { + 'id': NODE_1_ID, + 'instance_type': CLUSTER_INSTANCE_TYPE, + 'status': 'running', + 'hostname': 'test-cluster-node-1', + 'ip': '10.0.0.2', + 'created_at': CLUSTER_CREATED_AT, + }, + { + 'id': NODE_2_ID, + 'instance_type': CLUSTER_INSTANCE_TYPE, + 'status': 'running', + 'hostname': 'test-cluster-node-2', + 'ip': '10.0.0.3', + 'created_at': 
CLUSTER_CREATED_AT, + }, + { + 'id': NODE_3_ID, + 'instance_type': CLUSTER_INSTANCE_TYPE, + 'status': 'running', + 'hostname': 'test-cluster-node-3', + 'ip': '10.0.0.4', + 'created_at': CLUSTER_CREATED_AT, + }, +] + +CLUSTER_PAYLOAD = [ + { + 'id': CLUSTER_ID, + 'name': CLUSTER_NAME, + 'description': CLUSTER_DESCRIPTION, + 'status': CLUSTER_STATUS, + 'created_at': CLUSTER_CREATED_AT, + 'location': CLUSTER_LOCATION, + 'instance_type': CLUSTER_INSTANCE_TYPE, + 'node_count': CLUSTER_NODE_COUNT, + 'nodes': NODES_PAYLOAD, + 'ssh_key_ids': [SSH_KEY_ID], + 'image': CLUSTER_IMAGE, + 'master_ip': CLUSTER_MASTER_IP, + 'endpoint': CLUSTER_ENDPOINT, + } +] + + +class TestClustersService: + @pytest.fixture + def clusters_service(self, http_client): + return ClustersService(http_client) + + @pytest.fixture + def endpoint(self, http_client): + return http_client._base_url + '/clusters' + + def test_get_clusters(self, clusters_service, endpoint): + # arrange - add response mock + responses.add(responses.GET, endpoint, json=CLUSTER_PAYLOAD, status=200) + + # act + clusters = clusters_service.get() + cluster = clusters[0] + + # assert + assert isinstance(clusters, list) + assert len(clusters) == 1 + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.name == CLUSTER_NAME + assert cluster.description == CLUSTER_DESCRIPTION + assert cluster.status == CLUSTER_STATUS + assert cluster.created_at == CLUSTER_CREATED_AT + assert cluster.location == CLUSTER_LOCATION + assert cluster.instance_type == CLUSTER_INSTANCE_TYPE + assert cluster.node_count == CLUSTER_NODE_COUNT + assert isinstance(cluster.nodes, list) + assert len(cluster.nodes) == CLUSTER_NODE_COUNT + assert isinstance(cluster.nodes[0], ClusterNode) + assert cluster.ssh_key_ids == [SSH_KEY_ID] + assert cluster.image == CLUSTER_IMAGE + assert cluster.master_ip == CLUSTER_MASTER_IP + assert cluster.endpoint == CLUSTER_ENDPOINT + assert responses.assert_call_count(endpoint, 1) is True + + def 
test_get_clusters_by_status_successful(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '?status=running' + responses.add(responses.GET, url, json=CLUSTER_PAYLOAD, status=200) + + # act + clusters = clusters_service.get(status='running') + cluster = clusters[0] + + # assert + assert isinstance(clusters, list) + assert len(clusters) == 1 + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.status == CLUSTER_STATUS + assert responses.assert_call_count(url, 1) is True + + def test_get_clusters_by_status_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '?status=invalid_status' + responses.add( + responses.GET, + url, + json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.get(status='invalid_status') + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True + + def test_get_cluster_by_id_successful(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/' + CLUSTER_ID + responses.add(responses.GET, url, json=CLUSTER_PAYLOAD[0], status=200) + + # act + cluster = clusters_service.get_by_id(CLUSTER_ID) + + # assert + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.name == CLUSTER_NAME + assert cluster.description == CLUSTER_DESCRIPTION + assert cluster.status == CLUSTER_STATUS + assert cluster.instance_type == CLUSTER_INSTANCE_TYPE + assert cluster.node_count == CLUSTER_NODE_COUNT + assert responses.assert_call_count(url, 1) is True + + def test_get_cluster_by_id_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/invalid_id' + responses.add( + responses.GET, + url, + json={'code': INVALID_REQUEST, 'message': 
INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.get_by_id('invalid_id') + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True + + def test_create_cluster_successful(self, clusters_service, endpoint): + # arrange - add response mock + # create cluster + responses.add(responses.POST, endpoint, body=CLUSTER_ID, status=200) + # get cluster by id + url = endpoint + '/' + CLUSTER_ID + responses.add(responses.GET, url, json=CLUSTER_PAYLOAD[0], status=200) + + # act + cluster = clusters_service.create( + name=CLUSTER_NAME, + instance_type=CLUSTER_INSTANCE_TYPE, + node_count=CLUSTER_NODE_COUNT, + image=CLUSTER_IMAGE, + description=CLUSTER_DESCRIPTION, + ssh_key_ids=[SSH_KEY_ID], + location=CLUSTER_LOCATION, + ) + + # assert + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.name == CLUSTER_NAME + assert cluster.description == CLUSTER_DESCRIPTION + assert cluster.status == CLUSTER_STATUS + assert cluster.instance_type == CLUSTER_INSTANCE_TYPE + assert cluster.node_count == CLUSTER_NODE_COUNT + assert cluster.ssh_key_ids == [SSH_KEY_ID] + assert cluster.location == CLUSTER_LOCATION + assert cluster.image == CLUSTER_IMAGE + assert responses.assert_call_count(endpoint, 1) is True + assert responses.assert_call_count(url, 1) is True + + def test_create_cluster_failed(self, clusters_service, endpoint): + # arrange - add response mock + responses.add( + responses.POST, + endpoint, + json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.create( + name=CLUSTER_NAME, + instance_type=CLUSTER_INSTANCE_TYPE, + node_count=CLUSTER_NODE_COUNT, + image=CLUSTER_IMAGE, + description=CLUSTER_DESCRIPTION, + ) + + # assert + assert excinfo.value.code == 
INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(endpoint, 1) is True + + def test_delete_cluster_successful(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/' + CLUSTER_ID + responses.add(responses.DELETE, url, status=202) + + # act + result = clusters_service.delete(CLUSTER_ID) + + # assert + assert result is None + assert responses.assert_call_count(url, 1) is True + + def test_delete_cluster_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/invalid_id' + responses.add( + responses.DELETE, + url, + json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.delete('invalid_id') + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True + + def test_scale_cluster_successful(self, clusters_service, endpoint): + # arrange - add response mock + new_node_count = 5 + scaled_payload = CLUSTER_PAYLOAD[0].copy() + scaled_payload['node_count'] = new_node_count + + # scale endpoint + scale_url = endpoint + '/' + CLUSTER_ID + '/scale' + responses.add(responses.PUT, scale_url, status=200) + + # get cluster by id + get_url = endpoint + '/' + CLUSTER_ID + responses.add(responses.GET, get_url, json=scaled_payload, status=200) + + # act + cluster = clusters_service.scale(CLUSTER_ID, new_node_count) + + # assert + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.node_count == new_node_count + assert responses.assert_call_count(scale_url, 1) is True + assert responses.assert_call_count(get_url, 1) is True + + def test_scale_cluster_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/' + CLUSTER_ID + '/scale' + responses.add( + responses.PUT, + url, 
+ json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.scale(CLUSTER_ID, 5) + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True + + def test_get_cluster_nodes_successful(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/' + CLUSTER_ID + '/nodes' + responses.add(responses.GET, url, json=NODES_PAYLOAD, status=200) + + # act + nodes = clusters_service.get_nodes(CLUSTER_ID) + + # assert + assert isinstance(nodes, list) + assert len(nodes) == CLUSTER_NODE_COUNT + assert isinstance(nodes[0], ClusterNode) + assert nodes[0].id == NODE_1_ID + assert nodes[0].instance_type == CLUSTER_INSTANCE_TYPE + assert nodes[0].status == 'running' + assert nodes[1].id == NODE_2_ID + assert nodes[2].id == NODE_3_ID + assert responses.assert_call_count(url, 1) is True + + def test_get_cluster_nodes_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/invalid_id/nodes' + responses.add( + responses.GET, + url, + json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.get_nodes('invalid_id') + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True diff --git a/verda/_verda.py b/verda/_verda.py index b3d9922..a4b4cfc 100644 --- a/verda/_verda.py +++ b/verda/_verda.py @@ -1,6 +1,7 @@ from verda._version import __version__ from verda.authentication import AuthenticationService from verda.balance import BalanceService +from verda.clusters import ClustersService from verda.constants import Constants from verda.containers import ContainersService from verda.http_client import 
HTTPClient @@ -79,5 +80,8 @@ def __init__( self.containers: ContainersService = ContainersService(self._http_client, inference_key) """Containers service. Deploy, manage, and monitor container deployments""" + self.clusters: ClustersService = ClustersService(self._http_client) + """Clusters service. Create, manage, and scale compute clusters""" + __all__ = ['VerdaClient'] diff --git a/verda/clusters/__init__.py b/verda/clusters/__init__.py new file mode 100644 index 0000000..ff07e58 --- /dev/null +++ b/verda/clusters/__init__.py @@ -0,0 +1,5 @@ +"""Clusters service for managing compute clusters.""" + +from verda.clusters._clusters import Cluster, ClusterNode, ClustersService + +__all__ = ['Cluster', 'ClusterNode', 'ClustersService'] diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py new file mode 100644 index 0000000..f0010d7 --- /dev/null +++ b/verda/clusters/_clusters.py @@ -0,0 +1,234 @@ +import itertools +import time +from dataclasses import dataclass +from typing import Literal + +from dataclasses_json import dataclass_json + +from verda.constants import Locations + +CLUSTERS_ENDPOINT = '/clusters' + +ClusterStatus = Literal[ + 'creating', 'running', 'scaling', 'updating', 'deleting', 'deleted', 'error' +] + + +@dataclass_json +@dataclass +class ClusterNode: + """Represents a node in a cluster. + + Attributes: + id: Unique identifier for the node (instance ID). + instance_type: Type of the instance for this node. + status: Current status of the node. + hostname: Network hostname of the node. + ip: IP address of the node. + created_at: Timestamp of node creation. + """ + + id: str + instance_type: str + status: str + hostname: str + ip: str | None = None + created_at: str | None = None + + +@dataclass_json +@dataclass +class Cluster: + """Represents a compute cluster with multiple nodes. + + Attributes: + id: Unique identifier for the cluster. + name: Human-readable name of the cluster. + description: Description of the cluster. 
+ status: Current operational status of the cluster. + created_at: Timestamp of cluster creation. + location: Datacenter location code (default: Locations.FIN_03). + instance_type: Type of instances used for cluster nodes. + node_count: Number of nodes in the cluster. + nodes: List of nodes in the cluster. + ssh_key_ids: List of SSH key IDs associated with the cluster nodes. + image: Image ID or type used for cluster nodes. + startup_script_id: ID of the startup script to run on nodes. + master_ip: IP address of the cluster master/coordinator node. + endpoint: Cluster access endpoint. + """ + + id: str + name: str + description: str + status: str + created_at: str + location: str + instance_type: str + node_count: int + nodes: list[ClusterNode] + ssh_key_ids: list[str] + image: str | None = None + startup_script_id: str | None = None + master_ip: str | None = None + endpoint: str | None = None + + +class ClustersService: + """Service for managing compute clusters through the API. + + This service provides methods to create, retrieve, scale, and manage compute clusters. + """ + + def __init__(self, http_client) -> None: + """Initializes the ClustersService with an HTTP client. + + Args: + http_client: HTTP client for making API requests. + """ + self._http_client = http_client + + def get(self, status: str | None = None) -> list[Cluster]: + """Retrieves all clusters or clusters with specific status. + + Args: + status: Optional status filter for clusters. If None, returns all + non-deleted clusters. + + Returns: + List of cluster objects matching the criteria. + """ + clusters_dict = self._http_client.get(CLUSTERS_ENDPOINT, params={'status': status}).json() + return [ + Cluster.from_dict(cluster_dict, infer_missing=True) for cluster_dict in clusters_dict + ] + + def get_by_id(self, id: str) -> Cluster: + """Retrieves a specific cluster by its ID. + + Args: + id: Unique identifier of the cluster to retrieve. + + Returns: + Cluster object with the specified ID. 
+ + Raises: + HTTPError: If the cluster is not found or other API error occurs. + """ + cluster_dict = self._http_client.get(CLUSTERS_ENDPOINT + f'/{id}').json() + return Cluster.from_dict(cluster_dict, infer_missing=True) + + def create( + self, + name: str, + instance_type: str, + node_count: int, + image: str, + description: str, + ssh_key_ids: list = [], + location: str = Locations.FIN_03, + startup_script_id: str | None = None, + *, + max_wait_time: float = 300, + initial_interval: float = 1.0, + max_interval: float = 10, + backoff_coefficient: float = 2.0, + ) -> Cluster: + """Creates and deploys a new compute cluster. + + Args: + name: Name for the cluster. + instance_type: Type of instances to use for cluster nodes (e.g., '8V100.48V'). + node_count: Number of nodes to create in the cluster. + image: Image type or ID for cluster nodes. + description: Human-readable description of the cluster. + ssh_key_ids: List of SSH key IDs to associate with cluster nodes. + location: Datacenter location code (default: Locations.FIN_03). + startup_script_id: Optional ID of startup script to run on nodes. + max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 300) + initial_interval: Initial interval, in seconds (default: 1.0) + max_interval: The longest single delay allowed between retries, in seconds (default: 10) + backoff_coefficient: Coefficient to calculate the next retry interval (default 2.0) + + Returns: + The newly created cluster object. + + Raises: + HTTPError: If cluster creation fails or other API error occurs. + TimeoutError: If cluster does not start creating within max_wait_time. 
+ """ + payload = { + 'name': name, + 'instance_type': instance_type, + 'node_count': node_count, + 'image': image, + 'description': description, + 'ssh_key_ids': ssh_key_ids, + 'location_code': location, + 'startup_script_id': startup_script_id, + } + id = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).text + + # Wait for cluster to enter creating state with timeout + deadline = time.monotonic() + max_wait_time + for i in itertools.count(): + cluster = self.get_by_id(id) + if cluster.status != 'ordered': + return cluster + + now = time.monotonic() + if now >= deadline: + raise TimeoutError( + f'Cluster {id} did not enter creating state within {max_wait_time:.1f} seconds' + ) + + interval = min(initial_interval * backoff_coefficient**i, max_interval, deadline - now) + time.sleep(interval) + + def delete(self, id: str) -> None: + """Deletes a cluster and all its nodes. + + Args: + id: Unique identifier of the cluster to delete. + + Raises: + HTTPError: If the deletion fails or other API error occurs. + """ + self._http_client.delete(CLUSTERS_ENDPOINT + f'/{id}') + return + + def scale( + self, + id: str, + node_count: int, + ) -> Cluster: + """Scales a cluster to the specified number of nodes. + + Args: + id: Unique identifier of the cluster to scale. + node_count: Target number of nodes for the cluster. + + Returns: + Updated cluster object. + + Raises: + HTTPError: If the scaling fails or other API error occurs. + """ + payload = {'node_count': node_count} + self._http_client.put(CLUSTERS_ENDPOINT + f'/{id}/scale', json=payload) + return self.get_by_id(id) + + def get_nodes(self, id: str) -> list[ClusterNode]: + """Retrieves all nodes in a cluster. + + Args: + id: Unique identifier of the cluster. + + Returns: + List of nodes in the cluster. + + Raises: + HTTPError: If the cluster is not found or other API error occurs. 
+ """ + nodes_dict = self._http_client.get(CLUSTERS_ENDPOINT + f'/{id}/nodes').json() + return [ClusterNode.from_dict(node_dict, infer_missing=True) for node_dict in nodes_dict] diff --git a/verda/constants.py b/verda/constants.py index 70b789f..a339a42 100644 --- a/verda/constants.py +++ b/verda/constants.py @@ -56,6 +56,22 @@ def __init__(self): return +class ClusterStatus: + """Cluster status.""" + + ORDERED = 'ordered' + CREATING = 'creating' + RUNNING = 'running' + SCALING = 'scaling' + UPDATING = 'updating' + DELETING = 'deleting' + DELETED = 'deleted' + ERROR = 'error' + + def __init__(self): + return + + class VolumeTypes: """Storage volume types.""" @@ -110,6 +126,9 @@ def __init__(self, base_url, version): self.volume_status: VolumeStatus = VolumeStatus() """Possible volume statuses""" + self.cluster_status: ClusterStatus = ClusterStatus() + """Possible cluster statuses""" + self.volume_types: VolumeTypes = VolumeTypes() """Available volume types""" From d6b9918de5bb6bb5618447a2e8fba3d9e03d4020 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Fri, 19 Dec 2025 17:22:55 +0200 Subject: [PATCH 02/10] feat: clusters api --- examples/clusters_example.py | 71 ++++--------- tests/integration_tests/conftest.py | 2 +- tests/integration_tests/test_clusters.py | 42 ++++++++ verda/clusters/__init__.py | 4 +- verda/clusters/_clusters.py | 122 ++++++++++++----------- 5 files changed, 125 insertions(+), 116 deletions(-) create mode 100644 tests/integration_tests/test_clusters.py diff --git a/examples/clusters_example.py b/examples/clusters_example.py index 3ac6e3f..b0efdea 100644 --- a/examples/clusters_example.py +++ b/examples/clusters_example.py @@ -13,7 +13,7 @@ import os from verda import VerdaClient -from verda.constants import Locations +from verda.constants import Actions, Locations # Get credentials from environment variables CLIENT_ID = os.environ.get('VERDA_CLIENT_ID') @@ -30,20 +30,21 @@ def create_cluster_example(): # Create a cluster with 3 nodes 
cluster = verda.clusters.create( - name='my-compute-cluster', - instance_type='8V100.48V', - node_count=3, - image='ubuntu-24.04-cuda-12.8-open-docker', + hostname='my-compute-cluster', + cluster_type='16H200', + image='ubuntu-22.04-cuda-12.4-cluster', description='Example compute cluster for distributed training', ssh_key_ids=ssh_keys, location=Locations.FIN_03, + shared_volume_name='my-shared-volume', + shared_volume_size=30000, ) print(f'Created cluster: {cluster.id}') - print(f'Cluster name: {cluster.name}') + print(f'Cluster hostname: {cluster.hostname}') print(f'Cluster status: {cluster.status}') - print(f'Number of nodes: {cluster.node_count}') - print(f'Instance type: {cluster.instance_type}') + print(f'Cluster cluster_type: {cluster.cluster_type}') + print(f'Cluster worker_nodes: {cluster.worker_nodes}') print(f'Location: {cluster.location}') return cluster @@ -56,7 +57,9 @@ def list_clusters_example(): print(f'\nFound {len(clusters)} cluster(s):') for cluster in clusters: - print(f' - {cluster.name} ({cluster.id}): {cluster.status} - {cluster.node_count} nodes') + print( + f' - {cluster.hostname} ({cluster.id}): {cluster.status} - {len(cluster.worker_nodes)} nodes' + ) # Get clusters with specific status running_clusters = verda.clusters.get(status=verda.constants.cluster_status.RUNNING) @@ -71,45 +74,13 @@ def get_cluster_by_id_example(cluster_id: str): print('\nCluster details:') print(f' ID: {cluster.id}') - print(f' Name: {cluster.name}') + print(f' Name: {cluster.hostname}') print(f' Description: {cluster.description}') print(f' Status: {cluster.status}') - print(f' Instance type: {cluster.instance_type}') - print(f' Node count: {cluster.node_count}') + print(f' Cluster type: {cluster.cluster_type}') print(f' Created at: {cluster.created_at}') - if cluster.master_ip: - print(f' Master IP: {cluster.master_ip}') - if cluster.endpoint: - print(f' Endpoint: {cluster.endpoint}') - - return cluster - - -def get_cluster_nodes_example(cluster_id: str): - 
"""Get all nodes in a cluster.""" - nodes = verda.clusters.get_nodes(cluster_id) - - print(f'\nCluster has {len(nodes)} node(s):') - for i, node in enumerate(nodes, 1): - print(f'\n Node {i}:') - print(f' ID: {node.id}') - print(f' Hostname: {node.hostname}') - print(f' Status: {node.status}') - print(f' IP: {node.ip}') - print(f' Instance type: {node.instance_type}') - - return nodes - - -def scale_cluster_example(cluster_id: str, new_node_count: int): - """Scale a cluster to a new number of nodes.""" - print(f'\nScaling cluster {cluster_id} to {new_node_count} nodes...') - - cluster = verda.clusters.scale(cluster_id, new_node_count) - - print('Cluster scaled successfully') - print(f'Current node count: {cluster.node_count}') - print(f'Cluster status: {cluster.status}') + print(f' Public IP: {cluster.ip}') + print(f' Worker nodes: {len(cluster.worker_nodes)}') return cluster @@ -118,7 +89,7 @@ def delete_cluster_example(cluster_id: str): """Delete a cluster.""" print(f'\nDeleting cluster {cluster_id}...') - verda.clusters.delete(cluster_id) + verda.clusters.action(cluster_id, Actions.DELETE) print('Cluster deleted successfully') @@ -140,14 +111,6 @@ def main(): print('\n3. Getting cluster details...') get_cluster_by_id_example(cluster_id) - # Get cluster nodes - print('\n4. Getting cluster nodes...') - get_cluster_nodes_example(cluster_id) - - # Scale the cluster - print('\n5. Scaling the cluster...') - scale_cluster_example(cluster_id, 5) - # Delete the cluster print('\n6. 
Deleting the cluster...') delete_cluster_example(cluster_id) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 3f695ab..78a3260 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -9,7 +9,7 @@ Make sure to run the server and the account has enough balance before running the tests """ -BASE_URL = 'http://localhost:3010/v1' +BASE_URL = os.getenv('VERDA_BASE_URL', 'https://api.verda.com/v1') # Load env variables, make sure there's an env file with valid client credentials load_dotenv() diff --git a/tests/integration_tests/test_clusters.py b/tests/integration_tests/test_clusters.py new file mode 100644 index 0000000..904f711 --- /dev/null +++ b/tests/integration_tests/test_clusters.py @@ -0,0 +1,42 @@ +import os + +import pytest + +from verda import VerdaClient +from verda.constants import Locations + +IN_GITHUB_ACTIONS = os.getenv('GITHUB_ACTIONS') == 'true' + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="Test doesn't work in Github Actions.") +@pytest.mark.withoutresponses +class TestClusters: + def test_create_cluster(self, verda_client: VerdaClient): + # get ssh key + ssh_key = verda_client.ssh_keys.get()[0] + + # create instance + cluster = verda_client.clusters.create( + hostname='test-instance', + location=Locations.FIN_03, + cluster_type='16H200', + description='test instance', + image='ubuntu-22.04-cuda-12.4-cluster', + ssh_key_ids=[ssh_key.id], + ) + + # assert instance is created + assert cluster.id is not None + assert ( + cluster.status == verda_client.constants.instance_status.PROVISIONING + or cluster.status == verda_client.constants.instance_status.RUNNING + ) + + assert cluster.worker_nodes is not None + assert len(cluster.worker_nodes) == 2 + assert cluster.ip is not None + + print(cluster) + + # delete instance + # verda_client.clusters.action(cluster.id, 'delete') diff --git a/verda/clusters/__init__.py b/verda/clusters/__init__.py index ff07e58..7ffe932 100644 
--- a/verda/clusters/__init__.py +++ b/verda/clusters/__init__.py @@ -1,5 +1,5 @@ """Clusters service for managing compute clusters.""" -from verda.clusters._clusters import Cluster, ClusterNode, ClustersService +from verda.clusters._clusters import Cluster, ClusterWorkerNode, ClustersService -__all__ = ['Cluster', 'ClusterNode', 'ClustersService'] +__all__ = ['Cluster', 'ClusterWorkerNode', 'ClustersService'] diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index f0010d7..c0031c3 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -5,7 +5,7 @@ from dataclasses_json import dataclass_json -from verda.constants import Locations +from verda.constants import Actions, Locations CLUSTERS_ENDPOINT = '/clusters' @@ -16,24 +16,20 @@ @dataclass_json @dataclass -class ClusterNode: - """Represents a node in a cluster. +class ClusterWorkerNode: + """Represents a worker node in a cluster. Attributes: id: Unique identifier for the node (instance ID). - instance_type: Type of the instance for this node. status: Current status of the node. hostname: Network hostname of the node. - ip: IP address of the node. - created_at: Timestamp of node creation. + private_ip: Private IP address of the node. """ id: str - instance_type: str status: str hostname: str - ip: str | None = None - created_at: str | None = None + private_ip: str @dataclass_json @@ -43,35 +39,32 @@ class Cluster: Attributes: id: Unique identifier for the cluster. - name: Human-readable name of the cluster. + hostname: Human-readable hostname of the cluster. description: Description of the cluster. status: Current operational status of the cluster. created_at: Timestamp of cluster creation. location: Datacenter location code (default: Locations.FIN_03). - instance_type: Type of instances used for cluster nodes. - node_count: Number of nodes in the cluster. - nodes: List of nodes in the cluster. + cluster_type: Type of instances used for cluster nodes. 
+ worker_nodes: List of nodes in the cluster. ssh_key_ids: List of SSH key IDs associated with the cluster nodes. image: Image ID or type used for cluster nodes. startup_script_id: ID of the startup script to run on nodes. - master_ip: IP address of the cluster master/coordinator node. - endpoint: Cluster access endpoint. + public_ip: IP address of the jumphost. """ id: str - name: str + hostname: str description: str status: str created_at: str location: str - instance_type: str + cluster_type: str node_count: int - nodes: list[ClusterNode] + worker_nodes: list[ClusterWorkerNode] ssh_key_ids: list[str] image: str | None = None startup_script_id: str | None = None - master_ip: str | None = None - endpoint: str | None = None + ip: str | None = None class ClustersService: @@ -120,16 +113,17 @@ def get_by_id(self, id: str) -> Cluster: def create( self, - name: str, - instance_type: str, - node_count: int, + hostname: str, + cluster_type: str, image: str, description: str, ssh_key_ids: list = [], location: str = Locations.FIN_03, startup_script_id: str | None = None, + shared_volume_name: str | None = None, + shared_volume_size: int | None = None, *, - max_wait_time: float = 300, + max_wait_time: float = 900, initial_interval: float = 1.0, max_interval: float = 10, backoff_coefficient: float = 2.0, @@ -138,14 +132,15 @@ def create( Args: name: Name for the cluster. - instance_type: Type of instances to use for cluster nodes (e.g., '8V100.48V'). - node_count: Number of nodes to create in the cluster. + cluster_type: Cluster type. image: Image type or ID for cluster nodes. description: Human-readable description of the cluster. ssh_key_ids: List of SSH key IDs to associate with cluster nodes. location: Datacenter location code (default: Locations.FIN_03). startup_script_id: Optional ID of startup script to run on nodes. 
- max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 300) + shared_volume_name: Optional name for the shared volume. + shared_volume_size: Optional size for the shared volume, in GB, default to 30TB. + max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 900) initial_interval: Initial interval, in seconds (default: 1.0) max_interval: The longest single delay allowed between retries, in seconds (default: 10) backoff_coefficient: Coefficient to calculate the next retry interval (default 2.0) @@ -158,16 +153,21 @@ def create( TimeoutError: If cluster does not start creating within max_wait_time. """ payload = { - 'name': name, - 'instance_type': instance_type, - 'node_count': node_count, + 'hostname': hostname, + 'cluster_type': cluster_type, 'image': image, 'description': description, 'ssh_key_ids': ssh_key_ids, + 'contract': 'PAY_AS_YOU_GO', 'location_code': location, 'startup_script_id': startup_script_id, + 'shared_volume': { + 'name': shared_volume_name if shared_volume_name else hostname + '-shared-volume', + 'size': shared_volume_size if shared_volume_size else 30000, + }, } - id = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).text + response = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).json() + id = response['id'] # Wait for cluster to enter creating state with timeout deadline = time.monotonic() + max_wait_time @@ -185,50 +185,54 @@ def create( interval = min(initial_interval * backoff_coefficient**i, max_interval, deadline - now) time.sleep(interval) - def delete(self, id: str) -> None: - """Deletes a cluster and all its nodes. + def action(self, id_list: list[str] | str, action: str) -> None: + """Performs an action on one or more instances. Args: - id: Unique identifier of the cluster to delete. + id_list: Single instance ID or list of instance IDs to act upon. + action: Action to perform on the clusters. Only `delete` is supported. 
Raises: - HTTPError: If the deletion fails or other API error occurs. + HTTPError: If the action fails or other API error occurs. """ - self._http_client.delete(CLUSTERS_ENDPOINT + f'/{id}') + if type(id_list) is str: + id_list = [id_list] + + if action == Actions.DELETE: + payload = {'id': id_list, 'action': 'discontinue'} + else: + raise ValueError(f'Invalid action: {action}. Only DELETE is supported.') + + self._http_client.put(CLUSTERS_ENDPOINT, json=payload) return - def scale( + def is_available( self, - id: str, - node_count: int, - ) -> Cluster: - """Scales a cluster to the specified number of nodes. + cluster_type: str, + location_code: str | None = None, + ) -> bool: + """Checks if a specific instance type is available for deployment. Args: - id: Unique identifier of the cluster to scale. - node_count: Target number of nodes for the cluster. + cluster_type: Type of instance to check availability for. + location_code: Optional datacenter location code. Returns: - Updated cluster object. - - Raises: - HTTPError: If the scaling fails or other API error occurs. + True if the instance type is available, False otherwise. """ - payload = {'node_count': node_count} - self._http_client.put(CLUSTERS_ENDPOINT + f'/{id}/scale', json=payload) - return self.get_by_id(id) + is_spot = str(is_spot).lower() + query_params = {'location_code': location_code} + url = f'/cluster-availability/{cluster_type}' + return self._http_client.get(url, query_params).json() - def get_nodes(self, id: str) -> list[ClusterNode]: - """Retrieves all nodes in a cluster. + def get_availabilities(self, location_code: str | None = None) -> list[dict]: + """Retrieves a list of available cluster types across locations. Args: - id: Unique identifier of the cluster. + location_code: Optional datacenter location code to filter by. Returns: - List of nodes in the cluster. - - Raises: - HTTPError: If the cluster is not found or other API error occurs. + List of available cluster types and their details. 
""" - nodes_dict = self._http_client.get(CLUSTERS_ENDPOINT + f'/{id}/nodes').json() - return [ClusterNode.from_dict(node_dict, infer_missing=True) for node_dict in nodes_dict] + query_params = {'location_code': location_code} + return self._http_client.get('/cluster-availability', params=query_params).json() From 814d02eed449f02e4cebc87371eed889b5238243 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Fri, 19 Dec 2025 17:47:33 +0200 Subject: [PATCH 03/10] fix: unit test --- tests/unit_tests/clusters/test_clusters.py | 226 +++------------------ 1 file changed, 27 insertions(+), 199 deletions(-) diff --git a/tests/unit_tests/clusters/test_clusters.py b/tests/unit_tests/clusters/test_clusters.py index 9211fc4..71d7e03 100644 --- a/tests/unit_tests/clusters/test_clusters.py +++ b/tests/unit_tests/clusters/test_clusters.py @@ -1,73 +1,57 @@ import pytest import responses # https://github.com/getsentry/responses -from verda.clusters import Cluster, ClusterNode, ClustersService +from verda.clusters import Cluster, ClusterWorkerNode, ClustersService from verda.constants import ErrorCodes, Locations from verda.exceptions import APIException INVALID_REQUEST = ErrorCodes.INVALID_REQUEST -INVALID_REQUEST_MESSAGE = 'Invalid cluster request' +INVALID_REQUEST_MESSAGE = 'Invalid request' CLUSTER_ID = 'deadc0de-a5d2-4972-ae4e-d429115d055b' SSH_KEY_ID = '12345dc1-a5d2-4972-ae4e-d429115d055b' -CLUSTER_NAME = 'test-cluster' +CLUSTER_HOSTNAME = 'test-cluster' CLUSTER_DESCRIPTION = 'Test compute cluster' CLUSTER_STATUS = 'running' -CLUSTER_INSTANCE_TYPE = '8V100.48V' -CLUSTER_NODE_COUNT = 3 -CLUSTER_LOCATION = Locations.FIN_01 -CLUSTER_IMAGE = 'ubuntu-24.04-cuda-12.8-open-docker' +CLUSTER_CLUSTER_TYPE = '16H200' +CLUSTER_NODE_COUNT = 2 +CLUSTER_LOCATION = Locations.FIN_03 +CLUSTER_IMAGE = 'ubuntu-22.04-cuda-12.4-cluster' CLUSTER_CREATED_AT = '2024-01-01T00:00:00Z' -CLUSTER_MASTER_IP = '10.0.0.1' -CLUSTER_ENDPOINT = 'cluster-endpoint.verda.com' +CLUSTER_IP = '10.0.0.1' NODE_1_ID = 
'node1-c0de-a5d2-4972-ae4e-d429115d055b' NODE_2_ID = 'node2-c0de-a5d2-4972-ae4e-d429115d055b' -NODE_3_ID = 'node3-c0de-a5d2-4972-ae4e-d429115d055b' NODES_PAYLOAD = [ { 'id': NODE_1_ID, - 'instance_type': CLUSTER_INSTANCE_TYPE, 'status': 'running', 'hostname': 'test-cluster-node-1', - 'ip': '10.0.0.2', - 'created_at': CLUSTER_CREATED_AT, + 'private_ip': '10.0.0.1', }, { 'id': NODE_2_ID, - 'instance_type': CLUSTER_INSTANCE_TYPE, 'status': 'running', 'hostname': 'test-cluster-node-2', - 'ip': '10.0.0.3', - 'created_at': CLUSTER_CREATED_AT, - }, - { - 'id': NODE_3_ID, - 'instance_type': CLUSTER_INSTANCE_TYPE, - 'status': 'running', - 'hostname': 'test-cluster-node-3', - 'ip': '10.0.0.4', - 'created_at': CLUSTER_CREATED_AT, + 'private_ip': '10.0.0.2', }, ] CLUSTER_PAYLOAD = [ { 'id': CLUSTER_ID, - 'name': CLUSTER_NAME, + 'hostname': CLUSTER_HOSTNAME, 'description': CLUSTER_DESCRIPTION, 'status': CLUSTER_STATUS, 'created_at': CLUSTER_CREATED_AT, 'location': CLUSTER_LOCATION, - 'instance_type': CLUSTER_INSTANCE_TYPE, - 'node_count': CLUSTER_NODE_COUNT, - 'nodes': NODES_PAYLOAD, + 'cluster_type': CLUSTER_CLUSTER_TYPE, + 'worker_nodes': NODES_PAYLOAD, 'ssh_key_ids': [SSH_KEY_ID], 'image': CLUSTER_IMAGE, - 'master_ip': CLUSTER_MASTER_IP, - 'endpoint': CLUSTER_ENDPOINT, + 'ip': CLUSTER_IP, } ] @@ -94,95 +78,20 @@ def test_get_clusters(self, clusters_service, endpoint): assert len(clusters) == 1 assert isinstance(cluster, Cluster) assert cluster.id == CLUSTER_ID - assert cluster.name == CLUSTER_NAME + assert cluster.hostname == CLUSTER_HOSTNAME assert cluster.description == CLUSTER_DESCRIPTION assert cluster.status == CLUSTER_STATUS assert cluster.created_at == CLUSTER_CREATED_AT assert cluster.location == CLUSTER_LOCATION - assert cluster.instance_type == CLUSTER_INSTANCE_TYPE - assert cluster.node_count == CLUSTER_NODE_COUNT - assert isinstance(cluster.nodes, list) - assert len(cluster.nodes) == CLUSTER_NODE_COUNT - assert isinstance(cluster.nodes[0], ClusterNode) + assert 
cluster.cluster_type == CLUSTER_CLUSTER_TYPE + assert isinstance(cluster.worker_nodes, list) + assert len(cluster.worker_nodes) == CLUSTER_NODE_COUNT + assert isinstance(cluster.worker_nodes[0], ClusterWorkerNode) assert cluster.ssh_key_ids == [SSH_KEY_ID] assert cluster.image == CLUSTER_IMAGE - assert cluster.master_ip == CLUSTER_MASTER_IP - assert cluster.endpoint == CLUSTER_ENDPOINT + assert cluster.ip == CLUSTER_IP assert responses.assert_call_count(endpoint, 1) is True - def test_get_clusters_by_status_successful(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '?status=running' - responses.add(responses.GET, url, json=CLUSTER_PAYLOAD, status=200) - - # act - clusters = clusters_service.get(status='running') - cluster = clusters[0] - - # assert - assert isinstance(clusters, list) - assert len(clusters) == 1 - assert isinstance(cluster, Cluster) - assert cluster.id == CLUSTER_ID - assert cluster.status == CLUSTER_STATUS - assert responses.assert_call_count(url, 1) is True - - def test_get_clusters_by_status_failed(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '?status=invalid_status' - responses.add( - responses.GET, - url, - json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, - status=400, - ) - - # act - with pytest.raises(APIException) as excinfo: - clusters_service.get(status='invalid_status') - - # assert - assert excinfo.value.code == INVALID_REQUEST - assert excinfo.value.message == INVALID_REQUEST_MESSAGE - assert responses.assert_call_count(url, 1) is True - - def test_get_cluster_by_id_successful(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/' + CLUSTER_ID - responses.add(responses.GET, url, json=CLUSTER_PAYLOAD[0], status=200) - - # act - cluster = clusters_service.get_by_id(CLUSTER_ID) - - # assert - assert isinstance(cluster, Cluster) - assert cluster.id == CLUSTER_ID - assert cluster.name == CLUSTER_NAME - assert 
cluster.description == CLUSTER_DESCRIPTION - assert cluster.status == CLUSTER_STATUS - assert cluster.instance_type == CLUSTER_INSTANCE_TYPE - assert cluster.node_count == CLUSTER_NODE_COUNT - assert responses.assert_call_count(url, 1) is True - - def test_get_cluster_by_id_failed(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/invalid_id' - responses.add( - responses.GET, - url, - json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, - status=400, - ) - - # act - with pytest.raises(APIException) as excinfo: - clusters_service.get_by_id('invalid_id') - - # assert - assert excinfo.value.code == INVALID_REQUEST - assert excinfo.value.message == INVALID_REQUEST_MESSAGE - assert responses.assert_call_count(url, 1) is True - def test_create_cluster_successful(self, clusters_service, endpoint): # arrange - add response mock # create cluster @@ -193,9 +102,8 @@ def test_create_cluster_successful(self, clusters_service, endpoint): # act cluster = clusters_service.create( - name=CLUSTER_NAME, - instance_type=CLUSTER_INSTANCE_TYPE, - node_count=CLUSTER_NODE_COUNT, + hostname=CLUSTER_HOSTNAME, + cluster_type=CLUSTER_CLUSTER_TYPE, image=CLUSTER_IMAGE, description=CLUSTER_DESCRIPTION, ssh_key_ids=[SSH_KEY_ID], @@ -205,10 +113,10 @@ def test_create_cluster_successful(self, clusters_service, endpoint): # assert assert isinstance(cluster, Cluster) assert cluster.id == CLUSTER_ID - assert cluster.name == CLUSTER_NAME + assert cluster.hostname == CLUSTER_HOSTNAME assert cluster.description == CLUSTER_DESCRIPTION assert cluster.status == CLUSTER_STATUS - assert cluster.instance_type == CLUSTER_INSTANCE_TYPE + assert cluster.cluster_type == CLUSTER_CLUSTER_TYPE assert cluster.node_count == CLUSTER_NODE_COUNT assert cluster.ssh_key_ids == [SSH_KEY_ID] assert cluster.location == CLUSTER_LOCATION @@ -228,8 +136,8 @@ def test_create_cluster_failed(self, clusters_service, endpoint): # act with pytest.raises(APIException) as excinfo: 
clusters_service.create( - name=CLUSTER_NAME, - instance_type=CLUSTER_INSTANCE_TYPE, + name=CLUSTER_HOSTNAME, + cluster_type=CLUSTER_CLUSTER_TYPE, node_count=CLUSTER_NODE_COUNT, image=CLUSTER_IMAGE, description=CLUSTER_DESCRIPTION, @@ -246,7 +154,7 @@ def test_delete_cluster_successful(self, clusters_service, endpoint): responses.add(responses.DELETE, url, status=202) # act - result = clusters_service.delete(CLUSTER_ID) + result = clusters_service.action(CLUSTER_ID, 'delete') # assert assert result is None @@ -271,83 +179,3 @@ def test_delete_cluster_failed(self, clusters_service, endpoint): assert excinfo.value.message == INVALID_REQUEST_MESSAGE assert responses.assert_call_count(url, 1) is True - def test_scale_cluster_successful(self, clusters_service, endpoint): - # arrange - add response mock - new_node_count = 5 - scaled_payload = CLUSTER_PAYLOAD[0].copy() - scaled_payload['node_count'] = new_node_count - - # scale endpoint - scale_url = endpoint + '/' + CLUSTER_ID + '/scale' - responses.add(responses.PUT, scale_url, status=200) - - # get cluster by id - get_url = endpoint + '/' + CLUSTER_ID - responses.add(responses.GET, get_url, json=scaled_payload, status=200) - - # act - cluster = clusters_service.scale(CLUSTER_ID, new_node_count) - - # assert - assert isinstance(cluster, Cluster) - assert cluster.id == CLUSTER_ID - assert cluster.node_count == new_node_count - assert responses.assert_call_count(scale_url, 1) is True - assert responses.assert_call_count(get_url, 1) is True - - def test_scale_cluster_failed(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/' + CLUSTER_ID + '/scale' - responses.add( - responses.PUT, - url, - json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, - status=400, - ) - - # act - with pytest.raises(APIException) as excinfo: - clusters_service.scale(CLUSTER_ID, 5) - - # assert - assert excinfo.value.code == INVALID_REQUEST - assert excinfo.value.message == INVALID_REQUEST_MESSAGE 
- assert responses.assert_call_count(url, 1) is True - - def test_get_cluster_nodes_successful(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/' + CLUSTER_ID + '/nodes' - responses.add(responses.GET, url, json=NODES_PAYLOAD, status=200) - - # act - nodes = clusters_service.get_nodes(CLUSTER_ID) - - # assert - assert isinstance(nodes, list) - assert len(nodes) == CLUSTER_NODE_COUNT - assert isinstance(nodes[0], ClusterNode) - assert nodes[0].id == NODE_1_ID - assert nodes[0].instance_type == CLUSTER_INSTANCE_TYPE - assert nodes[0].status == 'running' - assert nodes[1].id == NODE_2_ID - assert nodes[2].id == NODE_3_ID - assert responses.assert_call_count(url, 1) is True - - def test_get_cluster_nodes_failed(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/invalid_id/nodes' - responses.add( - responses.GET, - url, - json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, - status=400, - ) - - # act - with pytest.raises(APIException) as excinfo: - clusters_service.get_nodes('invalid_id') - - # assert - assert excinfo.value.code == INVALID_REQUEST - assert excinfo.value.message == INVALID_REQUEST_MESSAGE - assert responses.assert_call_count(url, 1) is True From 99668e8c7d35ce7083deddc7c6c9d5bb34162afc Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Fri, 19 Dec 2025 17:55:03 +0200 Subject: [PATCH 04/10] fix: polishing --- verda/clusters/_clusters.py | 6 +----- verda/constants.py | 7 ++----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index c0031c3..1f1011d 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -9,9 +9,6 @@ CLUSTERS_ENDPOINT = '/clusters' -ClusterStatus = Literal[ - 'creating', 'running', 'scaling', 'updating', 'deleting', 'deleted', 'error' -] @dataclass_json @@ -131,7 +128,7 @@ def create( """Creates and deploys a new compute cluster. 
Args: - name: Name for the cluster. + hostname: Name for the cluster. cluster_type: Cluster type. image: Image type or ID for cluster nodes. description: Human-readable description of the cluster. @@ -220,7 +217,6 @@ def is_available( Returns: True if the instance type is available, False otherwise. """ - is_spot = str(is_spot).lower() query_params = {'location_code': location_code} url = f'/cluster-availability/{cluster_type}' return self._http_client.get(url, query_params).json() diff --git a/verda/constants.py b/verda/constants.py index a339a42..777e7a7 100644 --- a/verda/constants.py +++ b/verda/constants.py @@ -60,12 +60,9 @@ class ClusterStatus: """Cluster status.""" ORDERED = 'ordered' - CREATING = 'creating' + PROVISIONING = 'provisioning' RUNNING = 'running' - SCALING = 'scaling' - UPDATING = 'updating' - DELETING = 'deleting' - DELETED = 'deleted' + DISCONTINUED = 'discontinued' ERROR = 'error' def __init__(self): From 7f86615c1cd8117e79e1a7dac5d8939329d67329 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 11:08:13 +0200 Subject: [PATCH 05/10] fix: format, lint and unit test fixing --- tests/integration_tests/conftest.py | 2 +- tests/unit_tests/clusters/test_clusters.py | 25 +++++++++++----------- verda/clusters/__init__.py | 2 +- verda/clusters/_clusters.py | 10 ++++++++- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 78a3260..3f695ab 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -9,7 +9,7 @@ Make sure to run the server and the account has enough balance before running the tests """ -BASE_URL = os.getenv('VERDA_BASE_URL', 'https://api.verda.com/v1') +BASE_URL = 'http://localhost:3010/v1' # Load env variables, make sure there's an env file with valid client credentials load_dotenv() diff --git a/tests/unit_tests/clusters/test_clusters.py b/tests/unit_tests/clusters/test_clusters.py index 
71d7e03..d080259 100644 --- a/tests/unit_tests/clusters/test_clusters.py +++ b/tests/unit_tests/clusters/test_clusters.py @@ -1,7 +1,7 @@ import pytest import responses # https://github.com/getsentry/responses -from verda.clusters import Cluster, ClusterWorkerNode, ClustersService +from verda.clusters import Cluster, ClustersService, ClusterWorkerNode from verda.constants import ErrorCodes, Locations from verda.exceptions import APIException @@ -95,7 +95,7 @@ def test_get_clusters(self, clusters_service, endpoint): def test_create_cluster_successful(self, clusters_service, endpoint): # arrange - add response mock # create cluster - responses.add(responses.POST, endpoint, body=CLUSTER_ID, status=200) + responses.add(responses.POST, endpoint, json={'id': CLUSTER_ID}, status=200) # get cluster by id url = endpoint + '/' + CLUSTER_ID responses.add(responses.GET, url, json=CLUSTER_PAYLOAD[0], status=200) @@ -117,7 +117,7 @@ def test_create_cluster_successful(self, clusters_service, endpoint): assert cluster.description == CLUSTER_DESCRIPTION assert cluster.status == CLUSTER_STATUS assert cluster.cluster_type == CLUSTER_CLUSTER_TYPE - assert cluster.node_count == CLUSTER_NODE_COUNT + assert len(cluster.worker_nodes) == CLUSTER_NODE_COUNT assert cluster.ssh_key_ids == [SSH_KEY_ID] assert cluster.location == CLUSTER_LOCATION assert cluster.image == CLUSTER_IMAGE @@ -136,11 +136,12 @@ def test_create_cluster_failed(self, clusters_service, endpoint): # act with pytest.raises(APIException) as excinfo: clusters_service.create( - name=CLUSTER_HOSTNAME, + hostname=CLUSTER_HOSTNAME, cluster_type=CLUSTER_CLUSTER_TYPE, - node_count=CLUSTER_NODE_COUNT, image=CLUSTER_IMAGE, description=CLUSTER_DESCRIPTION, + ssh_key_ids=[SSH_KEY_ID], + location=CLUSTER_LOCATION, ) # assert @@ -150,11 +151,11 @@ def test_create_cluster_failed(self, clusters_service, endpoint): def test_delete_cluster_successful(self, clusters_service, endpoint): # arrange - add response mock - url = endpoint + '/' + 
CLUSTER_ID - responses.add(responses.DELETE, url, status=202) + url = endpoint + responses.add(responses.PUT, url, status=202) # act - result = clusters_service.action(CLUSTER_ID, 'delete') + result = clusters_service.delete(CLUSTER_ID) # assert assert result is None @@ -162,10 +163,9 @@ def test_delete_cluster_successful(self, clusters_service, endpoint): def test_delete_cluster_failed(self, clusters_service, endpoint): # arrange - add response mock - url = endpoint + '/invalid_id' responses.add( - responses.DELETE, - url, + responses.PUT, + endpoint, json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, status=400, ) @@ -177,5 +177,4 @@ def test_delete_cluster_failed(self, clusters_service, endpoint): # assert assert excinfo.value.code == INVALID_REQUEST assert excinfo.value.message == INVALID_REQUEST_MESSAGE - assert responses.assert_call_count(url, 1) is True - + assert responses.assert_call_count(endpoint, 1) is True diff --git a/verda/clusters/__init__.py b/verda/clusters/__init__.py index 7ffe932..849bf71 100644 --- a/verda/clusters/__init__.py +++ b/verda/clusters/__init__.py @@ -1,5 +1,5 @@ """Clusters service for managing compute clusters.""" -from verda.clusters._clusters import Cluster, ClusterWorkerNode, ClustersService +from verda.clusters._clusters import Cluster, ClustersService, ClusterWorkerNode __all__ = ['Cluster', 'ClusterWorkerNode', 'ClustersService'] diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index 1f1011d..6fe7037 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -10,7 +10,6 @@ CLUSTERS_ENDPOINT = '/clusters' - @dataclass_json @dataclass class ClusterWorkerNode: @@ -203,6 +202,15 @@ def action(self, id_list: list[str] | str, action: str) -> None: self._http_client.put(CLUSTERS_ENDPOINT, json=payload) return + def delete(self, cluster_id: str) -> None: + """Deletes a cluster. + + Args: + cluster_id: ID of the cluster to delete. 
+ """ + self.action(cluster_id, 'delete') + return + def is_available( self, cluster_type: str, From 471e0894385bf51dab228326c4af5b16c533d34a Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 11:59:24 +0200 Subject: [PATCH 06/10] fix: integration tests --- tests/integration_tests/conftest.py | 3 +-- tests/integration_tests/test_clusters.py | 12 +++++++----- verda/clusters/_clusters.py | 1 - 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 3f695ab..28d36fb 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -9,12 +9,11 @@ Make sure to run the server and the account has enough balance before running the tests """ -BASE_URL = 'http://localhost:3010/v1' - # Load env variables, make sure there's an env file with valid client credentials load_dotenv() CLIENT_SECRET = os.getenv('VERDA_CLIENT_SECRET') CLIENT_ID = os.getenv('VERDA_CLIENT_ID') +BASE_URL = os.getenv('VERDA_BASE_URL', 'http://localhost:3010/v1') @pytest.fixture diff --git a/tests/integration_tests/test_clusters.py b/tests/integration_tests/test_clusters.py index 904f711..3e098af 100644 --- a/tests/integration_tests/test_clusters.py +++ b/tests/integration_tests/test_clusters.py @@ -19,9 +19,9 @@ def test_create_cluster(self, verda_client: VerdaClient): cluster = verda_client.clusters.create( hostname='test-instance', location=Locations.FIN_03, - cluster_type='16H200', + cluster_type='16B200', description='test instance', - image='ubuntu-22.04-cuda-12.4-cluster', + image='ubuntu-22.04-cuda-12.8-cluster', ssh_key_ids=[ssh_key.id], ) @@ -32,9 +32,11 @@ def test_create_cluster(self, verda_client: VerdaClient): or cluster.status == verda_client.constants.instance_status.RUNNING ) - assert cluster.worker_nodes is not None - assert len(cluster.worker_nodes) == 2 - assert cluster.ip is not None + # If still provisioning, we don't have worker nodes yet and ip is 
not available + if cluster.status != verda_client.constants.instance_status.PROVISIONING: + assert cluster.worker_nodes is not None + assert len(cluster.worker_nodes) == 2 + assert cluster.ip is not None print(cluster) diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index 6fe7037..7ba684b 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -55,7 +55,6 @@ class Cluster: created_at: str location: str cluster_type: str - node_count: int worker_nodes: list[ClusterWorkerNode] ssh_key_ids: list[str] image: str | None = None From f00901096baea33a813a0ec5b1cd170b3a259f68 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 17:00:02 +0200 Subject: [PATCH 07/10] fix: review fixes --- examples/clusters_example.py | 18 ++++-- tests/integration_tests/test_clusters.py | 36 ++++++++++-- verda/clusters/_clusters.py | 74 +++++++++++++++++++----- 3 files changed, 104 insertions(+), 24 deletions(-) diff --git a/examples/clusters_example.py b/examples/clusters_example.py index b0efdea..8f7696e 100644 --- a/examples/clusters_example.py +++ b/examples/clusters_example.py @@ -28,11 +28,20 @@ def create_cluster_example(): # Get SSH keys ssh_keys = [key.id for key in verda.ssh_keys.get()] - # Create a cluster with 3 nodes + # Check if cluster type is available + if not verda.clusters.is_available('16B200', Locations.FIN_03): + raise ValueError('Cluster type 16B200 is not available in FIN_03') + + # Get available images for cluster type + images = verda.clusters.get_cluster_images('16B200') + if 'ubuntu-22.04-cuda-12.9-cluster' not in images: + raise ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200') + + # Create a 16B200 cluster cluster = verda.clusters.create( hostname='my-compute-cluster', - cluster_type='16H200', - image='ubuntu-22.04-cuda-12.4-cluster', + cluster_type='16B200', + image='ubuntu-22.04-cuda-12.9-cluster', description='Example compute cluster for distributed training', 
ssh_key_ids=ssh_keys, location=Locations.FIN_03, @@ -40,11 +49,10 @@ def create_cluster_example(): shared_volume_size=30000, ) - print(f'Created cluster: {cluster.id}') + print(f'Creating cluster: {cluster.id}') print(f'Cluster hostname: {cluster.hostname}') print(f'Cluster status: {cluster.status}') print(f'Cluster cluster_type: {cluster.cluster_type}') - print(f'Cluster worker_nodes: {cluster.worker_nodes}') print(f'Location: {cluster.location}') return cluster diff --git a/tests/integration_tests/test_clusters.py b/tests/integration_tests/test_clusters.py index 3e098af..ec11e0c 100644 --- a/tests/integration_tests/test_clusters.py +++ b/tests/integration_tests/test_clusters.py @@ -1,3 +1,4 @@ +import logging import os import pytest @@ -5,6 +6,10 @@ from verda import VerdaClient from verda.constants import Locations +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger() + + IN_GITHUB_ACTIONS = os.getenv('GITHUB_ACTIONS') == 'true' @@ -15,21 +20,40 @@ def test_create_cluster(self, verda_client: VerdaClient): # get ssh key ssh_key = verda_client.ssh_keys.get()[0] + if not verda_client.clusters.is_available('16B200', Locations.FIN_03): + raise ValueError('Cluster type 16B200 is not available in FIN_03') + logger.debug('[x] Cluster type 16B200 is available in FIN_03') + + availabilities = verda_client.clusters.get_availabilities(Locations.FIN_03) + assert len(availabilities) > 0 + assert '16B200' in availabilities + logger.debug( + '[x] Cluster type 16B200 is one of the available cluster types in FIN_03: %s', + availabilities, + ) + + images = verda_client.clusters.get_cluster_images('16B200') + assert len(images) > 0 + assert 'ubuntu-22.04-cuda-12.9-cluster' in images + logger.debug('[x] Ubuntu 22.04 CUDA 12.9 cluster image is supported for 16B200') + # create instance cluster = verda_client.clusters.create( hostname='test-instance', location=Locations.FIN_03, cluster_type='16B200', description='test instance', - 
image='ubuntu-22.04-cuda-12.8-cluster', + image='ubuntu-22.04-cuda-12.9-cluster', ssh_key_ids=[ssh_key.id], + # Set to None to not wait for provisioning but return immediately + wait_for_status=verda_client.constants.cluster_status.PROVISIONING, ) # assert instance is created assert cluster.id is not None assert ( - cluster.status == verda_client.constants.instance_status.PROVISIONING - or cluster.status == verda_client.constants.instance_status.RUNNING + cluster.status == verda_client.constants.cluster_status.PROVISIONING + or cluster.status == verda_client.constants.cluster_status.RUNNING ) # If still provisioning, we don't have worker nodes yet and ip is not available @@ -38,7 +62,11 @@ def test_create_cluster(self, verda_client: VerdaClient): assert len(cluster.worker_nodes) == 2 assert cluster.ip is not None - print(cluster) + print(f'Creating cluster: {cluster.id}') + print(f'Cluster hostname: {cluster.hostname}') + print(f'Cluster status: {cluster.status}') + print(f'Cluster cluster_type: {cluster.cluster_type}') + print(f'Location: {cluster.location}') # delete instance # verda_client.clusters.action(cluster.id, 'delete') diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index 7ba684b..1c6cc97 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -1,14 +1,17 @@ import itertools import time from dataclasses import dataclass -from typing import Literal from dataclasses_json import dataclass_json -from verda.constants import Actions, Locations +from verda.constants import Actions, ClusterStatus, ErrorCodes, Locations +from verda.exceptions import APIException CLUSTERS_ENDPOINT = '/clusters' +# Default shared volume size is 30TB +DEFAULT_SHARED_VOLUME_SIZE = 30000 + @dataclass_json @dataclass @@ -16,7 +19,7 @@ class ClusterWorkerNode: """Represents a worker node in a cluster. Attributes: - id: Unique identifier for the node (instance ID). + id: Unique identifier for the node. status: Current status of the node. 
hostname: Network hostname of the node. private_ip: Private IP address of the node. @@ -40,7 +43,7 @@ class Cluster: status: Current operational status of the cluster. created_at: Timestamp of cluster creation. location: Datacenter location code (default: Locations.FIN_03). - cluster_type: Type of instances used for cluster nodes. + cluster_type: Type of the cluster. worker_nodes: List of nodes in the cluster. ssh_key_ids: List of SSH key IDs associated with the cluster nodes. image: Image ID or type used for cluster nodes. @@ -65,7 +68,7 @@ class Cluster: class ClustersService: """Service for managing compute clusters through the API. - This service provides methods to create, retrieve, scale, and manage compute clusters. + This service provides methods to create, retrieve, and manage compute clusters. """ def __init__(self, http_client) -> None: @@ -118,6 +121,7 @@ def create( shared_volume_name: str | None = None, shared_volume_size: int | None = None, *, + wait_for_status: str | None = ClusterStatus.PROVISIONING, max_wait_time: float = 900, initial_interval: float = 1.0, max_interval: float = 10, @@ -135,6 +139,7 @@ def create( startup_script_id: Optional ID of startup script to run on nodes. shared_volume_name: Optional name for the shared volume. shared_volume_size: Optional size for the shared volume, in GB, default to 30TB. + wait_for_status: Status to wait for the cluster to reach, default to PROVISIONING. If None, no wait is performed. 
max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 900) initial_interval: Initial interval, in seconds (default: 1.0) max_interval: The longest single delay allowed between retries, in seconds (default: 10) @@ -158,19 +163,28 @@ def create( 'startup_script_id': startup_script_id, 'shared_volume': { 'name': shared_volume_name if shared_volume_name else hostname + '-shared-volume', - 'size': shared_volume_size if shared_volume_size else 30000, + 'size': shared_volume_size if shared_volume_size else DEFAULT_SHARED_VOLUME_SIZE, }, } response = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).json() id = response['id'] + if not wait_for_status: + return self.get_by_id(id) + # Wait for cluster to enter creating state with timeout deadline = time.monotonic() + max_wait_time for i in itertools.count(): cluster = self.get_by_id(id) - if cluster.status != 'ordered': + if cluster.status == wait_for_status: return cluster + if cluster.status == ClusterStatus.ERROR: + raise APIException(ErrorCodes.SERVER_ERROR, f'Cluster {id} entered error state') + + if cluster.status == ClusterStatus.DISCONTINUED: + raise APIException(ErrorCodes.SERVER_ERROR, f'Cluster {id} was discontinued') + now = time.monotonic() if now >= deadline: raise TimeoutError( @@ -181,10 +195,10 @@ def create( time.sleep(interval) def action(self, id_list: list[str] | str, action: str) -> None: - """Performs an action on one or more instances. + """Performs an action on one or more clusters. Args: - id_list: Single instance ID or list of instance IDs to act upon. + id_list: Single cluster ID or list of cluster IDs to act upon. action: Action to perform on the clusters. Only `delete` is supported. Raises: @@ -215,20 +229,21 @@ def is_available( cluster_type: str, location_code: str | None = None, ) -> bool: - """Checks if a specific instance type is available for deployment. + """Checks if a specific cluster type is available for deployment. 
Args:
-            cluster_type: Type of instance to check availability for.
+            cluster_type: Type of cluster to check availability for.
             location_code: Optional datacenter location code.
 
         Returns:
-            True if the instance type is available, False otherwise.
+            True if the cluster type is available, False otherwise.
         """
         query_params = {'location_code': location_code}
         url = f'/cluster-availability/{cluster_type}'
-        return self._http_client.get(url, query_params).json()
+        response = self._http_client.get(url, query_params).text
+        return response == 'true'
 
-    def get_availabilities(self, location_code: str | None = None) -> list[dict]:
+    def get_availabilities(self, location_code: str | None = None) -> list[str]:
         """Retrieves a list of available cluster types across locations.
 
         Args:
@@ -238,4 +253,33 @@
             List of available cluster types and their details.
         """
         query_params = {'location_code': location_code}
-        return self._http_client.get('/cluster-availability', params=query_params).json()
+        response = self._http_client.get('/cluster-availability', params=query_params).json()
+        availabilities = response[0]['availabilities']
+        return availabilities
+
+    def get_availability(self, cluster_type: str, location_code: str | None = None) -> list[dict]:
+        """Checks availability of a specific cluster type (unimplemented stub).
+
+        Args:
+            cluster_type: Type of cluster to check availability for.
+            location_code: Optional datacenter location code.
+
+        Returns:
+            None — body not implemented yet; use ``is_available`` instead.
+        """
+
+    def get_cluster_images(
+        self,
+        cluster_type: str | None = None,
+    ) -> list[str]:
+        """Retrieves a list of available images for a given cluster type (optional).
+
+        Args:
+            cluster_type: Type of cluster to get images for.
+
+        Returns:
+            List of available images for the given cluster type.
+ """ + query_params = {'instance_type': cluster_type} + images = self._http_client.get('/images/cluster', params=query_params).json() + return [image['image_type'] for image in images] From d2c3a042c912bc4ff3d94d05669edbef5822ab53 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 17:37:53 +0200 Subject: [PATCH 08/10] fix: full features cluster example, add integration test --- examples/clusters_example.py | 19 +++++++++++++++---- tests/integration_tests/test_clusters.py | 13 +++++-------- verda/clusters/_clusters.py | 10 +++++----- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/examples/clusters_example.py b/examples/clusters_example.py index 8f7696e..9c76149 100644 --- a/examples/clusters_example.py +++ b/examples/clusters_example.py @@ -6,11 +6,11 @@ - List all clusters - Get a specific cluster by ID - Get cluster nodes -- Scale a cluster - Delete a cluster """ import os +import time from verda import VerdaClient from verda.constants import Actions, Locations @@ -18,9 +18,10 @@ # Get credentials from environment variables CLIENT_ID = os.environ.get('VERDA_CLIENT_ID') CLIENT_SECRET = os.environ.get('VERDA_CLIENT_SECRET') +BASE_URL = os.environ.get('VERDA_BASE_URL', 'https://api.verda.com/v1') # Create client -verda = VerdaClient(CLIENT_ID, CLIENT_SECRET) +verda = VerdaClient(CLIENT_ID, CLIENT_SECRET, base_url=BASE_URL) def create_cluster_example(): @@ -34,19 +35,20 @@ def create_cluster_example(): # Get available images for cluster type images = verda.clusters.get_cluster_images('16B200') - if 'ubuntu-22.04-cuda-12.9-cluster' not in images: + if 'ubuntu-22.04-cuda-12.4-cluster' not in images: raise ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200') # Create a 16B200 cluster cluster = verda.clusters.create( hostname='my-compute-cluster', cluster_type='16B200', - image='ubuntu-22.04-cuda-12.9-cluster', + image='ubuntu-22.04-cuda-12.4-cluster', description='Example compute cluster for distributed 
training', ssh_key_ids=ssh_keys, location=Locations.FIN_03, shared_volume_name='my-shared-volume', shared_volume_size=30000, + wait_for_status=None, ) print(f'Creating cluster: {cluster.id}') @@ -55,6 +57,15 @@ def create_cluster_example(): print(f'Cluster cluster_type: {cluster.cluster_type}') print(f'Location: {cluster.location}') + # Wait for cluster to enter RUNNING status + while cluster.status != verda.constants.cluster_status.RUNNING: + time.sleep(2) + print(f'Waiting for cluster to enter RUNNING status... (status: {cluster.status})') + cluster = verda.clusters.get_by_id(cluster.id) + + print(f'Public IP: {cluster.ip}') + print('Cluster is now running and ready to use!') + return cluster diff --git a/tests/integration_tests/test_clusters.py b/tests/integration_tests/test_clusters.py index ec11e0c..0ab6d06 100644 --- a/tests/integration_tests/test_clusters.py +++ b/tests/integration_tests/test_clusters.py @@ -62,11 +62,8 @@ def test_create_cluster(self, verda_client: VerdaClient): assert len(cluster.worker_nodes) == 2 assert cluster.ip is not None - print(f'Creating cluster: {cluster.id}') - print(f'Cluster hostname: {cluster.hostname}') - print(f'Cluster status: {cluster.status}') - print(f'Cluster cluster_type: {cluster.cluster_type}') - print(f'Location: {cluster.location}') - - # delete instance - # verda_client.clusters.action(cluster.id, 'delete') + # Now we need to wait for RUNNING status to connect to the jumphost (public IP is available) + # After that, we can connect to the jumphost and run commands on the cluster nodes: + # + # ssh -i ssh_key.pem root@ + # diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index 1c6cc97..8558db3 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -204,13 +204,13 @@ def action(self, id_list: list[str] | str, action: str) -> None: Raises: HTTPError: If the action fails or other API error occurs. 
""" - if type(id_list) is str: - id_list = [id_list] + if action != Actions.DELETE: + raise ValueError(f'Invalid action: {action}. Only DELETE is supported.') - if action == Actions.DELETE: - payload = {'id': id_list, 'action': 'discontinue'} + if type(id_list) is str: + payload = {'actions': [{'id': id_list, 'action': 'discontinue'}]} else: - raise ValueError(f'Invalid action: {action}. Only DELETE is supported.') + payload = {'actions': [{'id': id, 'action': action} for id in id_list]} self._http_client.put(CLUSTERS_ENDPOINT, json=payload) return From f5c275c1323c5f535510da23dfd11aa986097174 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 17:39:55 +0200 Subject: [PATCH 09/10] fix: unit tests --- tests/unit_tests/clusters/test_clusters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit_tests/clusters/test_clusters.py b/tests/unit_tests/clusters/test_clusters.py index d080259..a69ae03 100644 --- a/tests/unit_tests/clusters/test_clusters.py +++ b/tests/unit_tests/clusters/test_clusters.py @@ -108,6 +108,7 @@ def test_create_cluster_successful(self, clusters_service, endpoint): description=CLUSTER_DESCRIPTION, ssh_key_ids=[SSH_KEY_ID], location=CLUSTER_LOCATION, + wait_for_status=CLUSTER_STATUS, ) # assert From 7f9a1c5cf3662d3cd6bf58d1034bd9e24ce1968a Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 18:01:06 +0200 Subject: [PATCH 10/10] fix: revert to the correct OS images in prod --- examples/clusters_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/clusters_example.py b/examples/clusters_example.py index 9c76149..8aeadf0 100644 --- a/examples/clusters_example.py +++ b/examples/clusters_example.py @@ -35,14 +35,14 @@ def create_cluster_example(): # Get available images for cluster type images = verda.clusters.get_cluster_images('16B200') - if 'ubuntu-22.04-cuda-12.4-cluster' not in images: + if 'ubuntu-22.04-cuda-12.9-cluster' not in images: raise 
ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200') # Create a 16B200 cluster cluster = verda.clusters.create( hostname='my-compute-cluster', cluster_type='16B200', - image='ubuntu-22.04-cuda-12.4-cluster', + image='ubuntu-22.04-cuda-12.9-cluster', description='Example compute cluster for distributed training', ssh_key_ids=ssh_keys, location=Locations.FIN_03,