From 0d0e7ab6a5f57ab59bbf027d76345f12ddad6590 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Dec 2025 07:18:33 +0000 Subject: [PATCH 01/10] Add Clusters API wrapper Implemented comprehensive support for the Clusters API: - Created ClustersService with methods for cluster management (create, get, delete, scale) - Added Cluster and ClusterNode dataclasses for type safety - Integrated clusters service into VerdaClient - Added ClusterStatus constants for cluster lifecycle management - Created comprehensive unit tests (13 tests covering all API operations) - Added detailed example demonstrating cluster operations - All tests pass (125/125) --- examples/clusters_example.py | 159 ++++++++++ tests/unit_tests/clusters/__init__.py | 0 tests/unit_tests/clusters/test_clusters.py | 353 +++++++++++++++++++++ verda/_verda.py | 4 + verda/clusters/__init__.py | 5 + verda/clusters/_clusters.py | 234 ++++++++++++++ verda/constants.py | 19 ++ 7 files changed, 774 insertions(+) create mode 100644 examples/clusters_example.py create mode 100644 tests/unit_tests/clusters/__init__.py create mode 100644 tests/unit_tests/clusters/test_clusters.py create mode 100644 verda/clusters/__init__.py create mode 100644 verda/clusters/_clusters.py diff --git a/examples/clusters_example.py b/examples/clusters_example.py new file mode 100644 index 0000000..3ac6e3f --- /dev/null +++ b/examples/clusters_example.py @@ -0,0 +1,159 @@ +""" +Example demonstrating how to use the Clusters API. 
+ +This example shows how to: +- Create a new compute cluster +- List all clusters +- Get a specific cluster by ID +- Get cluster nodes +- Scale a cluster +- Delete a cluster +""" + +import os + +from verda import VerdaClient +from verda.constants import Locations + +# Get credentials from environment variables +CLIENT_ID = os.environ.get('VERDA_CLIENT_ID') +CLIENT_SECRET = os.environ.get('VERDA_CLIENT_SECRET') + +# Create client +verda = VerdaClient(CLIENT_ID, CLIENT_SECRET) + + +def create_cluster_example(): + """Create a new compute cluster.""" + # Get SSH keys + ssh_keys = [key.id for key in verda.ssh_keys.get()] + + # Create a cluster with 3 nodes + cluster = verda.clusters.create( + name='my-compute-cluster', + instance_type='8V100.48V', + node_count=3, + image='ubuntu-24.04-cuda-12.8-open-docker', + description='Example compute cluster for distributed training', + ssh_key_ids=ssh_keys, + location=Locations.FIN_03, + ) + + print(f'Created cluster: {cluster.id}') + print(f'Cluster name: {cluster.name}') + print(f'Cluster status: {cluster.status}') + print(f'Number of nodes: {cluster.node_count}') + print(f'Instance type: {cluster.instance_type}') + print(f'Location: {cluster.location}') + + return cluster + + +def list_clusters_example(): + """List all clusters.""" + # Get all clusters + clusters = verda.clusters.get() + + print(f'\nFound {len(clusters)} cluster(s):') + for cluster in clusters: + print(f' - {cluster.name} ({cluster.id}): {cluster.status} - {cluster.node_count} nodes') + + # Get clusters with specific status + running_clusters = verda.clusters.get(status=verda.constants.cluster_status.RUNNING) + print(f'\nFound {len(running_clusters)} running cluster(s)') + + return clusters + + +def get_cluster_by_id_example(cluster_id: str): + """Get a specific cluster by ID.""" + cluster = verda.clusters.get_by_id(cluster_id) + + print('\nCluster details:') + print(f' ID: {cluster.id}') + print(f' Name: {cluster.name}') + print(f' Description: 
{cluster.description}') + print(f' Status: {cluster.status}') + print(f' Instance type: {cluster.instance_type}') + print(f' Node count: {cluster.node_count}') + print(f' Created at: {cluster.created_at}') + if cluster.master_ip: + print(f' Master IP: {cluster.master_ip}') + if cluster.endpoint: + print(f' Endpoint: {cluster.endpoint}') + + return cluster + + +def get_cluster_nodes_example(cluster_id: str): + """Get all nodes in a cluster.""" + nodes = verda.clusters.get_nodes(cluster_id) + + print(f'\nCluster has {len(nodes)} node(s):') + for i, node in enumerate(nodes, 1): + print(f'\n Node {i}:') + print(f' ID: {node.id}') + print(f' Hostname: {node.hostname}') + print(f' Status: {node.status}') + print(f' IP: {node.ip}') + print(f' Instance type: {node.instance_type}') + + return nodes + + +def scale_cluster_example(cluster_id: str, new_node_count: int): + """Scale a cluster to a new number of nodes.""" + print(f'\nScaling cluster {cluster_id} to {new_node_count} nodes...') + + cluster = verda.clusters.scale(cluster_id, new_node_count) + + print('Cluster scaled successfully') + print(f'Current node count: {cluster.node_count}') + print(f'Cluster status: {cluster.status}') + + return cluster + + +def delete_cluster_example(cluster_id: str): + """Delete a cluster.""" + print(f'\nDeleting cluster {cluster_id}...') + + verda.clusters.delete(cluster_id) + + print('Cluster deleted successfully') + + +def main(): + """Run all cluster examples.""" + print('=== Clusters API Example ===\n') + + # Create a new cluster + print('1. Creating a new cluster...') + cluster = create_cluster_example() + cluster_id = cluster.id + + # List all clusters + print('\n2. Listing all clusters...') + list_clusters_example() + + # Get cluster by ID + print('\n3. Getting cluster details...') + get_cluster_by_id_example(cluster_id) + + # Get cluster nodes + print('\n4. Getting cluster nodes...') + get_cluster_nodes_example(cluster_id) + + # Scale the cluster + print('\n5. 
Scaling the cluster...') + scale_cluster_example(cluster_id, 5) + + # Delete the cluster + print('\n6. Deleting the cluster...') + delete_cluster_example(cluster_id) + + print('\n=== Example completed successfully ===') + + +if __name__ == '__main__': + main() diff --git a/tests/unit_tests/clusters/__init__.py b/tests/unit_tests/clusters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/clusters/test_clusters.py b/tests/unit_tests/clusters/test_clusters.py new file mode 100644 index 0000000..9211fc4 --- /dev/null +++ b/tests/unit_tests/clusters/test_clusters.py @@ -0,0 +1,353 @@ +import pytest +import responses # https://github.com/getsentry/responses + +from verda.clusters import Cluster, ClusterNode, ClustersService +from verda.constants import ErrorCodes, Locations +from verda.exceptions import APIException + +INVALID_REQUEST = ErrorCodes.INVALID_REQUEST +INVALID_REQUEST_MESSAGE = 'Invalid cluster request' + +CLUSTER_ID = 'deadc0de-a5d2-4972-ae4e-d429115d055b' +SSH_KEY_ID = '12345dc1-a5d2-4972-ae4e-d429115d055b' + +CLUSTER_NAME = 'test-cluster' +CLUSTER_DESCRIPTION = 'Test compute cluster' +CLUSTER_STATUS = 'running' +CLUSTER_INSTANCE_TYPE = '8V100.48V' +CLUSTER_NODE_COUNT = 3 +CLUSTER_LOCATION = Locations.FIN_01 +CLUSTER_IMAGE = 'ubuntu-24.04-cuda-12.8-open-docker' +CLUSTER_CREATED_AT = '2024-01-01T00:00:00Z' +CLUSTER_MASTER_IP = '10.0.0.1' +CLUSTER_ENDPOINT = 'cluster-endpoint.verda.com' + +NODE_1_ID = 'node1-c0de-a5d2-4972-ae4e-d429115d055b' +NODE_2_ID = 'node2-c0de-a5d2-4972-ae4e-d429115d055b' +NODE_3_ID = 'node3-c0de-a5d2-4972-ae4e-d429115d055b' + +NODES_PAYLOAD = [ + { + 'id': NODE_1_ID, + 'instance_type': CLUSTER_INSTANCE_TYPE, + 'status': 'running', + 'hostname': 'test-cluster-node-1', + 'ip': '10.0.0.2', + 'created_at': CLUSTER_CREATED_AT, + }, + { + 'id': NODE_2_ID, + 'instance_type': CLUSTER_INSTANCE_TYPE, + 'status': 'running', + 'hostname': 'test-cluster-node-2', + 'ip': '10.0.0.3', + 'created_at': 
CLUSTER_CREATED_AT, + }, + { + 'id': NODE_3_ID, + 'instance_type': CLUSTER_INSTANCE_TYPE, + 'status': 'running', + 'hostname': 'test-cluster-node-3', + 'ip': '10.0.0.4', + 'created_at': CLUSTER_CREATED_AT, + }, +] + +CLUSTER_PAYLOAD = [ + { + 'id': CLUSTER_ID, + 'name': CLUSTER_NAME, + 'description': CLUSTER_DESCRIPTION, + 'status': CLUSTER_STATUS, + 'created_at': CLUSTER_CREATED_AT, + 'location': CLUSTER_LOCATION, + 'instance_type': CLUSTER_INSTANCE_TYPE, + 'node_count': CLUSTER_NODE_COUNT, + 'nodes': NODES_PAYLOAD, + 'ssh_key_ids': [SSH_KEY_ID], + 'image': CLUSTER_IMAGE, + 'master_ip': CLUSTER_MASTER_IP, + 'endpoint': CLUSTER_ENDPOINT, + } +] + + +class TestClustersService: + @pytest.fixture + def clusters_service(self, http_client): + return ClustersService(http_client) + + @pytest.fixture + def endpoint(self, http_client): + return http_client._base_url + '/clusters' + + def test_get_clusters(self, clusters_service, endpoint): + # arrange - add response mock + responses.add(responses.GET, endpoint, json=CLUSTER_PAYLOAD, status=200) + + # act + clusters = clusters_service.get() + cluster = clusters[0] + + # assert + assert isinstance(clusters, list) + assert len(clusters) == 1 + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.name == CLUSTER_NAME + assert cluster.description == CLUSTER_DESCRIPTION + assert cluster.status == CLUSTER_STATUS + assert cluster.created_at == CLUSTER_CREATED_AT + assert cluster.location == CLUSTER_LOCATION + assert cluster.instance_type == CLUSTER_INSTANCE_TYPE + assert cluster.node_count == CLUSTER_NODE_COUNT + assert isinstance(cluster.nodes, list) + assert len(cluster.nodes) == CLUSTER_NODE_COUNT + assert isinstance(cluster.nodes[0], ClusterNode) + assert cluster.ssh_key_ids == [SSH_KEY_ID] + assert cluster.image == CLUSTER_IMAGE + assert cluster.master_ip == CLUSTER_MASTER_IP + assert cluster.endpoint == CLUSTER_ENDPOINT + assert responses.assert_call_count(endpoint, 1) is True + + def 
test_get_clusters_by_status_successful(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '?status=running' + responses.add(responses.GET, url, json=CLUSTER_PAYLOAD, status=200) + + # act + clusters = clusters_service.get(status='running') + cluster = clusters[0] + + # assert + assert isinstance(clusters, list) + assert len(clusters) == 1 + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.status == CLUSTER_STATUS + assert responses.assert_call_count(url, 1) is True + + def test_get_clusters_by_status_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '?status=invalid_status' + responses.add( + responses.GET, + url, + json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.get(status='invalid_status') + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True + + def test_get_cluster_by_id_successful(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/' + CLUSTER_ID + responses.add(responses.GET, url, json=CLUSTER_PAYLOAD[0], status=200) + + # act + cluster = clusters_service.get_by_id(CLUSTER_ID) + + # assert + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.name == CLUSTER_NAME + assert cluster.description == CLUSTER_DESCRIPTION + assert cluster.status == CLUSTER_STATUS + assert cluster.instance_type == CLUSTER_INSTANCE_TYPE + assert cluster.node_count == CLUSTER_NODE_COUNT + assert responses.assert_call_count(url, 1) is True + + def test_get_cluster_by_id_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/invalid_id' + responses.add( + responses.GET, + url, + json={'code': INVALID_REQUEST, 'message': 
INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.get_by_id('invalid_id') + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True + + def test_create_cluster_successful(self, clusters_service, endpoint): + # arrange - add response mock + # create cluster + responses.add(responses.POST, endpoint, body=CLUSTER_ID, status=200) + # get cluster by id + url = endpoint + '/' + CLUSTER_ID + responses.add(responses.GET, url, json=CLUSTER_PAYLOAD[0], status=200) + + # act + cluster = clusters_service.create( + name=CLUSTER_NAME, + instance_type=CLUSTER_INSTANCE_TYPE, + node_count=CLUSTER_NODE_COUNT, + image=CLUSTER_IMAGE, + description=CLUSTER_DESCRIPTION, + ssh_key_ids=[SSH_KEY_ID], + location=CLUSTER_LOCATION, + ) + + # assert + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.name == CLUSTER_NAME + assert cluster.description == CLUSTER_DESCRIPTION + assert cluster.status == CLUSTER_STATUS + assert cluster.instance_type == CLUSTER_INSTANCE_TYPE + assert cluster.node_count == CLUSTER_NODE_COUNT + assert cluster.ssh_key_ids == [SSH_KEY_ID] + assert cluster.location == CLUSTER_LOCATION + assert cluster.image == CLUSTER_IMAGE + assert responses.assert_call_count(endpoint, 1) is True + assert responses.assert_call_count(url, 1) is True + + def test_create_cluster_failed(self, clusters_service, endpoint): + # arrange - add response mock + responses.add( + responses.POST, + endpoint, + json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.create( + name=CLUSTER_NAME, + instance_type=CLUSTER_INSTANCE_TYPE, + node_count=CLUSTER_NODE_COUNT, + image=CLUSTER_IMAGE, + description=CLUSTER_DESCRIPTION, + ) + + # assert + assert excinfo.value.code == 
INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(endpoint, 1) is True + + def test_delete_cluster_successful(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/' + CLUSTER_ID + responses.add(responses.DELETE, url, status=202) + + # act + result = clusters_service.delete(CLUSTER_ID) + + # assert + assert result is None + assert responses.assert_call_count(url, 1) is True + + def test_delete_cluster_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/invalid_id' + responses.add( + responses.DELETE, + url, + json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.delete('invalid_id') + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True + + def test_scale_cluster_successful(self, clusters_service, endpoint): + # arrange - add response mock + new_node_count = 5 + scaled_payload = CLUSTER_PAYLOAD[0].copy() + scaled_payload['node_count'] = new_node_count + + # scale endpoint + scale_url = endpoint + '/' + CLUSTER_ID + '/scale' + responses.add(responses.PUT, scale_url, status=200) + + # get cluster by id + get_url = endpoint + '/' + CLUSTER_ID + responses.add(responses.GET, get_url, json=scaled_payload, status=200) + + # act + cluster = clusters_service.scale(CLUSTER_ID, new_node_count) + + # assert + assert isinstance(cluster, Cluster) + assert cluster.id == CLUSTER_ID + assert cluster.node_count == new_node_count + assert responses.assert_call_count(scale_url, 1) is True + assert responses.assert_call_count(get_url, 1) is True + + def test_scale_cluster_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/' + CLUSTER_ID + '/scale' + responses.add( + responses.PUT, + url, 
+ json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.scale(CLUSTER_ID, 5) + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True + + def test_get_cluster_nodes_successful(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/' + CLUSTER_ID + '/nodes' + responses.add(responses.GET, url, json=NODES_PAYLOAD, status=200) + + # act + nodes = clusters_service.get_nodes(CLUSTER_ID) + + # assert + assert isinstance(nodes, list) + assert len(nodes) == CLUSTER_NODE_COUNT + assert isinstance(nodes[0], ClusterNode) + assert nodes[0].id == NODE_1_ID + assert nodes[0].instance_type == CLUSTER_INSTANCE_TYPE + assert nodes[0].status == 'running' + assert nodes[1].id == NODE_2_ID + assert nodes[2].id == NODE_3_ID + assert responses.assert_call_count(url, 1) is True + + def test_get_cluster_nodes_failed(self, clusters_service, endpoint): + # arrange - add response mock + url = endpoint + '/invalid_id/nodes' + responses.add( + responses.GET, + url, + json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, + status=400, + ) + + # act + with pytest.raises(APIException) as excinfo: + clusters_service.get_nodes('invalid_id') + + # assert + assert excinfo.value.code == INVALID_REQUEST + assert excinfo.value.message == INVALID_REQUEST_MESSAGE + assert responses.assert_call_count(url, 1) is True diff --git a/verda/_verda.py b/verda/_verda.py index b3d9922..a4b4cfc 100644 --- a/verda/_verda.py +++ b/verda/_verda.py @@ -1,6 +1,7 @@ from verda._version import __version__ from verda.authentication import AuthenticationService from verda.balance import BalanceService +from verda.clusters import ClustersService from verda.constants import Constants from verda.containers import ContainersService from verda.http_client import 
HTTPClient @@ -79,5 +80,8 @@ def __init__( self.containers: ContainersService = ContainersService(self._http_client, inference_key) """Containers service. Deploy, manage, and monitor container deployments""" + self.clusters: ClustersService = ClustersService(self._http_client) + """Clusters service. Create, manage, and scale compute clusters""" + __all__ = ['VerdaClient'] diff --git a/verda/clusters/__init__.py b/verda/clusters/__init__.py new file mode 100644 index 0000000..ff07e58 --- /dev/null +++ b/verda/clusters/__init__.py @@ -0,0 +1,5 @@ +"""Clusters service for managing compute clusters.""" + +from verda.clusters._clusters import Cluster, ClusterNode, ClustersService + +__all__ = ['Cluster', 'ClusterNode', 'ClustersService'] diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py new file mode 100644 index 0000000..f0010d7 --- /dev/null +++ b/verda/clusters/_clusters.py @@ -0,0 +1,234 @@ +import itertools +import time +from dataclasses import dataclass +from typing import Literal + +from dataclasses_json import dataclass_json + +from verda.constants import Locations + +CLUSTERS_ENDPOINT = '/clusters' + +ClusterStatus = Literal[ + 'creating', 'running', 'scaling', 'updating', 'deleting', 'deleted', 'error' +] + + +@dataclass_json +@dataclass +class ClusterNode: + """Represents a node in a cluster. + + Attributes: + id: Unique identifier for the node (instance ID). + instance_type: Type of the instance for this node. + status: Current status of the node. + hostname: Network hostname of the node. + ip: IP address of the node. + created_at: Timestamp of node creation. + """ + + id: str + instance_type: str + status: str + hostname: str + ip: str | None = None + created_at: str | None = None + + +@dataclass_json +@dataclass +class Cluster: + """Represents a compute cluster with multiple nodes. + + Attributes: + id: Unique identifier for the cluster. + name: Human-readable name of the cluster. + description: Description of the cluster. 
+ status: Current operational status of the cluster. + created_at: Timestamp of cluster creation. + location: Datacenter location code (default: Locations.FIN_03). + instance_type: Type of instances used for cluster nodes. + node_count: Number of nodes in the cluster. + nodes: List of nodes in the cluster. + ssh_key_ids: List of SSH key IDs associated with the cluster nodes. + image: Image ID or type used for cluster nodes. + startup_script_id: ID of the startup script to run on nodes. + master_ip: IP address of the cluster master/coordinator node. + endpoint: Cluster access endpoint. + """ + + id: str + name: str + description: str + status: str + created_at: str + location: str + instance_type: str + node_count: int + nodes: list[ClusterNode] + ssh_key_ids: list[str] + image: str | None = None + startup_script_id: str | None = None + master_ip: str | None = None + endpoint: str | None = None + + +class ClustersService: + """Service for managing compute clusters through the API. + + This service provides methods to create, retrieve, scale, and manage compute clusters. + """ + + def __init__(self, http_client) -> None: + """Initializes the ClustersService with an HTTP client. + + Args: + http_client: HTTP client for making API requests. + """ + self._http_client = http_client + + def get(self, status: str | None = None) -> list[Cluster]: + """Retrieves all clusters or clusters with specific status. + + Args: + status: Optional status filter for clusters. If None, returns all + non-deleted clusters. + + Returns: + List of cluster objects matching the criteria. + """ + clusters_dict = self._http_client.get(CLUSTERS_ENDPOINT, params={'status': status}).json() + return [ + Cluster.from_dict(cluster_dict, infer_missing=True) for cluster_dict in clusters_dict + ] + + def get_by_id(self, id: str) -> Cluster: + """Retrieves a specific cluster by its ID. + + Args: + id: Unique identifier of the cluster to retrieve. + + Returns: + Cluster object with the specified ID. 
+ + Raises: + HTTPError: If the cluster is not found or other API error occurs. + """ + cluster_dict = self._http_client.get(CLUSTERS_ENDPOINT + f'/{id}').json() + return Cluster.from_dict(cluster_dict, infer_missing=True) + + def create( + self, + name: str, + instance_type: str, + node_count: int, + image: str, + description: str, + ssh_key_ids: list = [], + location: str = Locations.FIN_03, + startup_script_id: str | None = None, + *, + max_wait_time: float = 300, + initial_interval: float = 1.0, + max_interval: float = 10, + backoff_coefficient: float = 2.0, + ) -> Cluster: + """Creates and deploys a new compute cluster. + + Args: + name: Name for the cluster. + instance_type: Type of instances to use for cluster nodes (e.g., '8V100.48V'). + node_count: Number of nodes to create in the cluster. + image: Image type or ID for cluster nodes. + description: Human-readable description of the cluster. + ssh_key_ids: List of SSH key IDs to associate with cluster nodes. + location: Datacenter location code (default: Locations.FIN_03). + startup_script_id: Optional ID of startup script to run on nodes. + max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 300) + initial_interval: Initial interval, in seconds (default: 1.0) + max_interval: The longest single delay allowed between retries, in seconds (default: 10) + backoff_coefficient: Coefficient to calculate the next retry interval (default 2.0) + + Returns: + The newly created cluster object. + + Raises: + HTTPError: If cluster creation fails or other API error occurs. + TimeoutError: If cluster does not start creating within max_wait_time. 
+ """ + payload = { + 'name': name, + 'instance_type': instance_type, + 'node_count': node_count, + 'image': image, + 'description': description, + 'ssh_key_ids': ssh_key_ids, + 'location_code': location, + 'startup_script_id': startup_script_id, + } + id = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).text + + # Wait for cluster to enter creating state with timeout + deadline = time.monotonic() + max_wait_time + for i in itertools.count(): + cluster = self.get_by_id(id) + if cluster.status != 'ordered': + return cluster + + now = time.monotonic() + if now >= deadline: + raise TimeoutError( + f'Cluster {id} did not enter creating state within {max_wait_time:.1f} seconds' + ) + + interval = min(initial_interval * backoff_coefficient**i, max_interval, deadline - now) + time.sleep(interval) + + def delete(self, id: str) -> None: + """Deletes a cluster and all its nodes. + + Args: + id: Unique identifier of the cluster to delete. + + Raises: + HTTPError: If the deletion fails or other API error occurs. + """ + self._http_client.delete(CLUSTERS_ENDPOINT + f'/{id}') + return + + def scale( + self, + id: str, + node_count: int, + ) -> Cluster: + """Scales a cluster to the specified number of nodes. + + Args: + id: Unique identifier of the cluster to scale. + node_count: Target number of nodes for the cluster. + + Returns: + Updated cluster object. + + Raises: + HTTPError: If the scaling fails or other API error occurs. + """ + payload = {'node_count': node_count} + self._http_client.put(CLUSTERS_ENDPOINT + f'/{id}/scale', json=payload) + return self.get_by_id(id) + + def get_nodes(self, id: str) -> list[ClusterNode]: + """Retrieves all nodes in a cluster. + + Args: + id: Unique identifier of the cluster. + + Returns: + List of nodes in the cluster. + + Raises: + HTTPError: If the cluster is not found or other API error occurs. 
+ """ + nodes_dict = self._http_client.get(CLUSTERS_ENDPOINT + f'/{id}/nodes').json() + return [ClusterNode.from_dict(node_dict, infer_missing=True) for node_dict in nodes_dict] diff --git a/verda/constants.py b/verda/constants.py index 70b789f..a339a42 100644 --- a/verda/constants.py +++ b/verda/constants.py @@ -56,6 +56,22 @@ def __init__(self): return +class ClusterStatus: + """Cluster status.""" + + ORDERED = 'ordered' + CREATING = 'creating' + RUNNING = 'running' + SCALING = 'scaling' + UPDATING = 'updating' + DELETING = 'deleting' + DELETED = 'deleted' + ERROR = 'error' + + def __init__(self): + return + + class VolumeTypes: """Storage volume types.""" @@ -110,6 +126,9 @@ def __init__(self, base_url, version): self.volume_status: VolumeStatus = VolumeStatus() """Possible volume statuses""" + self.cluster_status: ClusterStatus = ClusterStatus() + """Possible cluster statuses""" + self.volume_types: VolumeTypes = VolumeTypes() """Available volume types""" From d6b9918de5bb6bb5618447a2e8fba3d9e03d4020 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Fri, 19 Dec 2025 17:22:55 +0200 Subject: [PATCH 02/10] feat: clusters api --- examples/clusters_example.py | 71 ++++--------- tests/integration_tests/conftest.py | 2 +- tests/integration_tests/test_clusters.py | 42 ++++++++ verda/clusters/__init__.py | 4 +- verda/clusters/_clusters.py | 122 ++++++++++++----------- 5 files changed, 125 insertions(+), 116 deletions(-) create mode 100644 tests/integration_tests/test_clusters.py diff --git a/examples/clusters_example.py b/examples/clusters_example.py index 3ac6e3f..b0efdea 100644 --- a/examples/clusters_example.py +++ b/examples/clusters_example.py @@ -13,7 +13,7 @@ import os from verda import VerdaClient -from verda.constants import Locations +from verda.constants import Actions, Locations # Get credentials from environment variables CLIENT_ID = os.environ.get('VERDA_CLIENT_ID') @@ -30,20 +30,21 @@ def create_cluster_example(): # Create a cluster with 3 nodes 
cluster = verda.clusters.create( - name='my-compute-cluster', - instance_type='8V100.48V', - node_count=3, - image='ubuntu-24.04-cuda-12.8-open-docker', + hostname='my-compute-cluster', + cluster_type='16H200', + image='ubuntu-22.04-cuda-12.4-cluster', description='Example compute cluster for distributed training', ssh_key_ids=ssh_keys, location=Locations.FIN_03, + shared_volume_name='my-shared-volume', + shared_volume_size=30000, ) print(f'Created cluster: {cluster.id}') - print(f'Cluster name: {cluster.name}') + print(f'Cluster hostname: {cluster.hostname}') print(f'Cluster status: {cluster.status}') - print(f'Number of nodes: {cluster.node_count}') - print(f'Instance type: {cluster.instance_type}') + print(f'Cluster cluster_type: {cluster.cluster_type}') + print(f'Cluster worker_nodes: {cluster.worker_nodes}') print(f'Location: {cluster.location}') return cluster @@ -56,7 +57,9 @@ def list_clusters_example(): print(f'\nFound {len(clusters)} cluster(s):') for cluster in clusters: - print(f' - {cluster.name} ({cluster.id}): {cluster.status} - {cluster.node_count} nodes') + print( + f' - {cluster.hostname} ({cluster.id}): {cluster.status} - {len(cluster.worker_nodes)} nodes' + ) # Get clusters with specific status running_clusters = verda.clusters.get(status=verda.constants.cluster_status.RUNNING) @@ -71,45 +74,13 @@ def get_cluster_by_id_example(cluster_id: str): print('\nCluster details:') print(f' ID: {cluster.id}') - print(f' Name: {cluster.name}') + print(f' Name: {cluster.hostname}') print(f' Description: {cluster.description}') print(f' Status: {cluster.status}') - print(f' Instance type: {cluster.instance_type}') - print(f' Node count: {cluster.node_count}') + print(f' Cluster type: {cluster.cluster_type}') print(f' Created at: {cluster.created_at}') - if cluster.master_ip: - print(f' Master IP: {cluster.master_ip}') - if cluster.endpoint: - print(f' Endpoint: {cluster.endpoint}') - - return cluster - - -def get_cluster_nodes_example(cluster_id: str): - 
"""Get all nodes in a cluster.""" - nodes = verda.clusters.get_nodes(cluster_id) - - print(f'\nCluster has {len(nodes)} node(s):') - for i, node in enumerate(nodes, 1): - print(f'\n Node {i}:') - print(f' ID: {node.id}') - print(f' Hostname: {node.hostname}') - print(f' Status: {node.status}') - print(f' IP: {node.ip}') - print(f' Instance type: {node.instance_type}') - - return nodes - - -def scale_cluster_example(cluster_id: str, new_node_count: int): - """Scale a cluster to a new number of nodes.""" - print(f'\nScaling cluster {cluster_id} to {new_node_count} nodes...') - - cluster = verda.clusters.scale(cluster_id, new_node_count) - - print('Cluster scaled successfully') - print(f'Current node count: {cluster.node_count}') - print(f'Cluster status: {cluster.status}') + print(f' Public IP: {cluster.ip}') + print(f' Worker nodes: {len(cluster.worker_nodes)}') return cluster @@ -118,7 +89,7 @@ def delete_cluster_example(cluster_id: str): """Delete a cluster.""" print(f'\nDeleting cluster {cluster_id}...') - verda.clusters.delete(cluster_id) + verda.clusters.action(cluster_id, Actions.DELETE) print('Cluster deleted successfully') @@ -140,14 +111,6 @@ def main(): print('\n3. Getting cluster details...') get_cluster_by_id_example(cluster_id) - # Get cluster nodes - print('\n4. Getting cluster nodes...') - get_cluster_nodes_example(cluster_id) - - # Scale the cluster - print('\n5. Scaling the cluster...') - scale_cluster_example(cluster_id, 5) - # Delete the cluster print('\n6. 
Deleting the cluster...') delete_cluster_example(cluster_id) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 3f695ab..78a3260 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -9,7 +9,7 @@ Make sure to run the server and the account has enough balance before running the tests """ -BASE_URL = 'http://localhost:3010/v1' +BASE_URL = os.getenv('VERDA_BASE_URL', 'https://api.verda.com/v1') # Load env variables, make sure there's an env file with valid client credentials load_dotenv() diff --git a/tests/integration_tests/test_clusters.py b/tests/integration_tests/test_clusters.py new file mode 100644 index 0000000..904f711 --- /dev/null +++ b/tests/integration_tests/test_clusters.py @@ -0,0 +1,42 @@ +import os + +import pytest + +from verda import VerdaClient +from verda.constants import Locations + +IN_GITHUB_ACTIONS = os.getenv('GITHUB_ACTIONS') == 'true' + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="Test doesn't work in Github Actions.") +@pytest.mark.withoutresponses +class TestClusters: + def test_create_cluster(self, verda_client: VerdaClient): + # get ssh key + ssh_key = verda_client.ssh_keys.get()[0] + + # create instance + cluster = verda_client.clusters.create( + hostname='test-instance', + location=Locations.FIN_03, + cluster_type='16H200', + description='test instance', + image='ubuntu-22.04-cuda-12.4-cluster', + ssh_key_ids=[ssh_key.id], + ) + + # assert instance is created + assert cluster.id is not None + assert ( + cluster.status == verda_client.constants.instance_status.PROVISIONING + or cluster.status == verda_client.constants.instance_status.RUNNING + ) + + assert cluster.worker_nodes is not None + assert len(cluster.worker_nodes) == 2 + assert cluster.ip is not None + + print(cluster) + + # delete instance + # verda_client.clusters.action(cluster.id, 'delete') diff --git a/verda/clusters/__init__.py b/verda/clusters/__init__.py index ff07e58..7ffe932 100644 
--- a/verda/clusters/__init__.py +++ b/verda/clusters/__init__.py @@ -1,5 +1,5 @@ """Clusters service for managing compute clusters.""" -from verda.clusters._clusters import Cluster, ClusterNode, ClustersService +from verda.clusters._clusters import Cluster, ClusterWorkerNode, ClustersService -__all__ = ['Cluster', 'ClusterNode', 'ClustersService'] +__all__ = ['Cluster', 'ClusterWorkerNode', 'ClustersService'] diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index f0010d7..c0031c3 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -5,7 +5,7 @@ from dataclasses_json import dataclass_json -from verda.constants import Locations +from verda.constants import Actions, Locations CLUSTERS_ENDPOINT = '/clusters' @@ -16,24 +16,20 @@ @dataclass_json @dataclass -class ClusterNode: - """Represents a node in a cluster. +class ClusterWorkerNode: + """Represents a worker node in a cluster. Attributes: id: Unique identifier for the node (instance ID). - instance_type: Type of the instance for this node. status: Current status of the node. hostname: Network hostname of the node. - ip: IP address of the node. - created_at: Timestamp of node creation. + private_ip: Private IP address of the node. """ id: str - instance_type: str status: str hostname: str - ip: str | None = None - created_at: str | None = None + private_ip: str @dataclass_json @@ -43,35 +39,32 @@ class Cluster: Attributes: id: Unique identifier for the cluster. - name: Human-readable name of the cluster. + hostname: Human-readable hostname of the cluster. description: Description of the cluster. status: Current operational status of the cluster. created_at: Timestamp of cluster creation. location: Datacenter location code (default: Locations.FIN_03). - instance_type: Type of instances used for cluster nodes. - node_count: Number of nodes in the cluster. - nodes: List of nodes in the cluster. + cluster_type: Type of instances used for cluster nodes. 
+ worker_nodes: List of nodes in the cluster. ssh_key_ids: List of SSH key IDs associated with the cluster nodes. image: Image ID or type used for cluster nodes. startup_script_id: ID of the startup script to run on nodes. - master_ip: IP address of the cluster master/coordinator node. - endpoint: Cluster access endpoint. + public_ip: IP address of the jumphost. """ id: str - name: str + hostname: str description: str status: str created_at: str location: str - instance_type: str + cluster_type: str node_count: int - nodes: list[ClusterNode] + worker_nodes: list[ClusterWorkerNode] ssh_key_ids: list[str] image: str | None = None startup_script_id: str | None = None - master_ip: str | None = None - endpoint: str | None = None + ip: str | None = None class ClustersService: @@ -120,16 +113,17 @@ def get_by_id(self, id: str) -> Cluster: def create( self, - name: str, - instance_type: str, - node_count: int, + hostname: str, + cluster_type: str, image: str, description: str, ssh_key_ids: list = [], location: str = Locations.FIN_03, startup_script_id: str | None = None, + shared_volume_name: str | None = None, + shared_volume_size: int | None = None, *, - max_wait_time: float = 300, + max_wait_time: float = 900, initial_interval: float = 1.0, max_interval: float = 10, backoff_coefficient: float = 2.0, @@ -138,14 +132,15 @@ def create( Args: name: Name for the cluster. - instance_type: Type of instances to use for cluster nodes (e.g., '8V100.48V'). - node_count: Number of nodes to create in the cluster. + cluster_type: Cluster type. image: Image type or ID for cluster nodes. description: Human-readable description of the cluster. ssh_key_ids: List of SSH key IDs to associate with cluster nodes. location: Datacenter location code (default: Locations.FIN_03). startup_script_id: Optional ID of startup script to run on nodes. 
- max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 300) + shared_volume_name: Optional name for the shared volume. + shared_volume_size: Optional size for the shared volume, in GB, default to 30TB. + max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 900) initial_interval: Initial interval, in seconds (default: 1.0) max_interval: The longest single delay allowed between retries, in seconds (default: 10) backoff_coefficient: Coefficient to calculate the next retry interval (default 2.0) @@ -158,16 +153,21 @@ def create( TimeoutError: If cluster does not start creating within max_wait_time. """ payload = { - 'name': name, - 'instance_type': instance_type, - 'node_count': node_count, + 'hostname': hostname, + 'cluster_type': cluster_type, 'image': image, 'description': description, 'ssh_key_ids': ssh_key_ids, + 'contract': 'PAY_AS_YOU_GO', 'location_code': location, 'startup_script_id': startup_script_id, + 'shared_volume': { + 'name': shared_volume_name if shared_volume_name else hostname + '-shared-volume', + 'size': shared_volume_size if shared_volume_size else 30000, + }, } - id = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).text + response = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).json() + id = response['id'] # Wait for cluster to enter creating state with timeout deadline = time.monotonic() + max_wait_time @@ -185,50 +185,54 @@ def create( interval = min(initial_interval * backoff_coefficient**i, max_interval, deadline - now) time.sleep(interval) - def delete(self, id: str) -> None: - """Deletes a cluster and all its nodes. + def action(self, id_list: list[str] | str, action: str) -> None: + """Performs an action on one or more instances. Args: - id: Unique identifier of the cluster to delete. + id_list: Single instance ID or list of instance IDs to act upon. + action: Action to perform on the clusters. Only `delete` is supported. 
Raises: - HTTPError: If the deletion fails or other API error occurs. + HTTPError: If the action fails or other API error occurs. """ - self._http_client.delete(CLUSTERS_ENDPOINT + f'/{id}') + if type(id_list) is str: + id_list = [id_list] + + if action == Actions.DELETE: + payload = {'id': id_list, 'action': 'discontinue'} + else: + raise ValueError(f'Invalid action: {action}. Only DELETE is supported.') + + self._http_client.put(CLUSTERS_ENDPOINT, json=payload) return - def scale( + def is_available( self, - id: str, - node_count: int, - ) -> Cluster: - """Scales a cluster to the specified number of nodes. + cluster_type: str, + location_code: str | None = None, + ) -> bool: + """Checks if a specific instance type is available for deployment. Args: - id: Unique identifier of the cluster to scale. - node_count: Target number of nodes for the cluster. + cluster_type: Type of instance to check availability for. + location_code: Optional datacenter location code. Returns: - Updated cluster object. - - Raises: - HTTPError: If the scaling fails or other API error occurs. + True if the instance type is available, False otherwise. """ - payload = {'node_count': node_count} - self._http_client.put(CLUSTERS_ENDPOINT + f'/{id}/scale', json=payload) - return self.get_by_id(id) + is_spot = str(is_spot).lower() + query_params = {'location_code': location_code} + url = f'/cluster-availability/{cluster_type}' + return self._http_client.get(url, query_params).json() - def get_nodes(self, id: str) -> list[ClusterNode]: - """Retrieves all nodes in a cluster. + def get_availabilities(self, location_code: str | None = None) -> list[dict]: + """Retrieves a list of available cluster types across locations. Args: - id: Unique identifier of the cluster. + location_code: Optional datacenter location code to filter by. Returns: - List of nodes in the cluster. - - Raises: - HTTPError: If the cluster is not found or other API error occurs. + List of available cluster types and their details. 
""" - nodes_dict = self._http_client.get(CLUSTERS_ENDPOINT + f'/{id}/nodes').json() - return [ClusterNode.from_dict(node_dict, infer_missing=True) for node_dict in nodes_dict] + query_params = {'location_code': location_code} + return self._http_client.get('/cluster-availability', params=query_params).json() From 814d02eed449f02e4cebc87371eed889b5238243 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Fri, 19 Dec 2025 17:47:33 +0200 Subject: [PATCH 03/10] fix: unit test --- tests/unit_tests/clusters/test_clusters.py | 226 +++------------------ 1 file changed, 27 insertions(+), 199 deletions(-) diff --git a/tests/unit_tests/clusters/test_clusters.py b/tests/unit_tests/clusters/test_clusters.py index 9211fc4..71d7e03 100644 --- a/tests/unit_tests/clusters/test_clusters.py +++ b/tests/unit_tests/clusters/test_clusters.py @@ -1,73 +1,57 @@ import pytest import responses # https://github.com/getsentry/responses -from verda.clusters import Cluster, ClusterNode, ClustersService +from verda.clusters import Cluster, ClusterWorkerNode, ClustersService from verda.constants import ErrorCodes, Locations from verda.exceptions import APIException INVALID_REQUEST = ErrorCodes.INVALID_REQUEST -INVALID_REQUEST_MESSAGE = 'Invalid cluster request' +INVALID_REQUEST_MESSAGE = 'Invalid request' CLUSTER_ID = 'deadc0de-a5d2-4972-ae4e-d429115d055b' SSH_KEY_ID = '12345dc1-a5d2-4972-ae4e-d429115d055b' -CLUSTER_NAME = 'test-cluster' +CLUSTER_HOSTNAME = 'test-cluster' CLUSTER_DESCRIPTION = 'Test compute cluster' CLUSTER_STATUS = 'running' -CLUSTER_INSTANCE_TYPE = '8V100.48V' -CLUSTER_NODE_COUNT = 3 -CLUSTER_LOCATION = Locations.FIN_01 -CLUSTER_IMAGE = 'ubuntu-24.04-cuda-12.8-open-docker' +CLUSTER_CLUSTER_TYPE = '16H200' +CLUSTER_NODE_COUNT = 2 +CLUSTER_LOCATION = Locations.FIN_03 +CLUSTER_IMAGE = 'ubuntu-22.04-cuda-12.4-cluster' CLUSTER_CREATED_AT = '2024-01-01T00:00:00Z' -CLUSTER_MASTER_IP = '10.0.0.1' -CLUSTER_ENDPOINT = 'cluster-endpoint.verda.com' +CLUSTER_IP = '10.0.0.1' NODE_1_ID = 
'node1-c0de-a5d2-4972-ae4e-d429115d055b' NODE_2_ID = 'node2-c0de-a5d2-4972-ae4e-d429115d055b' -NODE_3_ID = 'node3-c0de-a5d2-4972-ae4e-d429115d055b' NODES_PAYLOAD = [ { 'id': NODE_1_ID, - 'instance_type': CLUSTER_INSTANCE_TYPE, 'status': 'running', 'hostname': 'test-cluster-node-1', - 'ip': '10.0.0.2', - 'created_at': CLUSTER_CREATED_AT, + 'private_ip': '10.0.0.1', }, { 'id': NODE_2_ID, - 'instance_type': CLUSTER_INSTANCE_TYPE, 'status': 'running', 'hostname': 'test-cluster-node-2', - 'ip': '10.0.0.3', - 'created_at': CLUSTER_CREATED_AT, - }, - { - 'id': NODE_3_ID, - 'instance_type': CLUSTER_INSTANCE_TYPE, - 'status': 'running', - 'hostname': 'test-cluster-node-3', - 'ip': '10.0.0.4', - 'created_at': CLUSTER_CREATED_AT, + 'private_ip': '10.0.0.2', }, ] CLUSTER_PAYLOAD = [ { 'id': CLUSTER_ID, - 'name': CLUSTER_NAME, + 'hostname': CLUSTER_HOSTNAME, 'description': CLUSTER_DESCRIPTION, 'status': CLUSTER_STATUS, 'created_at': CLUSTER_CREATED_AT, 'location': CLUSTER_LOCATION, - 'instance_type': CLUSTER_INSTANCE_TYPE, - 'node_count': CLUSTER_NODE_COUNT, - 'nodes': NODES_PAYLOAD, + 'cluster_type': CLUSTER_CLUSTER_TYPE, + 'worker_nodes': NODES_PAYLOAD, 'ssh_key_ids': [SSH_KEY_ID], 'image': CLUSTER_IMAGE, - 'master_ip': CLUSTER_MASTER_IP, - 'endpoint': CLUSTER_ENDPOINT, + 'ip': CLUSTER_IP, } ] @@ -94,95 +78,20 @@ def test_get_clusters(self, clusters_service, endpoint): assert len(clusters) == 1 assert isinstance(cluster, Cluster) assert cluster.id == CLUSTER_ID - assert cluster.name == CLUSTER_NAME + assert cluster.hostname == CLUSTER_HOSTNAME assert cluster.description == CLUSTER_DESCRIPTION assert cluster.status == CLUSTER_STATUS assert cluster.created_at == CLUSTER_CREATED_AT assert cluster.location == CLUSTER_LOCATION - assert cluster.instance_type == CLUSTER_INSTANCE_TYPE - assert cluster.node_count == CLUSTER_NODE_COUNT - assert isinstance(cluster.nodes, list) - assert len(cluster.nodes) == CLUSTER_NODE_COUNT - assert isinstance(cluster.nodes[0], ClusterNode) + assert 
cluster.cluster_type == CLUSTER_CLUSTER_TYPE + assert isinstance(cluster.worker_nodes, list) + assert len(cluster.worker_nodes) == CLUSTER_NODE_COUNT + assert isinstance(cluster.worker_nodes[0], ClusterWorkerNode) assert cluster.ssh_key_ids == [SSH_KEY_ID] assert cluster.image == CLUSTER_IMAGE - assert cluster.master_ip == CLUSTER_MASTER_IP - assert cluster.endpoint == CLUSTER_ENDPOINT + assert cluster.ip == CLUSTER_IP assert responses.assert_call_count(endpoint, 1) is True - def test_get_clusters_by_status_successful(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '?status=running' - responses.add(responses.GET, url, json=CLUSTER_PAYLOAD, status=200) - - # act - clusters = clusters_service.get(status='running') - cluster = clusters[0] - - # assert - assert isinstance(clusters, list) - assert len(clusters) == 1 - assert isinstance(cluster, Cluster) - assert cluster.id == CLUSTER_ID - assert cluster.status == CLUSTER_STATUS - assert responses.assert_call_count(url, 1) is True - - def test_get_clusters_by_status_failed(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '?status=invalid_status' - responses.add( - responses.GET, - url, - json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, - status=400, - ) - - # act - with pytest.raises(APIException) as excinfo: - clusters_service.get(status='invalid_status') - - # assert - assert excinfo.value.code == INVALID_REQUEST - assert excinfo.value.message == INVALID_REQUEST_MESSAGE - assert responses.assert_call_count(url, 1) is True - - def test_get_cluster_by_id_successful(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/' + CLUSTER_ID - responses.add(responses.GET, url, json=CLUSTER_PAYLOAD[0], status=200) - - # act - cluster = clusters_service.get_by_id(CLUSTER_ID) - - # assert - assert isinstance(cluster, Cluster) - assert cluster.id == CLUSTER_ID - assert cluster.name == CLUSTER_NAME - assert 
cluster.description == CLUSTER_DESCRIPTION - assert cluster.status == CLUSTER_STATUS - assert cluster.instance_type == CLUSTER_INSTANCE_TYPE - assert cluster.node_count == CLUSTER_NODE_COUNT - assert responses.assert_call_count(url, 1) is True - - def test_get_cluster_by_id_failed(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/invalid_id' - responses.add( - responses.GET, - url, - json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, - status=400, - ) - - # act - with pytest.raises(APIException) as excinfo: - clusters_service.get_by_id('invalid_id') - - # assert - assert excinfo.value.code == INVALID_REQUEST - assert excinfo.value.message == INVALID_REQUEST_MESSAGE - assert responses.assert_call_count(url, 1) is True - def test_create_cluster_successful(self, clusters_service, endpoint): # arrange - add response mock # create cluster @@ -193,9 +102,8 @@ def test_create_cluster_successful(self, clusters_service, endpoint): # act cluster = clusters_service.create( - name=CLUSTER_NAME, - instance_type=CLUSTER_INSTANCE_TYPE, - node_count=CLUSTER_NODE_COUNT, + hostname=CLUSTER_HOSTNAME, + cluster_type=CLUSTER_CLUSTER_TYPE, image=CLUSTER_IMAGE, description=CLUSTER_DESCRIPTION, ssh_key_ids=[SSH_KEY_ID], @@ -205,10 +113,10 @@ def test_create_cluster_successful(self, clusters_service, endpoint): # assert assert isinstance(cluster, Cluster) assert cluster.id == CLUSTER_ID - assert cluster.name == CLUSTER_NAME + assert cluster.hostname == CLUSTER_HOSTNAME assert cluster.description == CLUSTER_DESCRIPTION assert cluster.status == CLUSTER_STATUS - assert cluster.instance_type == CLUSTER_INSTANCE_TYPE + assert cluster.cluster_type == CLUSTER_CLUSTER_TYPE assert cluster.node_count == CLUSTER_NODE_COUNT assert cluster.ssh_key_ids == [SSH_KEY_ID] assert cluster.location == CLUSTER_LOCATION @@ -228,8 +136,8 @@ def test_create_cluster_failed(self, clusters_service, endpoint): # act with pytest.raises(APIException) as excinfo: 
clusters_service.create( - name=CLUSTER_NAME, - instance_type=CLUSTER_INSTANCE_TYPE, + name=CLUSTER_HOSTNAME, + cluster_type=CLUSTER_CLUSTER_TYPE, node_count=CLUSTER_NODE_COUNT, image=CLUSTER_IMAGE, description=CLUSTER_DESCRIPTION, @@ -246,7 +154,7 @@ def test_delete_cluster_successful(self, clusters_service, endpoint): responses.add(responses.DELETE, url, status=202) # act - result = clusters_service.delete(CLUSTER_ID) + result = clusters_service.action(CLUSTER_ID, 'delete') # assert assert result is None @@ -271,83 +179,3 @@ def test_delete_cluster_failed(self, clusters_service, endpoint): assert excinfo.value.message == INVALID_REQUEST_MESSAGE assert responses.assert_call_count(url, 1) is True - def test_scale_cluster_successful(self, clusters_service, endpoint): - # arrange - add response mock - new_node_count = 5 - scaled_payload = CLUSTER_PAYLOAD[0].copy() - scaled_payload['node_count'] = new_node_count - - # scale endpoint - scale_url = endpoint + '/' + CLUSTER_ID + '/scale' - responses.add(responses.PUT, scale_url, status=200) - - # get cluster by id - get_url = endpoint + '/' + CLUSTER_ID - responses.add(responses.GET, get_url, json=scaled_payload, status=200) - - # act - cluster = clusters_service.scale(CLUSTER_ID, new_node_count) - - # assert - assert isinstance(cluster, Cluster) - assert cluster.id == CLUSTER_ID - assert cluster.node_count == new_node_count - assert responses.assert_call_count(scale_url, 1) is True - assert responses.assert_call_count(get_url, 1) is True - - def test_scale_cluster_failed(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/' + CLUSTER_ID + '/scale' - responses.add( - responses.PUT, - url, - json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, - status=400, - ) - - # act - with pytest.raises(APIException) as excinfo: - clusters_service.scale(CLUSTER_ID, 5) - - # assert - assert excinfo.value.code == INVALID_REQUEST - assert excinfo.value.message == INVALID_REQUEST_MESSAGE 
- assert responses.assert_call_count(url, 1) is True - - def test_get_cluster_nodes_successful(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/' + CLUSTER_ID + '/nodes' - responses.add(responses.GET, url, json=NODES_PAYLOAD, status=200) - - # act - nodes = clusters_service.get_nodes(CLUSTER_ID) - - # assert - assert isinstance(nodes, list) - assert len(nodes) == CLUSTER_NODE_COUNT - assert isinstance(nodes[0], ClusterNode) - assert nodes[0].id == NODE_1_ID - assert nodes[0].instance_type == CLUSTER_INSTANCE_TYPE - assert nodes[0].status == 'running' - assert nodes[1].id == NODE_2_ID - assert nodes[2].id == NODE_3_ID - assert responses.assert_call_count(url, 1) is True - - def test_get_cluster_nodes_failed(self, clusters_service, endpoint): - # arrange - add response mock - url = endpoint + '/invalid_id/nodes' - responses.add( - responses.GET, - url, - json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, - status=400, - ) - - # act - with pytest.raises(APIException) as excinfo: - clusters_service.get_nodes('invalid_id') - - # assert - assert excinfo.value.code == INVALID_REQUEST - assert excinfo.value.message == INVALID_REQUEST_MESSAGE - assert responses.assert_call_count(url, 1) is True From 99668e8c7d35ce7083deddc7c6c9d5bb34162afc Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Fri, 19 Dec 2025 17:55:03 +0200 Subject: [PATCH 04/10] fix: polishing --- verda/clusters/_clusters.py | 6 +----- verda/constants.py | 7 ++----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index c0031c3..1f1011d 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -9,9 +9,6 @@ CLUSTERS_ENDPOINT = '/clusters' -ClusterStatus = Literal[ - 'creating', 'running', 'scaling', 'updating', 'deleting', 'deleted', 'error' -] @dataclass_json @@ -131,7 +128,7 @@ def create( """Creates and deploys a new compute cluster. 
Args: - name: Name for the cluster. + hostname: Name for the cluster. cluster_type: Cluster type. image: Image type or ID for cluster nodes. description: Human-readable description of the cluster. @@ -220,7 +217,6 @@ def is_available( Returns: True if the instance type is available, False otherwise. """ - is_spot = str(is_spot).lower() query_params = {'location_code': location_code} url = f'/cluster-availability/{cluster_type}' return self._http_client.get(url, query_params).json() diff --git a/verda/constants.py b/verda/constants.py index a339a42..777e7a7 100644 --- a/verda/constants.py +++ b/verda/constants.py @@ -60,12 +60,9 @@ class ClusterStatus: """Cluster status.""" ORDERED = 'ordered' - CREATING = 'creating' + PROVISIONING = 'provisioning' RUNNING = 'running' - SCALING = 'scaling' - UPDATING = 'updating' - DELETING = 'deleting' - DELETED = 'deleted' + DISCONTINUED = 'discontinued' ERROR = 'error' def __init__(self): From 7f86615c1cd8117e79e1a7dac5d8939329d67329 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 11:08:13 +0200 Subject: [PATCH 05/10] fix: format, lint and unit test fixing --- tests/integration_tests/conftest.py | 2 +- tests/unit_tests/clusters/test_clusters.py | 25 +++++++++++----------- verda/clusters/__init__.py | 2 +- verda/clusters/_clusters.py | 10 ++++++++- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 78a3260..3f695ab 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -9,7 +9,7 @@ Make sure to run the server and the account has enough balance before running the tests """ -BASE_URL = os.getenv('VERDA_BASE_URL', 'https://api.verda.com/v1') +BASE_URL = 'http://localhost:3010/v1' # Load env variables, make sure there's an env file with valid client credentials load_dotenv() diff --git a/tests/unit_tests/clusters/test_clusters.py b/tests/unit_tests/clusters/test_clusters.py index 
71d7e03..d080259 100644 --- a/tests/unit_tests/clusters/test_clusters.py +++ b/tests/unit_tests/clusters/test_clusters.py @@ -1,7 +1,7 @@ import pytest import responses # https://github.com/getsentry/responses -from verda.clusters import Cluster, ClusterWorkerNode, ClustersService +from verda.clusters import Cluster, ClustersService, ClusterWorkerNode from verda.constants import ErrorCodes, Locations from verda.exceptions import APIException @@ -95,7 +95,7 @@ def test_get_clusters(self, clusters_service, endpoint): def test_create_cluster_successful(self, clusters_service, endpoint): # arrange - add response mock # create cluster - responses.add(responses.POST, endpoint, body=CLUSTER_ID, status=200) + responses.add(responses.POST, endpoint, json={'id': CLUSTER_ID}, status=200) # get cluster by id url = endpoint + '/' + CLUSTER_ID responses.add(responses.GET, url, json=CLUSTER_PAYLOAD[0], status=200) @@ -117,7 +117,7 @@ def test_create_cluster_successful(self, clusters_service, endpoint): assert cluster.description == CLUSTER_DESCRIPTION assert cluster.status == CLUSTER_STATUS assert cluster.cluster_type == CLUSTER_CLUSTER_TYPE - assert cluster.node_count == CLUSTER_NODE_COUNT + assert len(cluster.worker_nodes) == CLUSTER_NODE_COUNT assert cluster.ssh_key_ids == [SSH_KEY_ID] assert cluster.location == CLUSTER_LOCATION assert cluster.image == CLUSTER_IMAGE @@ -136,11 +136,12 @@ def test_create_cluster_failed(self, clusters_service, endpoint): # act with pytest.raises(APIException) as excinfo: clusters_service.create( - name=CLUSTER_HOSTNAME, + hostname=CLUSTER_HOSTNAME, cluster_type=CLUSTER_CLUSTER_TYPE, - node_count=CLUSTER_NODE_COUNT, image=CLUSTER_IMAGE, description=CLUSTER_DESCRIPTION, + ssh_key_ids=[SSH_KEY_ID], + location=CLUSTER_LOCATION, ) # assert @@ -150,11 +151,11 @@ def test_create_cluster_failed(self, clusters_service, endpoint): def test_delete_cluster_successful(self, clusters_service, endpoint): # arrange - add response mock - url = endpoint + '/' + 
CLUSTER_ID - responses.add(responses.DELETE, url, status=202) + url = endpoint + responses.add(responses.PUT, url, status=202) # act - result = clusters_service.action(CLUSTER_ID, 'delete') + result = clusters_service.delete(CLUSTER_ID) # assert assert result is None @@ -162,10 +163,9 @@ def test_delete_cluster_successful(self, clusters_service, endpoint): def test_delete_cluster_failed(self, clusters_service, endpoint): # arrange - add response mock - url = endpoint + '/invalid_id' responses.add( - responses.DELETE, - url, + responses.PUT, + endpoint, json={'code': INVALID_REQUEST, 'message': INVALID_REQUEST_MESSAGE}, status=400, ) @@ -177,5 +177,4 @@ def test_delete_cluster_failed(self, clusters_service, endpoint): # assert assert excinfo.value.code == INVALID_REQUEST assert excinfo.value.message == INVALID_REQUEST_MESSAGE - assert responses.assert_call_count(url, 1) is True - + assert responses.assert_call_count(endpoint, 1) is True diff --git a/verda/clusters/__init__.py b/verda/clusters/__init__.py index 7ffe932..849bf71 100644 --- a/verda/clusters/__init__.py +++ b/verda/clusters/__init__.py @@ -1,5 +1,5 @@ """Clusters service for managing compute clusters.""" -from verda.clusters._clusters import Cluster, ClusterWorkerNode, ClustersService +from verda.clusters._clusters import Cluster, ClustersService, ClusterWorkerNode __all__ = ['Cluster', 'ClusterWorkerNode', 'ClustersService'] diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index 1f1011d..6fe7037 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -10,7 +10,6 @@ CLUSTERS_ENDPOINT = '/clusters' - @dataclass_json @dataclass class ClusterWorkerNode: @@ -203,6 +202,15 @@ def action(self, id_list: list[str] | str, action: str) -> None: self._http_client.put(CLUSTERS_ENDPOINT, json=payload) return + def delete(self, cluster_id: str) -> None: + """Deletes a cluster. + + Args: + cluster_id: ID of the cluster to delete. 
+ """ + self.action(cluster_id, 'delete') + return + def is_available( self, cluster_type: str, From 471e0894385bf51dab228326c4af5b16c533d34a Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 11:59:24 +0200 Subject: [PATCH 06/10] fix: integration tests --- tests/integration_tests/conftest.py | 3 +-- tests/integration_tests/test_clusters.py | 12 +++++++----- verda/clusters/_clusters.py | 1 - 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 3f695ab..28d36fb 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -9,12 +9,11 @@ Make sure to run the server and the account has enough balance before running the tests """ -BASE_URL = 'http://localhost:3010/v1' - # Load env variables, make sure there's an env file with valid client credentials load_dotenv() CLIENT_SECRET = os.getenv('VERDA_CLIENT_SECRET') CLIENT_ID = os.getenv('VERDA_CLIENT_ID') +BASE_URL = os.getenv('VERDA_BASE_URL', 'http://localhost:3010/v1') @pytest.fixture diff --git a/tests/integration_tests/test_clusters.py b/tests/integration_tests/test_clusters.py index 904f711..3e098af 100644 --- a/tests/integration_tests/test_clusters.py +++ b/tests/integration_tests/test_clusters.py @@ -19,9 +19,9 @@ def test_create_cluster(self, verda_client: VerdaClient): cluster = verda_client.clusters.create( hostname='test-instance', location=Locations.FIN_03, - cluster_type='16H200', + cluster_type='16B200', description='test instance', - image='ubuntu-22.04-cuda-12.4-cluster', + image='ubuntu-22.04-cuda-12.8-cluster', ssh_key_ids=[ssh_key.id], ) @@ -32,9 +32,11 @@ def test_create_cluster(self, verda_client: VerdaClient): or cluster.status == verda_client.constants.instance_status.RUNNING ) - assert cluster.worker_nodes is not None - assert len(cluster.worker_nodes) == 2 - assert cluster.ip is not None + # If still provisioning, we don't have worker nodes yet and ip is 
not available + if cluster.status != verda_client.constants.instance_status.PROVISIONING: + assert cluster.worker_nodes is not None + assert len(cluster.worker_nodes) == 2 + assert cluster.ip is not None print(cluster) diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index 6fe7037..7ba684b 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -55,7 +55,6 @@ class Cluster: created_at: str location: str cluster_type: str - node_count: int worker_nodes: list[ClusterWorkerNode] ssh_key_ids: list[str] image: str | None = None From f00901096baea33a813a0ec5b1cd170b3a259f68 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 17:00:02 +0200 Subject: [PATCH 07/10] fix: review fixes --- examples/clusters_example.py | 18 ++++-- tests/integration_tests/test_clusters.py | 36 ++++++++++-- verda/clusters/_clusters.py | 74 +++++++++++++++++++----- 3 files changed, 104 insertions(+), 24 deletions(-) diff --git a/examples/clusters_example.py b/examples/clusters_example.py index b0efdea..8f7696e 100644 --- a/examples/clusters_example.py +++ b/examples/clusters_example.py @@ -28,11 +28,20 @@ def create_cluster_example(): # Get SSH keys ssh_keys = [key.id for key in verda.ssh_keys.get()] - # Create a cluster with 3 nodes + # Check if cluster type is available + if not verda.clusters.is_available('16B200', Locations.FIN_03): + raise ValueError('Cluster type 16B200 is not available in FIN_03') + + # Get available images for cluster type + images = verda.clusters.get_cluster_images('16B200') + if 'ubuntu-22.04-cuda-12.9-cluster' not in images: + raise ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200') + + # Create a 16B200 cluster cluster = verda.clusters.create( hostname='my-compute-cluster', - cluster_type='16H200', - image='ubuntu-22.04-cuda-12.4-cluster', + cluster_type='16B200', + image='ubuntu-22.04-cuda-12.9-cluster', description='Example compute cluster for distributed training', 
ssh_key_ids=ssh_keys, location=Locations.FIN_03, @@ -40,11 +49,10 @@ def create_cluster_example(): shared_volume_size=30000, ) - print(f'Created cluster: {cluster.id}') + print(f'Creating cluster: {cluster.id}') print(f'Cluster hostname: {cluster.hostname}') print(f'Cluster status: {cluster.status}') print(f'Cluster cluster_type: {cluster.cluster_type}') - print(f'Cluster worker_nodes: {cluster.worker_nodes}') print(f'Location: {cluster.location}') return cluster diff --git a/tests/integration_tests/test_clusters.py b/tests/integration_tests/test_clusters.py index 3e098af..ec11e0c 100644 --- a/tests/integration_tests/test_clusters.py +++ b/tests/integration_tests/test_clusters.py @@ -1,3 +1,4 @@ +import logging import os import pytest @@ -5,6 +6,10 @@ from verda import VerdaClient from verda.constants import Locations +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger() + + IN_GITHUB_ACTIONS = os.getenv('GITHUB_ACTIONS') == 'true' @@ -15,21 +20,40 @@ def test_create_cluster(self, verda_client: VerdaClient): # get ssh key ssh_key = verda_client.ssh_keys.get()[0] + if not verda_client.clusters.is_available('16B200', Locations.FIN_03): + raise ValueError('Cluster type 16B200 is not available in FIN_03') + logger.debug('[x] Cluster type 16B200 is available in FIN_03') + + availabilities = verda_client.clusters.get_availabilities(Locations.FIN_03) + assert len(availabilities) > 0 + assert '16B200' in availabilities + logger.debug( + '[x] Cluster type 16B200 is one of the available cluster types in FIN_03: %s', + availabilities, + ) + + images = verda_client.clusters.get_cluster_images('16B200') + assert len(images) > 0 + assert 'ubuntu-22.04-cuda-12.9-cluster' in images + logger.debug('[x] Ubuntu 22.04 CUDA 12.9 cluster image is supported for 16B200') + # create instance cluster = verda_client.clusters.create( hostname='test-instance', location=Locations.FIN_03, cluster_type='16B200', description='test instance', - 
image='ubuntu-22.04-cuda-12.8-cluster', + image='ubuntu-22.04-cuda-12.9-cluster', ssh_key_ids=[ssh_key.id], + # Set to None to not wait for provisioning but return immediately + wait_for_status=verda_client.constants.cluster_status.PROVISIONING, ) # assert instance is created assert cluster.id is not None assert ( - cluster.status == verda_client.constants.instance_status.PROVISIONING - or cluster.status == verda_client.constants.instance_status.RUNNING + cluster.status == verda_client.constants.cluster_status.PROVISIONING + or cluster.status == verda_client.constants.cluster_status.RUNNING ) # If still provisioning, we don't have worker nodes yet and ip is not available @@ -38,7 +62,11 @@ def test_create_cluster(self, verda_client: VerdaClient): assert len(cluster.worker_nodes) == 2 assert cluster.ip is not None - print(cluster) + print(f'Creating cluster: {cluster.id}') + print(f'Cluster hostname: {cluster.hostname}') + print(f'Cluster status: {cluster.status}') + print(f'Cluster cluster_type: {cluster.cluster_type}') + print(f'Location: {cluster.location}') # delete instance # verda_client.clusters.action(cluster.id, 'delete') diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index 7ba684b..1c6cc97 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -1,14 +1,17 @@ import itertools import time from dataclasses import dataclass -from typing import Literal from dataclasses_json import dataclass_json -from verda.constants import Actions, Locations +from verda.constants import Actions, ClusterStatus, ErrorCodes, Locations +from verda.exceptions import APIException CLUSTERS_ENDPOINT = '/clusters' +# Default shared volume size is 30TB +DEFAULT_SHARED_VOLUME_SIZE = 30000 + @dataclass_json @dataclass @@ -16,7 +19,7 @@ class ClusterWorkerNode: """Represents a worker node in a cluster. Attributes: - id: Unique identifier for the node (instance ID). + id: Unique identifier for the node. status: Current status of the node. 
hostname: Network hostname of the node. private_ip: Private IP address of the node. @@ -40,7 +43,7 @@ class Cluster: status: Current operational status of the cluster. created_at: Timestamp of cluster creation. location: Datacenter location code (default: Locations.FIN_03). - cluster_type: Type of instances used for cluster nodes. + cluster_type: Type of the cluster. worker_nodes: List of nodes in the cluster. ssh_key_ids: List of SSH key IDs associated with the cluster nodes. image: Image ID or type used for cluster nodes. @@ -65,7 +68,7 @@ class Cluster: class ClustersService: """Service for managing compute clusters through the API. - This service provides methods to create, retrieve, scale, and manage compute clusters. + This service provides methods to create, retrieve, and manage compute clusters. """ def __init__(self, http_client) -> None: @@ -118,6 +121,7 @@ def create( shared_volume_name: str | None = None, shared_volume_size: int | None = None, *, + wait_for_status: str | None = ClusterStatus.PROVISIONING, max_wait_time: float = 900, initial_interval: float = 1.0, max_interval: float = 10, @@ -135,6 +139,7 @@ def create( startup_script_id: Optional ID of startup script to run on nodes. shared_volume_name: Optional name for the shared volume. shared_volume_size: Optional size for the shared volume, in GB, default to 30TB. + wait_for_status: Status to wait for the cluster to reach, default to PROVISIONING. If None, no wait is performed. 
max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 900) initial_interval: Initial interval, in seconds (default: 1.0) max_interval: The longest single delay allowed between retries, in seconds (default: 10) @@ -158,19 +163,28 @@ def create( 'startup_script_id': startup_script_id, 'shared_volume': { 'name': shared_volume_name if shared_volume_name else hostname + '-shared-volume', - 'size': shared_volume_size if shared_volume_size else 30000, + 'size': shared_volume_size if shared_volume_size else DEFAULT_SHARED_VOLUME_SIZE, }, } response = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).json() id = response['id'] + if not wait_for_status: + return self.get_by_id(id) + # Wait for cluster to enter creating state with timeout deadline = time.monotonic() + max_wait_time for i in itertools.count(): cluster = self.get_by_id(id) - if cluster.status != 'ordered': + if cluster.status == wait_for_status: return cluster + if cluster.status == ClusterStatus.ERROR: + raise APIException(ErrorCodes.SERVER_ERROR, f'Cluster {id} entered error state') + + if cluster.status == ClusterStatus.DISCONTINUED: + raise APIException(ErrorCodes.SERVER_ERROR, f'Cluster {id} was discontinued') + now = time.monotonic() if now >= deadline: raise TimeoutError( @@ -181,10 +195,10 @@ def create( time.sleep(interval) def action(self, id_list: list[str] | str, action: str) -> None: - """Performs an action on one or more instances. + """Performs an action on one or more clusters. Args: - id_list: Single instance ID or list of instance IDs to act upon. + id_list: Single cluster ID or list of cluster IDs to act upon. action: Action to perform on the clusters. Only `delete` is supported. Raises: @@ -215,20 +229,21 @@ def is_available( cluster_type: str, location_code: str | None = None, ) -> bool: - """Checks if a specific instance type is available for deployment. + """Checks if a specific cluster type is available for deployment. 
Args:
-            cluster_type: Type of instance to check availability for.
+            cluster_type: Type of cluster to check availability for.
             location_code: Optional datacenter location code.
 
         Returns:
-            True if the instance type is available, False otherwise.
+            True if the cluster type is available, False otherwise.
         """
         query_params = {'location_code': location_code}
         url = f'/cluster-availability/{cluster_type}'
-        return self._http_client.get(url, query_params).json()
+        response = self._http_client.get(url, query_params).text
+        return response == 'true'
 
-    def get_availabilities(self, location_code: str | None = None) -> list[dict]:
+    def get_availabilities(self, location_code: str | None = None) -> list[str]:
         """Retrieves a list of available cluster types across locations.
 
         Args:
@@ -238,4 +253,33 @@
             List of available cluster types and their details.
         """
         query_params = {'location_code': location_code}
-        return self._http_client.get('/cluster-availability', params=query_params).json()
+        response = self._http_client.get('/cluster-availability', params=query_params).json()
+        availabilities = response[0]['availabilities']
+        return availabilities
+
+    def get_availability(self, cluster_type: str, location_code: str | None = None) -> list[dict]:
+        """Checks availability of a specific cluster type (unimplemented stub).
+
+        Args:
+            cluster_type: Type of cluster to check availability for.
+            location_code: Optional datacenter location code.
+
+        Returns:
+            None — body not implemented yet; use ``is_available`` instead.
+        """
+
+    def get_cluster_images(
+        self,
+        cluster_type: str | None = None,
+    ) -> list[str]:
+        """Retrieves a list of available images for a given cluster type (optional).
+
+        Args:
+            cluster_type: Type of cluster to get images for.
+
+        Returns:
+            List of available images for the given cluster type.
+ """ + query_params = {'instance_type': cluster_type} + images = self._http_client.get('/images/cluster', params=query_params).json() + return [image['image_type'] for image in images] From d2c3a042c912bc4ff3d94d05669edbef5822ab53 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 17:37:53 +0200 Subject: [PATCH 08/10] fix: full features cluster example, add integration test --- examples/clusters_example.py | 19 +++++++++++++++---- tests/integration_tests/test_clusters.py | 13 +++++-------- verda/clusters/_clusters.py | 10 +++++----- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/examples/clusters_example.py b/examples/clusters_example.py index 8f7696e..9c76149 100644 --- a/examples/clusters_example.py +++ b/examples/clusters_example.py @@ -6,11 +6,11 @@ - List all clusters - Get a specific cluster by ID - Get cluster nodes -- Scale a cluster - Delete a cluster """ import os +import time from verda import VerdaClient from verda.constants import Actions, Locations @@ -18,9 +18,10 @@ # Get credentials from environment variables CLIENT_ID = os.environ.get('VERDA_CLIENT_ID') CLIENT_SECRET = os.environ.get('VERDA_CLIENT_SECRET') +BASE_URL = os.environ.get('VERDA_BASE_URL', 'https://api.verda.com/v1') # Create client -verda = VerdaClient(CLIENT_ID, CLIENT_SECRET) +verda = VerdaClient(CLIENT_ID, CLIENT_SECRET, base_url=BASE_URL) def create_cluster_example(): @@ -34,19 +35,20 @@ def create_cluster_example(): # Get available images for cluster type images = verda.clusters.get_cluster_images('16B200') - if 'ubuntu-22.04-cuda-12.9-cluster' not in images: + if 'ubuntu-22.04-cuda-12.4-cluster' not in images: raise ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200') # Create a 16B200 cluster cluster = verda.clusters.create( hostname='my-compute-cluster', cluster_type='16B200', - image='ubuntu-22.04-cuda-12.9-cluster', + image='ubuntu-22.04-cuda-12.4-cluster', description='Example compute cluster for distributed 
training', ssh_key_ids=ssh_keys, location=Locations.FIN_03, shared_volume_name='my-shared-volume', shared_volume_size=30000, + wait_for_status=None, ) print(f'Creating cluster: {cluster.id}') @@ -55,6 +57,15 @@ def create_cluster_example(): print(f'Cluster cluster_type: {cluster.cluster_type}') print(f'Location: {cluster.location}') + # Wait for cluster to enter RUNNING status + while cluster.status != verda.constants.cluster_status.RUNNING: + time.sleep(2) + print(f'Waiting for cluster to enter RUNNING status... (status: {cluster.status})') + cluster = verda.clusters.get_by_id(cluster.id) + + print(f'Public IP: {cluster.ip}') + print('Cluster is now running and ready to use!') + return cluster diff --git a/tests/integration_tests/test_clusters.py b/tests/integration_tests/test_clusters.py index ec11e0c..0ab6d06 100644 --- a/tests/integration_tests/test_clusters.py +++ b/tests/integration_tests/test_clusters.py @@ -62,11 +62,8 @@ def test_create_cluster(self, verda_client: VerdaClient): assert len(cluster.worker_nodes) == 2 assert cluster.ip is not None - print(f'Creating cluster: {cluster.id}') - print(f'Cluster hostname: {cluster.hostname}') - print(f'Cluster status: {cluster.status}') - print(f'Cluster cluster_type: {cluster.cluster_type}') - print(f'Location: {cluster.location}') - - # delete instance - # verda_client.clusters.action(cluster.id, 'delete') + # Now we need to wait for RUNNING status to connect to the jumphost (public IP is available) + # After that, we can connect to the jumphost and run commands on the cluster nodes: + # + # ssh -i ssh_key.pem root@ + # diff --git a/verda/clusters/_clusters.py b/verda/clusters/_clusters.py index 1c6cc97..8558db3 100644 --- a/verda/clusters/_clusters.py +++ b/verda/clusters/_clusters.py @@ -204,13 +204,13 @@ def action(self, id_list: list[str] | str, action: str) -> None: Raises: HTTPError: If the action fails or other API error occurs. 
""" - if type(id_list) is str: - id_list = [id_list] + if action != Actions.DELETE: + raise ValueError(f'Invalid action: {action}. Only DELETE is supported.') - if action == Actions.DELETE: - payload = {'id': id_list, 'action': 'discontinue'} + if type(id_list) is str: + payload = {'actions': [{'id': id_list, 'action': 'discontinue'}]} else: - raise ValueError(f'Invalid action: {action}. Only DELETE is supported.') + payload = {'actions': [{'id': id, 'action': action} for id in id_list]} self._http_client.put(CLUSTERS_ENDPOINT, json=payload) return From f5c275c1323c5f535510da23dfd11aa986097174 Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 17:39:55 +0200 Subject: [PATCH 09/10] fix: unit tests --- tests/unit_tests/clusters/test_clusters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit_tests/clusters/test_clusters.py b/tests/unit_tests/clusters/test_clusters.py index d080259..a69ae03 100644 --- a/tests/unit_tests/clusters/test_clusters.py +++ b/tests/unit_tests/clusters/test_clusters.py @@ -108,6 +108,7 @@ def test_create_cluster_successful(self, clusters_service, endpoint): description=CLUSTER_DESCRIPTION, ssh_key_ids=[SSH_KEY_ID], location=CLUSTER_LOCATION, + wait_for_status=CLUSTER_STATUS, ) # assert From 7f9a1c5cf3662d3cd6bf58d1034bd9e24ce1968a Mon Sep 17 00:00:00 2001 From: Ruslan Gainutdinov Date: Mon, 22 Dec 2025 18:01:06 +0200 Subject: [PATCH 10/10] fix: revert to the correct OS images in prod --- examples/clusters_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/clusters_example.py b/examples/clusters_example.py index 9c76149..8aeadf0 100644 --- a/examples/clusters_example.py +++ b/examples/clusters_example.py @@ -35,14 +35,14 @@ def create_cluster_example(): # Get available images for cluster type images = verda.clusters.get_cluster_images('16B200') - if 'ubuntu-22.04-cuda-12.4-cluster' not in images: + if 'ubuntu-22.04-cuda-12.9-cluster' not in images: raise 
ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200') # Create a 16B200 cluster cluster = verda.clusters.create( hostname='my-compute-cluster', cluster_type='16B200', - image='ubuntu-22.04-cuda-12.4-cluster', + image='ubuntu-22.04-cuda-12.9-cluster', description='Example compute cluster for distributed training', ssh_key_ids=ssh_keys, location=Locations.FIN_03,