Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 121 additions & 1 deletion debug_gym/gym/terminals/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,114 @@ def _clean_for_kubernetes(name: str) -> str:
return cleaned[:253]


# Preset factories for common Kubernetes scheduling configurations.
def _same_host_affinity(hostname_key):
    """Return a required node affinity matching a templated hostname.

    The hostname is emitted as a ``{{<hostname_key>}}`` placeholder, not as a
    concrete value.
    """
    # Jinja2 template syntax: {{HOSTNAME}} -> value from env var
    placeholder = "{{" + hostname_key + "}}"
    hostname_match = {
        "key": "kubernetes.io/hostname",
        "operator": "In",
        "values": [placeholder],
    }
    return {
        "nodeAffinity": {
            "requiredDuringSchedulingIgnoredDuringExecution": {
                "nodeSelectorTerms": [{"matchExpressions": [hostname_match]}]
            }
        }
    }


# Maps an affinity mode name to a callable taking the hostname key.
AFFINITY_PRESETS = {"same_host": _same_host_affinity}

def _equal_no_schedule_toleration(taint_key, taint_value):
    """Return a fresh ``Equal``/``NoSchedule`` toleration for one taint."""
    return {
        "key": taint_key,
        "operator": "Equal",
        "value": taint_value,
        "effect": "NoSchedule",
    }


# Maps a toleration preset name to a single toleration entry.
# "spot" and "azure_spot" describe the same taint but are separate objects.
TOLERATION_PRESETS = {
    "spot": _equal_no_schedule_toleration(
        "kubernetes.azure.com/scalesetpriority", "spot"
    ),
    "azure_spot": _equal_no_schedule_toleration(
        "kubernetes.azure.com/scalesetpriority", "spot"
    ),
    "critical": _equal_no_schedule_toleration("CriticalAddonsOnly", "true"),
}


def _build_pod_spec_from_shortcuts(
affinity_mode: str | None = None,
affinity_hostname_key: str = "HOSTNAME",
tolerations_preset: str | list[str] | None = None,
) -> dict:
"""Build pod_spec_kwargs from convenient shortcut parameters.

Args:
affinity_mode: Affinity preset name. Currently supports:
- "same_host": Schedule pod on a specific host identified by the
environment variable specified in affinity_hostname_key.
affinity_hostname_key: Environment variable name containing the hostname
for "same_host" affinity mode. Defaults to "HOSTNAME".
tolerations_preset: Toleration preset name(s). Can be a single string
or a list of strings. Currently supports:
- "spot" or "azure_spot": Tolerate Azure spot instance nodes
- "critical": Tolerate CriticalAddonsOnly taint

Returns:
A dictionary suitable for use as pod_spec_kwargs.
"""
pod_spec = {}

# Build affinity from preset
if affinity_mode:
if affinity_mode not in AFFINITY_PRESETS:
raise ValueError(
f"Unknown affinity_mode '{affinity_mode}'. "
f"Available modes: {list(AFFINITY_PRESETS.keys())}"
)
pod_spec["affinity"] = AFFINITY_PRESETS[affinity_mode](affinity_hostname_key)

# Build tolerations from preset(s)
if tolerations_preset:
if isinstance(tolerations_preset, str):
tolerations_preset = [tolerations_preset]

tolerations = []
for preset in tolerations_preset:
if preset not in TOLERATION_PRESETS:
raise ValueError(
f"Unknown tolerations_preset '{preset}'. "
f"Available presets: {list(TOLERATION_PRESETS.keys())}"
)
tolerations.append(TOLERATION_PRESETS[preset])
pod_spec["tolerations"] = tolerations

return pod_spec


def _deep_merge_dicts(base: dict, override: dict) -> dict:
"""Deep merge two dictionaries, with override taking precedence.

For nested dictionaries, merges recursively. For lists, override replaces base.
"""
result = base.copy()
for key, value in override.items():
if key in result and isinstance(result[key], dict) and isinstance(value, dict):
result[key] = _deep_merge_dicts(result[key], value)
else:
result[key] = value
return result


class Pod:
def __init__(
self, k8s_client: client.CoreV1Api, pod_body: dict, logger: DebugGymLogger
Expand Down Expand Up @@ -277,6 +385,10 @@ def __init__(
kube_context: str | None = None,
extra_labels: dict | None = None,
pod_spec_kwargs: dict = None,
# Convenient shortcut parameters for common pod spec configurations
affinity_mode: str | None = None,
affinity_hostname_key: str = "HOSTNAME",
tolerations_preset: str | list[str] | None = None,
**kwargs,
):
super().__init__(
Expand All @@ -297,7 +409,15 @@ def __init__(
self.kubernetes_kwargs = kwargs # e.g., nodeSelector, tolerations
self.registry = registry.rstrip("/") + "/" if registry else ""
self._pod_name = pod_name
self.pod_spec_kwargs = pod_spec_kwargs or {}

# Build pod_spec_kwargs from shortcuts and merge with explicit pod_spec_kwargs
shortcut_spec = _build_pod_spec_from_shortcuts(
affinity_mode=affinity_mode,
affinity_hostname_key=affinity_hostname_key,
tolerations_preset=tolerations_preset,
)
explicit_spec = pod_spec_kwargs or {}
self.pod_spec_kwargs = _deep_merge_dicts(shortcut_spec, explicit_spec)
user = _clean_for_kubernetes(os.environ.get("USER", "unknown"))
self.labels = {"app": "dbg-gym", "user": user} | (extra_labels or {})
self._pod = None
Expand Down
172 changes: 172 additions & 0 deletions tests/gym/terminals/test_kubernetes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import platform
import subprocess
import time
from unittest.mock import MagicMock, patch

import pytest

Expand Down Expand Up @@ -321,3 +323,173 @@ def test_kubernetes_terminal_readonly_properties_after_pod_creation():
terminal.working_dir = "/new/path"

terminal.close()


# Tests for pod spec shortcut functionality (don't require Kubernetes cluster)


def test_build_pod_spec_from_shortcuts_affinity_same_host():
    """The same_host mode yields a hostname match on the templated key."""
    from debug_gym.gym.terminals.kubernetes import _build_pod_spec_from_shortcuts

    pod_spec = _build_pod_spec_from_shortcuts(
        affinity_mode="same_host", affinity_hostname_key="MY_HOST"
    )

    assert "affinity" in pod_spec

    # Walk down to the first match expression of the first selector term.
    scheduling_rule = pod_spec["affinity"]["nodeAffinity"][
        "requiredDuringSchedulingIgnoredDuringExecution"
    ]
    expression = scheduling_rule["nodeSelectorTerms"][0]["matchExpressions"][0]

    expected_fields = {
        "key": "kubernetes.io/hostname",
        "operator": "In",
        "values": ["{{MY_HOST}}"],
    }
    for field, expected in expected_fields.items():
        assert expression[field] == expected


def test_build_pod_spec_from_shortcuts_tolerations_single():
    """A single preset name given as a string yields exactly one toleration."""
    from debug_gym.gym.terminals.kubernetes import _build_pod_spec_from_shortcuts

    pod_spec = _build_pod_spec_from_shortcuts(tolerations_preset="spot")

    assert "tolerations" in pod_spec
    tolerations = pod_spec["tolerations"]
    assert len(tolerations) == 1

    spot_toleration = tolerations[0]
    assert spot_toleration["key"] == "kubernetes.azure.com/scalesetpriority"
    assert spot_toleration["value"] == "spot"


def test_build_pod_spec_from_shortcuts_tolerations_multiple():
    """A list of preset names yields one toleration per preset."""
    from debug_gym.gym.terminals.kubernetes import _build_pod_spec_from_shortcuts

    pod_spec = _build_pod_spec_from_shortcuts(tolerations_preset=["spot", "critical"])

    assert "tolerations" in pod_spec
    tolerations = pod_spec["tolerations"]
    assert len(tolerations) == 2

    # Order is not asserted, only that both taint keys are tolerated.
    taint_keys = {toleration["key"] for toleration in tolerations}
    for expected_key in ("kubernetes.azure.com/scalesetpriority", "CriticalAddonsOnly"):
        assert expected_key in taint_keys


def test_build_pod_spec_from_shortcuts_combined():
    """Affinity and toleration shortcuts can be requested in the same call."""
    from debug_gym.gym.terminals.kubernetes import _build_pod_spec_from_shortcuts

    shortcut_kwargs = {
        "affinity_mode": "same_host",
        "tolerations_preset": ["spot", "critical"],
    }
    pod_spec = _build_pod_spec_from_shortcuts(**shortcut_kwargs)

    for section in ("affinity", "tolerations"):
        assert section in pod_spec
    assert len(pod_spec["tolerations"]) == 2


def test_build_pod_spec_from_shortcuts_invalid_affinity_mode():
    """An unrecognized affinity_mode is rejected with a ValueError."""
    from debug_gym.gym.terminals.kubernetes import _build_pod_spec_from_shortcuts

    expected_message = "Unknown affinity_mode 'invalid'"
    with pytest.raises(ValueError, match=expected_message):
        _build_pod_spec_from_shortcuts(affinity_mode="invalid")


def test_build_pod_spec_from_shortcuts_invalid_tolerations_preset():
    """An unrecognized tolerations_preset is rejected with a ValueError."""
    from debug_gym.gym.terminals.kubernetes import _build_pod_spec_from_shortcuts

    expected_message = "Unknown tolerations_preset 'invalid'"
    with pytest.raises(ValueError, match=expected_message):
        _build_pod_spec_from_shortcuts(tolerations_preset="invalid")


def test_deep_merge_dicts():
    """Nested dicts merge key by key; lists and scalars are replaced."""
    from debug_gym.gym.terminals.kubernetes import _deep_merge_dicts

    defaults = {"a": 1, "b": {"c": 2, "d": 3}, "e": [1, 2]}
    overrides = {"b": {"c": 10, "f": 4}, "e": [3, 4], "g": 5}

    merged = _deep_merge_dicts(defaults, overrides)

    # Top-level keys: kept from base, replaced list, added by override.
    assert merged["a"] == 1
    assert merged["e"] == [3, 4]
    assert merged["g"] == 5

    # Nested dict: override wins, base-only key survives, new key is added.
    nested = merged["b"]
    assert nested["c"] == 10
    assert nested["d"] == 3
    assert nested["f"] == 4


@patch("debug_gym.gym.terminals.kubernetes.config.load_kube_config")
@patch("debug_gym.gym.terminals.kubernetes.client.CoreV1Api")
def test_kubernetes_terminal_with_affinity_mode(mock_api, mock_config, monkeypatch):
    """The affinity_mode shortcut ends up in the terminal's pod_spec_kwargs."""
    monkeypatch.setenv("MY_NODE", "test-node-1")

    terminal = KubernetesTerminal(
        base_image="ubuntu:latest",
        affinity_mode="same_host",
        affinity_hostname_key="MY_NODE",
    )
    pod_spec = terminal.pod_spec_kwargs

    assert "affinity" in pod_spec
    scheduling_rule = pod_spec["affinity"]["nodeAffinity"][
        "requiredDuringSchedulingIgnoredDuringExecution"
    ]
    expression = scheduling_rule["nodeSelectorTerms"][0]["matchExpressions"][0]
    # The stored spec holds the placeholder, not the env var's value.
    assert expression["values"] == ["{{MY_NODE}}"]


@patch("debug_gym.gym.terminals.kubernetes.config.load_kube_config")
@patch("debug_gym.gym.terminals.kubernetes.client.CoreV1Api")
def test_kubernetes_terminal_with_tolerations_preset(mock_api, mock_config):
    """The tolerations_preset shortcut ends up in the terminal's pod_spec_kwargs."""
    requested_presets = ["spot", "critical"]
    terminal = KubernetesTerminal(
        base_image="ubuntu:latest",
        tolerations_preset=requested_presets,
    )
    pod_spec = terminal.pod_spec_kwargs

    assert "tolerations" in pod_spec
    assert len(pod_spec["tolerations"]) == 2


@patch("debug_gym.gym.terminals.kubernetes.config.load_kube_config")
@patch("debug_gym.gym.terminals.kubernetes.client.CoreV1Api")
def test_kubernetes_terminal_shortcuts_with_explicit_pod_spec_kwargs(
    mock_api, mock_config
):
    """Explicit pod_spec_kwargs take precedence over shortcut-generated values."""
    custom_tolerations = [{"key": "custom", "value": "value", "effect": "NoSchedule"}]
    explicit_spec = {"tolerations": custom_tolerations}

    # The preset alone would add the spot toleration; the explicit list wins.
    terminal = KubernetesTerminal(
        base_image="ubuntu:latest",
        tolerations_preset="spot",
        pod_spec_kwargs=explicit_spec,
    )

    assert terminal.pod_spec_kwargs["tolerations"] == custom_tolerations


@patch("debug_gym.gym.terminals.kubernetes.config.load_kube_config")
@patch("debug_gym.gym.terminals.kubernetes.client.CoreV1Api")
def test_kubernetes_terminal_shortcuts_merge_with_pod_spec_kwargs(
    mock_api, mock_config
):
    """Shortcut values and unrelated pod_spec_kwargs keys coexist after merging."""
    explicit_spec = {"nodeSelector": {"disktype": "ssd"}}
    terminal = KubernetesTerminal(
        base_image="ubuntu:latest",
        affinity_mode="same_host",
        pod_spec_kwargs=explicit_spec,
    )
    pod_spec = terminal.pod_spec_kwargs

    # One key comes from the shortcut, the other from the explicit kwargs.
    for section in ("affinity", "nodeSelector"):
        assert section in pod_spec
    assert pod_spec["nodeSelector"]["disktype"] == "ssd"
Loading