Add RuntimeClass support for unlimited RLIMIT_MEMLOCK

The previous approach of mounting cri-base.json into kind nodes failed because containerd was never told to use the file: no containerdConfigPatches entry referenced it. RuntimeClass allows different stacks to have different rlimit profiles, which is essential since kind only supports one cluster per host and multiple stacks share the same cluster.

Changes:
- Add containerdConfigPatches to kind-config.yml to define runtime handlers
- Create RuntimeClass resources after cluster creation
- Add runtimeClassName to pod specs based on the stack's security settings
- Rename cri-base.json to high-memlock-spec.json for clarity
- Add get_runtime_class() method to Spec that auto-derives from the unlimited-memlock setting

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 01:58:38 -05:00 · 2026-01-22 01:58:38 -05:00 · 87db167d7f
parent dd856af2d3
commit 87db167d7f
5 changed files with 134 additions and 25 deletions
--- a/stack_orchestrator/constants.py
+++ b/stack_orchestrator/constants.py
@@ -41,3 +41,6 @@ kind_config_filename = "kind-config.yml"
 kube_config_filename = "kubeconfig.yml"
 cri_base_filename = "cri-base.json"
 unlimited_memlock_key = "unlimited-memlock"
+runtime_class_key = "runtime-class"
+high_memlock_runtime = "high-memlock"
+high_memlock_spec_filename = "high-memlock-spec.json"
--- a/stack_orchestrator/deploy/k8s/cluster_info.py
+++ b/stack_orchestrator/deploy/k8s/cluster_info.py
@@ -531,6 +531,7 @@ class ClusterInfo:
                volumes=volumes,
                affinity=affinity,
                tolerations=tolerations,
+                runtime_class_name=self.spec.get_runtime_class(),
            ),
        )
        spec = client.V1DeploymentSpec(
--- a/stack_orchestrator/deploy/k8s/deploy_k8s.py
+++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py
@@ -37,7 +37,7 @@ from stack_orchestrator.deploy.k8s.helpers import (
 )
 from stack_orchestrator.deploy.k8s.helpers import (
    generate_kind_config,
-    generate_cri_base_json,
+    generate_high_memlock_spec_json,
 )
 from stack_orchestrator.deploy.k8s.cluster_info import ClusterInfo
 from stack_orchestrator.opts import opts
@@ -59,6 +59,36 @@ def _check_delete_exception(e: ApiException) -> None:
        error_exit(f"k8s api error: {e}")


+def _create_runtime_class(name: str, handler: str):
+    """Create a RuntimeClass resource for custom containerd runtime handlers.
+
+    RuntimeClass allows pods to specify which runtime handler to use, enabling
+    different pods to have different rlimit profiles (e.g., high-memlock).
+
+    Args:
+        name: The name of the RuntimeClass resource
+        handler: The containerd runtime handler name
+            (must match containerdConfigPatches)
+    """
+    api = client.NodeV1Api()
+    runtime_class = client.V1RuntimeClass(
+        api_version="node.k8s.io/v1",
+        kind="RuntimeClass",
+        metadata=client.V1ObjectMeta(name=name),
+        handler=handler,
+    )
+    try:
+        api.create_runtime_class(runtime_class)
+        if opts.o.debug:
+            print(f"Created RuntimeClass: {name}")
+    except ApiException as e:
+        if e.status == 409:  # Already exists
+            if opts.o.debug:
+                print(f"RuntimeClass {name} already exists")
+        else:
+            raise
+
+
 class K8sDeployer(Deployer):
    name: str = "k8s"
    type: str
@@ -275,6 +305,12 @@ class K8sDeployer(Deployer):
                # Wait for ingress to start
                # (deployment provisioning will fail unless this is done)
                wait_for_ingress_in_kind()
+                # Create RuntimeClass if unlimited_memlock is enabled
+                if self.cluster_info.spec.get_unlimited_memlock():
+                    _create_runtime_class(
+                        constants.high_memlock_runtime,
+                        constants.high_memlock_runtime,
+                    )

        else:
            print("Dry run mode enabled, skipping k8s API connect")
@@ -669,17 +705,19 @@ class K8sDeployerConfigGenerator(DeployerConfigGenerator):
    def generate(self, deployment_dir: Path):
        # No need to do this for the remote k8s case
        if self.type == "k8s-kind":
-            # Generate cri-base.json if unlimited_memlock is enabled.
+            # Generate high-memlock-spec.json if unlimited_memlock is enabled.
            # Must be done before generate_kind_config() which references it.
            if self.deployment_context.spec.get_unlimited_memlock():
-                cri_base_content = generate_cri_base_json()
-                cri_base_file = deployment_dir.joinpath(constants.cri_base_filename)
+                spec_content = generate_high_memlock_spec_json()
+                spec_file = deployment_dir.joinpath(
+                    constants.high_memlock_spec_filename
+                )
                if opts.o.debug:
                    print(
-                        f"Creating cri-base.json for unlimited memlock: {cri_base_file}"
+                        f"Creating high-memlock spec for unlimited memlock: {spec_file}"
                    )
-                with open(cri_base_file, "w") as output_file:
-                    output_file.write(cri_base_content)
+                with open(spec_file, "w") as output_file:
+                    output_file.write(spec_content)

            # Check the file isn't already there
            # Get the config file contents
--- a/stack_orchestrator/deploy/k8s/helpers.py
+++ b/stack_orchestrator/deploy/k8s/helpers.py
@@ -317,17 +317,19 @@ def _generate_kind_port_mappings(parsed_pod_files):
    )


-def _generate_cri_base_mount(deployment_dir: Path):
-    """Generate the extraMount entry for cri-base.json to set RLIMIT_MEMLOCK."""
-    cri_base_path = deployment_dir.joinpath(constants.cri_base_filename).resolve()
-    return (
-        f"  - hostPath: {cri_base_path}\n"
-        f"    containerPath: /etc/containerd/cri-base.json\n"
-    )
+def _generate_high_memlock_spec_mount(deployment_dir: Path):
+    """Generate the extraMount entry for high-memlock-spec.json.
+
+    The spec file must be mounted at the same path inside the kind node
+    as it appears on the host, because containerd's base_runtime_spec
+    references an absolute path.
+    """
+    spec_path = deployment_dir.joinpath(constants.high_memlock_spec_filename).resolve()
+    return f"  - hostPath: {spec_path}\n" f"    containerPath: {spec_path}\n"


-def generate_cri_base_json():
-    """Generate cri-base.json content with unlimited RLIMIT_MEMLOCK.
+def generate_high_memlock_spec_json():
+    """Generate OCI spec JSON with unlimited RLIMIT_MEMLOCK.

    This is needed for workloads like Solana validators that require large
    amounts of locked memory for memory-mapped files during snapshot decompression.
@@ -339,7 +341,7 @@ def generate_cri_base_json():

    # Use maximum 64-bit signed integer value for unlimited
    max_rlimit = 9223372036854775807
-    cri_base = {
+    spec = {
        "ociVersion": "1.0.2-dev",
        "process": {
            "rlimits": [
@@ -348,7 +350,36 @@ def generate_cri_base_json():
            ]
        },
    }
-    return json.dumps(cri_base, indent=2)
+    return json.dumps(spec, indent=2)
+
+
+# Keep old name as alias for backward compatibility
+def generate_cri_base_json():
+    """Deprecated: Use generate_high_memlock_spec_json() instead."""
+    return generate_high_memlock_spec_json()
+
+
+def _generate_containerd_config_patches(
+    deployment_dir: Path, has_high_memlock: bool
+) -> str:
+    """Generate containerdConfigPatches YAML for custom runtime handlers.
+
+    This configures containerd to have a runtime handler named 'high-memlock'
+    that uses a custom OCI base spec with unlimited RLIMIT_MEMLOCK.
+    """
+    if not has_high_memlock:
+        return ""
+
+    spec_path = deployment_dir.joinpath(constants.high_memlock_spec_filename).resolve()
+    runtime_name = constants.high_memlock_runtime
+    plugin_path = 'plugins."io.containerd.grpc.v1.cri".containerd.runtimes'
+    return (
+        "containerdConfigPatches:\n"
+        "  - |-\n"
+        f"    [{plugin_path}.{runtime_name}]\n"
+        '      runtime_type = "io.containerd.runc.v2"\n'
+        f'      base_runtime_spec = "{spec_path}"\n'
+    )


 # Note: this makes any duplicate definition in b overwrite a
@@ -430,19 +461,30 @@ def generate_kind_config(deployment_dir: Path, deployment_context):
        parsed_pod_files_map, deployment_dir, deployment_context
    )

-    # Check if unlimited_memlock is enabled and add cri-base.json mount
+    # Check if unlimited_memlock is enabled
    unlimited_memlock = deployment_context.spec.get_unlimited_memlock()
+
+    # Generate containerdConfigPatches for RuntimeClass support
+    containerd_patches_yml = _generate_containerd_config_patches(
+        deployment_dir, unlimited_memlock
+    )
+
+    # Add high-memlock spec file mount if needed
    if unlimited_memlock:
-        cri_base_mount = _generate_cri_base_mount(deployment_dir)
+        spec_mount = _generate_high_memlock_spec_mount(deployment_dir)
        if mounts_yml:
            # Append to existing mounts
-            mounts_yml = mounts_yml.rstrip() + "\n" + cri_base_mount
+            mounts_yml = mounts_yml.rstrip() + "\n" + spec_mount
        else:
-            mounts_yml = f"  extraMounts:\n{cri_base_mount}"
+            mounts_yml = f"  extraMounts:\n{spec_mount}"

-    return (
-        "kind: Cluster\n"
-        "apiVersion: kind.x-k8s.io/v1alpha4\n"
+    # Build the config - containerdConfigPatches must be at cluster level (before nodes)
+    config = "kind: Cluster\n" "apiVersion: kind.x-k8s.io/v1alpha4\n"
+
+    if containerd_patches_yml:
+        config += containerd_patches_yml
+
+    config += (
        "nodes:\n"
        "- role: control-plane\n"
        "  kubeadmConfigPatches:\n"
@@ -454,3 +496,5 @@ def generate_kind_config(deployment_dir: Path, deployment_context):
        f"{port_mappings_yml}\n"
        f"{mounts_yml}\n"
    )
+
+    return config
--- a/stack_orchestrator/deploy/spec.py
+++ b/stack_orchestrator/deploy/spec.py
@@ -153,6 +153,29 @@ class Spec:
            ).lower()
        )

+    def get_runtime_class(self):
+        """Get runtime class name from spec, or derive from security settings.
+
+        The runtime class determines which containerd runtime handler to use,
+        allowing different pods to have different rlimit profiles (e.g., for
+        unlimited RLIMIT_MEMLOCK).
+
+        Returns:
+            Runtime class name string, or None to use default runtime.
+        """
+        # Explicit runtime class takes precedence
+        explicit = self.obj.get(constants.security_key, {}).get(
+            constants.runtime_class_key, None
+        )
+        if explicit:
+            return explicit
+
+        # Auto-derive from unlimited-memlock setting
+        if self.get_unlimited_memlock():
+            return constants.high_memlock_runtime
+
+        return None  # Use default runtime
+
    def get_deployment_type(self):
        return self.obj.get(constants.deploy_to_key)