From 87db167d7f9e113444ff6939580e0ed0bb2bfd8a Mon Sep 17 00:00:00 2001
From: "A. F. Dudley" <a.frederick.dudley@gmail.com>
Date: Thu, 22 Jan 2026 01:58:38 -0500
Subject: [PATCH] Add RuntimeClass support for unlimited RLIMIT_MEMLOCK

The previous approach of mounting cri-base.json into kind nodes failed
because we didn't tell containerd to use it via containerdConfigPatches.

RuntimeClass allows different stacks to have different rlimit profiles,
which is essential since kind only supports one cluster per host and
multiple stacks share the same cluster.

Changes:
- Add containerdConfigPatches to kind-config.yml to define runtime handlers
- Create RuntimeClass resources after cluster creation
- Add runtimeClassName to pod specs based on stack's security settings
- Rename cri-base.json to high-memlock-spec.json for clarity
- Add get_runtime_class() method to Spec that auto-derives the runtime
  class from the unlimited-memlock setting unless set explicitly

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 stack_orchestrator/constants.py               |  3 +
 stack_orchestrator/deploy/k8s/cluster_info.py |  1 +
 stack_orchestrator/deploy/k8s/deploy_k8s.py   | 52 ++++++++++--
 stack_orchestrator/deploy/k8s/helpers.py      | 80 ++++++++++++++-----
 stack_orchestrator/deploy/spec.py             | 23 ++++++
 5 files changed, 134 insertions(+), 25 deletions(-)

diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py
index 322b57eb..49dfa193 100644
--- a/stack_orchestrator/constants.py
+++ b/stack_orchestrator/constants.py
@@ -41,3 +41,6 @@ kind_config_filename = "kind-config.yml"
 kube_config_filename = "kubeconfig.yml"
 cri_base_filename = "cri-base.json"
 unlimited_memlock_key = "unlimited-memlock"
+runtime_class_key = "runtime-class"
+high_memlock_runtime = "high-memlock"
+high_memlock_spec_filename = "high-memlock-spec.json"
diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py
index bd539e30..97a5651f 100644
--- a/stack_orchestrator/deploy/k8s/cluster_info.py
+++ b/stack_orchestrator/deploy/k8s/cluster_info.py
@@ -531,6 +531,7 @@ class ClusterInfo:
                 volumes=volumes,
                 affinity=affinity,
                 tolerations=tolerations,
+                runtime_class_name=self.spec.get_runtime_class(),
             ),
         )
         spec = client.V1DeploymentSpec(
diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py
index 38867dab..cf8f564f 100644
--- a/stack_orchestrator/deploy/k8s/deploy_k8s.py
+++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py
@@ -37,7 +37,7 @@ from stack_orchestrator.deploy.k8s.helpers import (
 )
 from stack_orchestrator.deploy.k8s.helpers import (
     generate_kind_config,
-    generate_cri_base_json,
+    generate_high_memlock_spec_json,
 )
 from stack_orchestrator.deploy.k8s.cluster_info import ClusterInfo
 from stack_orchestrator.opts import opts
@@ -59,6 +59,36 @@ def _check_delete_exception(e: ApiException) -> None:
         error_exit(f"k8s api error: {e}")
 
 
+def _create_runtime_class(name: str, handler: str):
+    """Create a RuntimeClass resource for custom containerd runtime handlers.
+
+    RuntimeClass allows pods to specify which runtime handler to use, enabling
+    different pods to have different rlimit profiles (e.g., high-memlock).
+
+    Args:
+        name: The name of the RuntimeClass resource
+        handler: The containerd runtime handler name
+            (must match containerdConfigPatches)
+    """
+    api = client.NodeV1Api()
+    runtime_class = client.V1RuntimeClass(
+        api_version="node.k8s.io/v1",
+        kind="RuntimeClass",
+        metadata=client.V1ObjectMeta(name=name),
+        handler=handler,
+    )
+    try:
+        api.create_runtime_class(runtime_class)
+        if opts.o.debug:
+            print(f"Created RuntimeClass: {name}")
+    except ApiException as e:
+        if e.status == 409:  # Already exists
+            if opts.o.debug:
+                print(f"RuntimeClass {name} already exists")
+        else:
+            raise
+
+
 class K8sDeployer(Deployer):
     name: str = "k8s"
     type: str
@@ -275,6 +305,12 @@ class K8sDeployer(Deployer):
                 # Wait for ingress to start
                 # (deployment provisioning will fail unless this is done)
                 wait_for_ingress_in_kind()
+                # Create RuntimeClass if unlimited_memlock is enabled
+                if self.cluster_info.spec.get_unlimited_memlock():
+                    _create_runtime_class(
+                        constants.high_memlock_runtime,
+                        constants.high_memlock_runtime,
+                    )
 
         else:
             print("Dry run mode enabled, skipping k8s API connect")
@@ -669,17 +705,19 @@ class K8sDeployerConfigGenerator(DeployerConfigGenerator):
     def generate(self, deployment_dir: Path):
         # No need to do this for the remote k8s case
         if self.type == "k8s-kind":
-            # Generate cri-base.json if unlimited_memlock is enabled.
+            # Generate high-memlock-spec.json if unlimited_memlock is enabled.
             # Must be done before generate_kind_config() which references it.
             if self.deployment_context.spec.get_unlimited_memlock():
-                cri_base_content = generate_cri_base_json()
-                cri_base_file = deployment_dir.joinpath(constants.cri_base_filename)
+                spec_content = generate_high_memlock_spec_json()
+                spec_file = deployment_dir.joinpath(
+                    constants.high_memlock_spec_filename
+                )
                 if opts.o.debug:
                     print(
-                        f"Creating cri-base.json for unlimited memlock: {cri_base_file}"
+                        f"Creating high-memlock spec for unlimited memlock: {spec_file}"
                     )
-                with open(cri_base_file, "w") as output_file:
-                    output_file.write(cri_base_content)
+                with open(spec_file, "w") as output_file:
+                    output_file.write(spec_content)
 
             # Check the file isn't already there
             # Get the config file contents
diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py
index f5fc8a43..99876140 100644
--- a/stack_orchestrator/deploy/k8s/helpers.py
+++ b/stack_orchestrator/deploy/k8s/helpers.py
@@ -317,17 +317,19 @@ def _generate_kind_port_mappings(parsed_pod_files):
     )
 
 
-def _generate_cri_base_mount(deployment_dir: Path):
-    """Generate the extraMount entry for cri-base.json to set RLIMIT_MEMLOCK."""
-    cri_base_path = deployment_dir.joinpath(constants.cri_base_filename).resolve()
-    return (
-        f"  - hostPath: {cri_base_path}\n"
-        f"    containerPath: /etc/containerd/cri-base.json\n"
-    )
+def _generate_high_memlock_spec_mount(deployment_dir: Path):
+    """Generate the extraMount entry for high-memlock-spec.json.
+
+    The spec file must be mounted at the same path inside the kind node
+    as it appears on the host, because containerd's base_runtime_spec
+    references an absolute path.
+    """
+    spec_path = deployment_dir.joinpath(constants.high_memlock_spec_filename).resolve()
+    return f"  - hostPath: {spec_path}\n" f"    containerPath: {spec_path}\n"
 
 
-def generate_cri_base_json():
-    """Generate cri-base.json content with unlimited RLIMIT_MEMLOCK.
+def generate_high_memlock_spec_json():
+    """Generate OCI spec JSON with unlimited RLIMIT_MEMLOCK.
 
     This is needed for workloads like Solana validators that require large
     amounts of locked memory for memory-mapped files during snapshot decompression.
@@ -339,7 +341,7 @@ def generate_cri_base_json():
 
     # Use maximum 64-bit signed integer value for unlimited
     max_rlimit = 9223372036854775807
-    cri_base = {
+    spec = {
         "ociVersion": "1.0.2-dev",
         "process": {
             "rlimits": [
@@ -348,7 +350,36 @@ def generate_cri_base_json():
             ]
         },
     }
-    return json.dumps(cri_base, indent=2)
+    return json.dumps(spec, indent=2)
+
+
+# Keep old name as alias for backward compatibility
+def generate_cri_base_json():
+    """Deprecated: Use generate_high_memlock_spec_json() instead."""
+    return generate_high_memlock_spec_json()
+
+
+def _generate_containerd_config_patches(
+    deployment_dir: Path, has_high_memlock: bool
+) -> str:
+    """Generate containerdConfigPatches YAML for custom runtime handlers.
+
+    This configures containerd to have a runtime handler named 'high-memlock'
+    that uses a custom OCI base spec with unlimited RLIMIT_MEMLOCK.
+    """
+    if not has_high_memlock:
+        return ""
+
+    spec_path = deployment_dir.joinpath(constants.high_memlock_spec_filename).resolve()
+    runtime_name = constants.high_memlock_runtime
+    plugin_path = 'plugins."io.containerd.grpc.v1.cri".containerd.runtimes'
+    return (
+        "containerdConfigPatches:\n"
+        "  - |-\n"
+        f"    [{plugin_path}.{runtime_name}]\n"
+        '      runtime_type = "io.containerd.runc.v2"\n'
+        f'      base_runtime_spec = "{spec_path}"\n'
+    )
 
 
 # Note: this makes any duplicate definition in b overwrite a
@@ -430,19 +461,30 @@ def generate_kind_config(deployment_dir: Path, deployment_context):
         parsed_pod_files_map, deployment_dir, deployment_context
     )
 
-    # Check if unlimited_memlock is enabled and add cri-base.json mount
+    # Check if unlimited_memlock is enabled
     unlimited_memlock = deployment_context.spec.get_unlimited_memlock()
+
+    # Generate containerdConfigPatches for RuntimeClass support
+    containerd_patches_yml = _generate_containerd_config_patches(
+        deployment_dir, unlimited_memlock
+    )
+
+    # Add high-memlock spec file mount if needed
     if unlimited_memlock:
-        cri_base_mount = _generate_cri_base_mount(deployment_dir)
+        spec_mount = _generate_high_memlock_spec_mount(deployment_dir)
         if mounts_yml:
             # Append to existing mounts
-            mounts_yml = mounts_yml.rstrip() + "\n" + cri_base_mount
+            mounts_yml = mounts_yml.rstrip() + "\n" + spec_mount
         else:
-            mounts_yml = f"  extraMounts:\n{cri_base_mount}"
+            mounts_yml = f"  extraMounts:\n{spec_mount}"
 
-    return (
-        "kind: Cluster\n"
-        "apiVersion: kind.x-k8s.io/v1alpha4\n"
+    # Build the config - containerdConfigPatches is a top-level key, not per-node
+    config = "kind: Cluster\n" "apiVersion: kind.x-k8s.io/v1alpha4\n"
+
+    if containerd_patches_yml:
+        config += containerd_patches_yml
+
+    config += (
         "nodes:\n"
         "- role: control-plane\n"
         "  kubeadmConfigPatches:\n"
@@ -454,3 +496,5 @@ def generate_kind_config(deployment_dir: Path, deployment_context):
         f"{port_mappings_yml}\n"
         f"{mounts_yml}\n"
     )
+
+    return config
diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py
index b6defc17..1713f28a 100644
--- a/stack_orchestrator/deploy/spec.py
+++ b/stack_orchestrator/deploy/spec.py
@@ -153,6 +153,29 @@ class Spec:
             ).lower()
         )
 
+    def get_runtime_class(self):
+        """Get runtime class name from spec, or derive from security settings.
+
+        The runtime class determines which containerd runtime handler to use,
+        allowing different pods to have different rlimit profiles (e.g., for
+        unlimited RLIMIT_MEMLOCK).
+
+        Returns:
+            Runtime class name string, or None to use default runtime.
+        """
+        # Explicit runtime class takes precedence
+        explicit = self.obj.get(constants.security_key, {}).get(
+            constants.runtime_class_key, None
+        )
+        if explicit:
+            return explicit
+
+        # Auto-derive from unlimited-memlock setting
+        if self.get_unlimited_memlock():
+            return constants.high_memlock_runtime
+
+        return None  # Use default runtime
+
     def get_deployment_type(self):
         return self.obj.get(constants.deploy_to_key)