diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py
index 322b57eb..49dfa193 100644
--- a/stack_orchestrator/constants.py
+++ b/stack_orchestrator/constants.py
@@ -41,3 +41,6 @@ kind_config_filename = "kind-config.yml"
 kube_config_filename = "kubeconfig.yml"
 cri_base_filename = "cri-base.json"
 unlimited_memlock_key = "unlimited-memlock"
+runtime_class_key = "runtime-class"
+high_memlock_runtime = "high-memlock"
+high_memlock_spec_filename = "high-memlock-spec.json"
diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py
index bd539e30..97a5651f 100644
--- a/stack_orchestrator/deploy/k8s/cluster_info.py
+++ b/stack_orchestrator/deploy/k8s/cluster_info.py
@@ -531,6 +531,7 @@ class ClusterInfo:
                 volumes=volumes,
                 affinity=affinity,
                 tolerations=tolerations,
+                runtime_class_name=self.spec.get_runtime_class(),
             ),
         )
         spec = client.V1DeploymentSpec(
diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py
index 38867dab..cf8f564f 100644
--- a/stack_orchestrator/deploy/k8s/deploy_k8s.py
+++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py
@@ -37,7 +37,7 @@ from stack_orchestrator.deploy.k8s.helpers import (
 )
 from stack_orchestrator.deploy.k8s.helpers import (
     generate_kind_config,
-    generate_cri_base_json,
+    generate_high_memlock_spec_json,
 )
 from stack_orchestrator.deploy.k8s.cluster_info import ClusterInfo
 from stack_orchestrator.opts import opts
@@ -59,6 +59,36 @@ def _check_delete_exception(e: ApiException) -> None:
         error_exit(f"k8s api error: {e}")
 
 
+def _create_runtime_class(name: str, handler: str):
+    """Create a RuntimeClass resource for custom containerd runtime handlers.
+
+    RuntimeClass allows pods to specify which runtime handler to use, enabling
+    different pods to have different rlimit profiles (e.g., high-memlock).
+
+    Args:
+        name: The name of the RuntimeClass resource
+        handler: The containerd runtime handler name
+            (must match containerdConfigPatches)
+    """
+    api = client.NodeV1Api()
+    runtime_class = client.V1RuntimeClass(
+        api_version="node.k8s.io/v1",
+        kind="RuntimeClass",
+        metadata=client.V1ObjectMeta(name=name),
+        handler=handler,
+    )
+    try:
+        api.create_runtime_class(runtime_class)
+        if opts.o.debug:
+            print(f"Created RuntimeClass: {name}")
+    except ApiException as e:
+        if e.status == 409:  # Already exists
+            if opts.o.debug:
+                print(f"RuntimeClass {name} already exists")
+        else:
+            raise
+
+
 class K8sDeployer(Deployer):
     name: str = "k8s"
     type: str
@@ -275,6 +305,12 @@ class K8sDeployer(Deployer):
             # Wait for ingress to start
             # (deployment provisioning will fail unless this is done)
             wait_for_ingress_in_kind()
+            # Create RuntimeClass if unlimited_memlock is enabled
+            if self.cluster_info.spec.get_unlimited_memlock():
+                _create_runtime_class(
+                    constants.high_memlock_runtime,
+                    constants.high_memlock_runtime,
+                )
         else:
             print("Dry run mode enabled, skipping k8s API connect")
 
@@ -669,17 +705,19 @@ class K8sDeployerConfigGenerator(DeployerConfigGenerator):
     def generate(self, deployment_dir: Path):
         # No need to do this for the remote k8s case
         if self.type == "k8s-kind":
-            # Generate cri-base.json if unlimited_memlock is enabled.
+            # Generate high-memlock-spec.json if unlimited_memlock is enabled.
             # Must be done before generate_kind_config() which references it.
             if self.deployment_context.spec.get_unlimited_memlock():
-                cri_base_content = generate_cri_base_json()
-                cri_base_file = deployment_dir.joinpath(constants.cri_base_filename)
+                spec_content = generate_high_memlock_spec_json()
+                spec_file = deployment_dir.joinpath(
+                    constants.high_memlock_spec_filename
+                )
                 if opts.o.debug:
                     print(
-                        f"Creating cri-base.json for unlimited memlock: {cri_base_file}"
+                        f"Creating high-memlock spec for unlimited memlock: {spec_file}"
                     )
-                with open(cri_base_file, "w") as output_file:
-                    output_file.write(cri_base_content)
+                with open(spec_file, "w") as output_file:
+                    output_file.write(spec_content)
 
             # Check the file isn't already there
             # Get the config file contents
diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py
index f5fc8a43..99876140 100644
--- a/stack_orchestrator/deploy/k8s/helpers.py
+++ b/stack_orchestrator/deploy/k8s/helpers.py
@@ -317,17 +317,19 @@ def _generate_kind_port_mappings(parsed_pod_files):
     )
 
 
-def _generate_cri_base_mount(deployment_dir: Path):
-    """Generate the extraMount entry for cri-base.json to set RLIMIT_MEMLOCK."""
-    cri_base_path = deployment_dir.joinpath(constants.cri_base_filename).resolve()
-    return (
-        f"  - hostPath: {cri_base_path}\n"
-        f"    containerPath: /etc/containerd/cri-base.json\n"
-    )
+def _generate_high_memlock_spec_mount(deployment_dir: Path):
+    """Generate the extraMount entry for high-memlock-spec.json.
+
+    The spec file must be mounted at the same path inside the kind node
+    as it appears on the host, because containerd's base_runtime_spec
+    references an absolute path.
+    """
+    spec_path = deployment_dir.joinpath(constants.high_memlock_spec_filename).resolve()
+    return f"  - hostPath: {spec_path}\n" f"    containerPath: {spec_path}\n"
 
 
-def generate_cri_base_json():
-    """Generate cri-base.json content with unlimited RLIMIT_MEMLOCK.
+def generate_high_memlock_spec_json():
+    """Generate OCI spec JSON with unlimited RLIMIT_MEMLOCK.
     This is needed for workloads like Solana validators that require
     large amounts of locked memory for memory-mapped files during
     snapshot decompression.
@@ -339,7 +341,7 @@
     # Use maximum 64-bit signed integer value for unlimited
     max_rlimit = 9223372036854775807
 
-    cri_base = {
+    spec = {
         "ociVersion": "1.0.2-dev",
         "process": {
             "rlimits": [
@@ -348,7 +350,36 @@
             ]
         },
     }
-    return json.dumps(cri_base, indent=2)
+    return json.dumps(spec, indent=2)
+
+
+# Keep old name as alias for backward compatibility
+def generate_cri_base_json():
+    """Deprecated: Use generate_high_memlock_spec_json() instead."""
+    return generate_high_memlock_spec_json()
+
+
+def _generate_containerd_config_patches(
+    deployment_dir: Path, has_high_memlock: bool
+) -> str:
+    """Generate containerdConfigPatches YAML for custom runtime handlers.
+
+    This configures containerd to have a runtime handler named 'high-memlock'
+    that uses a custom OCI base spec with unlimited RLIMIT_MEMLOCK.
+    """
+    if not has_high_memlock:
+        return ""
+
+    spec_path = deployment_dir.joinpath(constants.high_memlock_spec_filename).resolve()
+    runtime_name = constants.high_memlock_runtime
+    plugin_path = 'plugins."io.containerd.grpc.v1.cri".containerd.runtimes'
+    return (
+        "containerdConfigPatches:\n"
+        "  - |-\n"
+        f"    [{plugin_path}.{runtime_name}]\n"
+        '      runtime_type = "io.containerd.runc.v2"\n'
+        f'      base_runtime_spec = "{spec_path}"\n'
+    )
 
 
 # Note: this makes any duplicate definition in b overwrite a
@@ -430,19 +461,30 @@ def generate_kind_config(deployment_dir: Path, deployment_context):
         parsed_pod_files_map, deployment_dir, deployment_context
     )
 
-    # Check if unlimited_memlock is enabled and add cri-base.json mount
+    # Check if unlimited_memlock is enabled
     unlimited_memlock = deployment_context.spec.get_unlimited_memlock()
+
+    # Generate containerdConfigPatches for RuntimeClass support
+    containerd_patches_yml = _generate_containerd_config_patches(
+        deployment_dir, unlimited_memlock
+    )
+
+    # Add high-memlock spec file mount if needed
     if unlimited_memlock:
-        cri_base_mount = _generate_cri_base_mount(deployment_dir)
+        spec_mount = _generate_high_memlock_spec_mount(deployment_dir)
         if mounts_yml:
             # Append to existing mounts
-            mounts_yml = mounts_yml.rstrip() + "\n" + cri_base_mount
+            mounts_yml = mounts_yml.rstrip() + "\n" + spec_mount
         else:
-            mounts_yml = f"  extraMounts:\n{cri_base_mount}"
+            mounts_yml = f"  extraMounts:\n{spec_mount}"
 
-    return (
-        "kind: Cluster\n"
-        "apiVersion: kind.x-k8s.io/v1alpha4\n"
+    # Build the config - containerdConfigPatches must be at cluster level (before nodes)
+    config = "kind: Cluster\n" "apiVersion: kind.x-k8s.io/v1alpha4\n"
+
+    if containerd_patches_yml:
+        config += containerd_patches_yml
+
+    config += (
         "nodes:\n"
         "- role: control-plane\n"
         "  kubeadmConfigPatches:\n"
@@ -454,3 +496,5 @@
         f"{port_mappings_yml}\n"
         f"{mounts_yml}\n"
     )
+
+    return config
diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py
index b6defc17..1713f28a 100644
--- a/stack_orchestrator/deploy/spec.py
+++ b/stack_orchestrator/deploy/spec.py
@@ -153,6 +153,29 @@ class Spec:
             ).lower()
         )
 
+    def get_runtime_class(self):
+        """Get runtime class name from spec, or derive from security settings.
+
+        The runtime class determines which containerd runtime handler to use,
+        allowing different pods to have different rlimit profiles (e.g., for
+        unlimited RLIMIT_MEMLOCK).
+
+        Returns:
+            Runtime class name string, or None to use default runtime.
+        """
+        # Explicit runtime class takes precedence
+        explicit = self.obj.get(constants.security_key, {}).get(
+            constants.runtime_class_key, None
+        )
+        if explicit:
+            return explicit
+
+        # Auto-derive from unlimited-memlock setting
+        if self.get_unlimited_memlock():
+            return constants.high_memlock_runtime
+
+        return None  # Use default runtime
+
     def get_deployment_type(self):
         return self.obj.get(constants.deploy_to_key)
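
For reference, a minimal sketch of how the pieces above fit together at the Kubernetes API level when unlimited-memlock is enabled: the deployer creates a RuntimeClass whose handler matches the containerd handler configured via containerdConfigPatches, and cluster_info.py points each pod at it through runtime_class_name. The container name and image below are illustrative placeholders, not values taken from the patch.

from kubernetes import client

# RuntimeClass equivalent to _create_runtime_class("high-memlock", "high-memlock")
runtime_class = client.V1RuntimeClass(
    api_version="node.k8s.io/v1",
    kind="RuntimeClass",
    metadata=client.V1ObjectMeta(name="high-memlock"),
    handler="high-memlock",  # must match the containerd runtime handler name
)

# Pod spec shaped like what cluster_info.py now emits for such deployments
pod_spec = client.V1PodSpec(
    runtime_class_name="high-memlock",
    containers=[
        # name/image are placeholders for whatever the stack actually runs
        client.V1Container(name="example", image="example:latest"),
    ],
)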
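
The kind-side wiring produced by the helpers is easiest to see as the fragments they add to kind-config.yml. Below is a sketch for a hypothetical deployment directory; the path is illustrative, and the exact rlimits entry written into high-memlock-spec.json is unchanged context not shown in the hunk above.

from pathlib import Path

deployment_dir = Path("/srv/deployments/example")  # illustrative path
spec_path = (deployment_dir / "high-memlock-spec.json").resolve()

# Cluster-level patch emitted by _generate_containerd_config_patches()
containerd_patch = (
    "containerdConfigPatches:\n"
    "  - |-\n"
    '    [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.high-memlock]\n'
    '      runtime_type = "io.containerd.runc.v2"\n'
    f'      base_runtime_spec = "{spec_path}"\n'
)

# Matching node mount from _generate_high_memlock_spec_mount(); host and
# container paths are identical because base_runtime_spec is an absolute path.
mount_entry = f"  - hostPath: {spec_path}\n" f"    containerPath: {spec_path}\n"

print(containerd_patch)
print(mount_entry)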
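
Since Spec.get_runtime_class() ultimately decides whether a pod runs under the high-memlock handler, one way to sanity-check a running deployment is to read the pod's runtimeClassName and the memlock limit visible inside the container. A sketch using the kubernetes Python client; the pod name, namespace, and kubeconfig handling are placeholders.

from kubernetes import client, config
from kubernetes.stream import stream

config.load_kube_config()  # or point at the deployment's kubeconfig.yml
core = client.CoreV1Api()

pod = core.read_namespaced_pod("example-pod", "default")  # placeholder names
print(pod.spec.runtime_class_name)  # expect "high-memlock" when enabled

# RLIMIT_MEMLOCK as seen by the workload; "unlimited" if the handler applied
# (assumes the container image ships a shell whose ulimit supports -l).
output = stream(
    core.connect_get_namespaced_pod_exec,
    "example-pod",
    "default",
    command=["sh", "-c", "ulimit -l"],
    stderr=True,
    stdin=False,
    stdout=True,
    tty=False,
)
print(output)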