Replatforming RKE1 to Nix-based K8s - Part 1

This article is part of the Replatform RKE1 to Nix series.

    RKE1 (Rancher Kubernetes Engine 1) was Rancher’s first tool for automatically deploying Kubernetes to a cluster. Think of it like minikube, on-prem EKS, or K3s. Three years ago it was marked end of life (EoL), with the last release in July 2025. There’s no migration guide, and the official strategy is to just rebuild the cluster. I have 3 nodes and a bunch of services running in Kubernetes, and I don’t want to take everything down and rebuild it all. Let’s rebuild it while the plane is in the air.

    In this post, I’m going to take an existing NixOS worker node where RKE1 deployed the control plane and worker, and recreate it entirely in-place, etcd included, with minimal downtime.

    NixOS is a declarative, reproducible operating system.

    I was already using NixOS to declaratively define the operating system of 1 of my 3 workers, and I wanted to see how well it works for configuring Kubernetes. The other two were still on Ubuntu Server, and I wanted to consolidate on a single approach. I’ve had my issues with Nix, but for servers, once it’s done, it works reasonably well.

    First Cut

    flake.nix:

    {
      description = "Nixos config flake";
    
      inputs = {
        nixpkgs.url = "github:nixos/nixpkgs";
        nixos-hardware.url = "github:NixOS/nixos-hardware/master";
        disko = {
          url = "github:nix-community/disko";
          inputs.nixpkgs.follows = "nixpkgs";
        };
      };
    
      outputs = { self, nixpkgs, disko, ... }@inputs: {
        nixosConfigurations.srv7 = nixpkgs.lib.nixosSystem {
          specialArgs = {inherit inputs;};
          modules = [
            # ... 
            ./parts/kubernetes.nix
          ];
        };
      };
    }
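
    With the flake in place, applying the configuration is a normal rebuild. A minimal sketch, assuming the flake lives in /etc/nixos and you’re targeting the srv7 host defined above:

    # Build and activate the new generation for this host (run as root)
    nixos-rebuild switch --flake /etc/nixos#srv7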
    

    kubernetes.nix:

    { config, lib, pkgs, nixpkgs-k8s, ... }:
    let
      kubeMasterHostname = "localhost";
      kubeMasterAPIServerPort = 6443;
      clusterIpv4 = { "srv5" = "51.81.64.31"; };
      hostIpv4 = clusterIpv4."${config.networking.hostName}";
      hostIpv6 = { "srv5" = "2604:2dc0:100:1be8:beef:beef:beef:beef"; }."${config.networking.hostName}";
      sslBasePath = "/etc/kubernetes/ssl/";
      ipWithHyphens = builtins.replaceStrings ["."] ["-"] hostIpv4;
    in
    {
      services.etcd = {
        advertiseClientUrls = ["https://${hostIpv4}:2379"];
        listenClientUrls = ["https://0.0.0.0:2379" ];
        listenPeerUrls = ["https://0.0.0.0:2380" ];
        initialAdvertisePeerUrls = ["https://${hostIpv4}:2380"];
        certFile = "${sslBasePath}kube-etcd-${ipWithHyphens}.pem";
        initialClusterToken = "etcd-cluster-1";
        peerClientCertAuth = true;
        clientCertAuth = true;
        initialCluster = (lib.mapAttrsToList (n: v: "etcd-${n}=https://${v}:2380") clusterIpv4);
        keyFile = "${sslBasePath}kube-etcd-${ipWithHyphens}-key.pem";
        name = "etcd-${config.networking.hostName}";
        openFirewall = true;
        trustedCaFile = "${sslBasePath}kube-ca.pem";
        enable = true;
      };
    
      environment.systemPackages = with pkgs; [
        pkgs.cri-tools
      ];
    
      services.kubernetes = {
        package = pkgs.kubernetes;
        masterAddress = kubeMasterHostname;
        apiserverAddress = "https://${hostIpv4}:${toString kubeMasterAPIServerPort}";
        apiserver = {
          advertiseAddress = hostIpv4;
          allowPrivileged = true;
          apiAudiences = "unknown";
          enable = true;
          etcd = {
            servers = [
              # Only connect to local etcd to avoid cross node calls
              "https://${hostIpv4}:2379"
            ];
            keyFile = "${sslBasePath}kube-node-key.pem";
            certFile = "${sslBasePath}kube-node.pem";
            caFile = "${sslBasePath}kube-ca.pem";
          };
          extraOpts = "--etcd-prefix=/registry --service-account-lookup=true --anonymous-auth=false --service-node-port-range=30000-32767";
          kubeletClientCertFile = "${sslBasePath}kube-apiserver.pem";
          kubeletClientKeyFile = "${sslBasePath}kube-apiserver-key.pem";
          proxyClientCertFile = lib.mkForce "${sslBasePath}kube-apiserver-proxy-client.pem";
          proxyClientKeyFile = lib.mkForce "${sslBasePath}kube-apiserver-proxy-client-key.pem";
          securePort = kubeMasterAPIServerPort;
          serviceAccountKeyFile = lib.mkForce "${sslBasePath}kube-service-account-token-key.pem";
          serviceAccountIssuer = "rke";
          serviceAccountSigningKeyFile = lib.mkForce "${sslBasePath}kube-service-account-token-key.pem";
          serviceClusterIpRange = "10.43.0.0/16";
          tlsCertFile = lib.mkForce "${sslBasePath}kube-apiserver.pem";
          tlsKeyFile = lib.mkForce "${sslBasePath}kube-apiserver-key.pem";
        };
        scheduler = {
          enable = true;
          kubeconfig = {
            certFile = "${sslBasePath}kube-scheduler.pem";
            keyFile = "${sslBasePath}kube-scheduler-key.pem";
          };
        };
        controllerManager = {
          enable = true;
          extraOpts = "--service-cluster-ip-range=10.43.0.0/16";
          serviceAccountKeyFile = lib.mkForce "${sslBasePath}kube-service-account-token-key.pem";
          kubeconfig = {
            certFile = "${sslBasePath}kube-controller-manager.pem";
            keyFile = "${sslBasePath}kube-controller-manager-key.pem";
          };
        };
        pki.enable = false; # Skip Nix's cert generation. We'll use our own. We still need to handle rotation
        flannel.enable = false;
        addonManager.enable = false;
        caFile = lib.mkForce "${sslBasePath}kube-ca.pem";
        clusterCidr = "10.42.0.0/16";
    
        proxy = {
          enable = true;
          kubeconfig = {
            caFile = "${sslBasePath}kube-ca.pem";
            certFile = "${sslBasePath}kube-proxy.pem";
            keyFile = "${sslBasePath}kube-proxy-key.pem";
          };
        };
    
        kubelet = {
          clusterDns = ["10.43.0.10"];
          cni.packages = [];
          enable = true;
          hostname = config.networking.hostName;
          kubeconfig = {
            keyFile = "${sslBasePath}kube-node-key.pem";
            certFile = "${sslBasePath}kube-node.pem";
          };
          tlsCertFile = "${sslBasePath}kube-node.pem";
          tlsKeyFile = "${sslBasePath}kube-node-key.pem";
          nodeIp = "${hostIpv4},${hostIpv6}";
          extraOpts = "--kube-reserved=cpu=1000m,memory=512Mi --root-dir=/var/lib/kubelet";
        };
      };
    }
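
    Before digging into the problems below, a quick way to see where things stand is to check the systemd units the NixOS etcd and Kubernetes modules create, and to confirm etcd is reachable with the RKE1 certificates. A sketch; the etcdctl flags reuse the same client cert the apiserver is configured with above, which is an assumption about your RKE1-issued PKI:

    # Cluster-related units created by the NixOS modules
    systemctl status etcd kube-apiserver kube-controller-manager kube-scheduler kubelet kube-proxy --no-pager

    # etcd health check using the RKE1-issued client certificate
    etcdctl --endpoints=https://127.0.0.1:2379 \
      --cacert=/etc/kubernetes/ssl/kube-ca.pem \
      --cert=/etc/kubernetes/ssl/kube-node.pem \
      --key=/etc/kubernetes/ssl/kube-node-key.pem \
      endpoint health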
    

    SSL

    The first problem is trivial to fix: the apiserver runs as the kubernetes user, but the certificates RKE1 left behind are only readable by root:

    Sep 17 21:34:43 srv5 kube-apiserver[29733]: I0917 21:34:43.581096   29733 options.go:249] external host was not specified, using 51.81.64.31
    Sep 17 21:34:43 srv5 kube-apiserver[29733]: E0917 21:34:43.581258   29733 run.go:72] "command failed" err="failed to parse service-account-issuer-key-file: open /etc/kubernetes/ssl/kube-service-account-token-key.pem: permission denied"
    Sep 17 21:34:43 srv5 systemd[1]: kube-apiserver.service: Main process exited, code=exited, status=1/FAILURE
    Sep 17 21:34:43 srv5 systemd[1]: kube-apiserver.service: Failed with result 'exit-code'.
    
    [nix-shell:/etc/nixos]# systemctl show kube-apiserver | grep User
    User=kubernetes
    DynamicUser=no
    PrivateUsers=no
    PrivateUsersEx=no
    
    [nix-shell:/etc/kubernetes/ssl]# ls -la
    total 128
    drwxr-xr-x 1 root root 1696 Mar  6  2025 .
    drwxr-xr-x 1 root root   68 Sep 17 20:17 ..
    -rw------- 1 root root 1675 Sep  4  2024 kube-apiserver-key.pem
    -rw------- 1 root root 1330 Sep  4  2024 kube-apiserver.pem
    -rw------- 1 root root 1679 Sep  4  2024 kube-apiserver-proxy-client-key.pem
    -rw------- 1 root root 1151 Sep  4  2024 kube-apiserver-proxy-client.pem
    -rw------- 1 root root 1679 Sep  4  2024 kube-apiserver-requestheader-ca-key.pem
    -rw------- 1 root root 1123 Sep  4  2024 kube-apiserver-requestheader-ca.pem
    -rw------- 1 root root 1675 Sep  4  2024 kube-ca-key.pem
    -rw------- 1 root root 1058 Sep  4  2024 kube-ca.pem
    ...
    

    Fix it by updating permissions:

    chown kubernetes:kubernetes /etc/kubernetes/ssl/*.pem
    chmod o+r /etc/kubernetes/ssl/kube-ca.pem
    chown etcd /etc/kubernetes/ssl/kube-etcd-*.pem
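
    A natural next step is to restart the apiserver and confirm the permission error is gone. A sketch using the unit name from the Nixpkgs module:

    # Restart and confirm the service stays up this time
    systemctl restart kube-apiserver
    systemctl is-active kube-apiserver
    journalctl -u kube-apiserver -n 20 --no-pager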
    

    Pause image

    Kubernetes and containerd depend on something called a pause image: a tiny container image with a single process that is used to set up and hold a pod’s namespaces. In my opinion, it’s a leaky abstraction that was first introduced due to Docker’s limitations, then carried forward into containerd, and it shouldn’t have been exposed.

    Failed to create pod sandbox: rpc error: code = Unknown desc = failed to start sandbox "736e57677163ae0948aaa7425f9dee4776a84ac65eeab146b4303ab60820b990": failed to get sandbox image "pause:latest": failed to pull image "pause:latest": failed to pull and unpack image "docker.io/library/pause:latest": failed to resolve image: pull access denied, repository does not exist or may require authorization: server message: insufficient_scope: authorization failed
    

    Startup depends on a pause image, but the pause image can get garbage collected. Let’s see what’s going on.

    [nix-shell:/etc/nixos]# cat /nix/store/0k19qdzb5vygrg8s6d0rvjghl1p0qjqa-containerd-config-checked.toml

    oom_score = 0
    root = "/var/lib/containerd"
    state = "/run/containerd"
    version = 2
    
    [grpc]
    address = "/run/containerd/containerd.sock"
    
    [plugins."io.containerd.grpc.v1.cri"]
    sandbox_image = "pause:latest"
    
    [plugins."io.containerd.grpc.v1.cri".cni]
    bin_dir = "/opt/cni/bin"
    max_conf_num = 0
    
    [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
    runtime_type = "io.containerd.runc.v2"
    
    [plugins."io.containerd.grpc.v1.cri".containerd
    

    Now, how do I pin the image? Several issues pointed the blame in different directions.

    I ended up with this workaround:

    { config, lib, pkgs, ... }:
    let
      # ...
      infraContainer = pkgs.dockerTools.buildImage {
        name = "pause";
        tag = "latest";
        copyToRoot = pkgs.buildEnv {
          name = "image-root";
          pathsToLink = [ "/bin" ];
          paths = [ pkgs.kubernetes.pause ];
        };
        config.Cmd = [ "/bin/pause" ];
      };
    in
    {
      systemd.services.kubelet = {
        preStart = lib.mkForce ''
          set -e
          ${pkgs.containerd}/bin/ctr -n k8s.io image import --label io.cri-containerd.pinned=pinned ${infraContainer}
        '';
      };
    }
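
    To check that the import worked, containerd’s ctr can list images in the k8s.io namespace together with their labels. A quick sketch:

    # The pause image should appear with the io.cri-containerd.pinned=pinned label
    ctr -n k8s.io images ls | grep pause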
    

    However, after finishing my cluster, I came across this Nixpkgs commit that fixes the issue, but it doesn’t work in my case because of the next problem.

    Dude, where’s my CNI?

    Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "65949aeccfed7a3571f721c79e1d5e02d02161a3a3efa343936b34a0480a1f2b": plugin type="calico" failed (add): failed to find plugin "calico" in path [/opt/cni/bin]
    

    The next problem is caused by the Calico CNI plugin being deleted whenever kubelet restarts. The calico-driver-deployer DaemonSet is responsible for installing it into /opt/cni/bin, but Nixpkgs has this line that wipes all CNI binaries on every kubelet start. It’s intended to keep the node reproducible, but it breaks any CNI deployed from inside the cluster.

    preStart = ''
      # ...
      rm /opt/cni/bin/* || true
      ${concatMapStrings (package: ''
        echo "Linking cni package: ${package}"
        ln -fs ${package}/bin/* /opt/cni/bin
      '') cfg.cni.packages}
    

    Thus, I have to disable this preStart code.

    kubernetes.nix:

    {
      systemd.services.kubelet.preStart = lib.mkForce ''
        mkdir -p /opt/cni/bin/
        ${pkgs.containerd}/bin/ctr -n k8s.io image import --label io.cri-containerd.pinned=pinned ${infraContainer}
      '';
    }
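
    With that override in place, the Calico binaries the DaemonSet drops into /opt/cni/bin should survive kubelet restarts. A quick sanity check (sketch):

    # The calico plugin should still be there after a kubelet restart
    systemctl restart kubelet
    ls /opt/cni/bin/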
    

    Why’s my service CIDR broken?

    This one was very strange. I saw some errors in Kubernetes:

    kind: Event
    message: >-
      Cluster IP [IPv4]: 10.43.27.126 is not within any configured Service CIDR;
      please recreate service
    series:
      count: 2194
    

    Based on my reading, I think I accidentally swapped my serviceClusterIpRange and clusterCidr in the configuration at some point, and the stale value got stuck in my cluster.

    journalctl -u kube-apiserver
    
    Oct 03 22:28:21 srv5 kube-apiserver[825540]: I1003 22:28:21.333430  825540 cache.go:32] Waiting for caches to sync for APIServiceRegistrationController controller
    Oct 03 22:28:21 srv5 kube-apiserver[825540]: I1003 22:28:21.386392  825540 cidrallocator.go:301] created ClusterIP allocator for Service CIDR 10.0.0.0/24
    Oct 03 22:28:21 srv5 kube-apiserver[825540]: I1003 22:28:21.433485  825540 cache.go:39] Caches are synced for APIServiceRegistrationController controller
    Oct 03 22:28:21 srv5 kube-apiserver[825540]: I1003 22:28:21.444773  825540 default_servicecidr_controller.go:227] inconsistent ServiceCIDR status, global configuration: [10.0.0.0/24] local configuration: [10.43.0.0/16], configure the flags to match current ServiceCIDR or manually delete the default ServiceCIDR
    Oct 03 22:28:21 srv5 kube-apiserver[825540]: I1003 22:28:21.444968  825540 event.go:389] "Event occurred" object="kubernetes" fieldPath="" kind="ServiceCIDR" apiVersion="networking.k8s.io/v1" type="Warning" reason="KubernetesDefaultServiceCIDRInconsistent" message="The default ServiceCIDR [10.0.0.0/24] does not match the flag configurations [10.43.0.0/16]"
    Oct 03 22:28:21 srv5 kube-apiserver[825540]: E1003 22:28:21.535706  825540 repairip.go:353] "Unhandled Error" err="the ClusterIP [IPv4]: 10.43.28.139 for Service cattle-system/rancher-webhook is not within any service CIDR; please recreate" logger="UnhandledError"
    Oct 03 22:28:21 srv5 kube-apiserver[825540]: E1003 22:28:21.537293  825540 repairip.go:353] "Unhandled Error" err="the ClusterIP [IPv4]: 10.43.192.190 for Service datastore/mysql-lb is not within any service CIDR; please recreate" logger="UnhandledError"
    ...
    

    Searching online turned up a few related issues, but the solution wasn’t obvious:

    There was a hidden resource, servicecidrs/kubernetes, that kubectl was unable to modify. The only way to fix it was to edit the backing store for Kubernetes directly, in etcd.

    Looking in etcd, I found the culprit. Note that the content is stored in a binary format.

    docker exec -ti etcd etcdctl get /registry/servicecidrs/kubernetes  
    /registry/servicecidrs/kubernetes  
    ....
    
    10.0.0.0/24▒?  
    =  
    ReadyTrue����*2 Kubernetes Service CIDR is ready▒"
    

    To fix it, run the following commands.

    docker exec -ti etcd etcdctl get /registry/servicecidrs/kubernetes > backup
    
    kubectl delete service kubernetes
    
    docker exec -ti etcd etcdctl del /registry/servicecidrs/kubernetes
    

    Then restart kube-apiserver. It’ll automatically recreate the default ServiceCIDR and the errors stop. However, don’t be like me: just configure your Nix flake correctly the first time.
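
    To confirm the repair, the recreated default ServiceCIDR should now match the flags. A sketch using the networking.k8s.io/v1 ServiceCIDR API:

    # Should now report 10.43.0.0/16 instead of 10.0.0.0/24
    kubectl get servicecidrs kubernetes -o yaml | grep -A 2 cidrs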

    Pin Kubernetes

    One of my problems with Nix is that when you use Nixpkgs, you almost always end up using the latest version of every package.

    {
      services.kubernetes.apiserver.enable = true;
    }
    

    Doing the above means one day you’re on Kubernetes 1.33.4 and the next day you’re on 1.34.0. Kubernetes upgrades aren’t safe to do blindly: nodes need to be updated after the control plane, and features get deprecated. I don’t mind automatically applying patch updates like 1.33.3 to 1.33.4, but version upgrades like 1.33 to 1.34 need to be opt-in.

    We can pin to a specific version like this:

    { config, lib, pkgs, ... }:
    let
    
      package = pkgs.kubernetes.overrideAttrs (oldAttrs: rec {
        version = "1.34.2";
    
        src = pkgs.fetchFromGitHub {
          owner = "kubernetes";
          repo = "kubernetes";
          rev = "v${version}";
          sha256 = "3rQyoGt9zTeF8+PIhA5p+hHY1V5O8CawvKWscf/r9RM=";
        };
      });
    in
    {
      services.kubernetes.package = package;
    }
    

    This isn’t yet perfect because it doesn’t ensure that the API server is updated first across the cluster, as per the official docs, but it’s a first step.
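
    When bumping the version, you also need the matching source hash. One option is to set the hash to lib.fakeHash and copy the real value out of the build error; another (a sketch, assuming the nix-prefetch-github tool from Nixpkgs) is to prefetch it directly:

    # Prints the hash for the given GitHub revision
    nix run nixpkgs#nix-prefetch-github -- kubernetes kubernetes --rev v1.34.2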

    Conclusion

    In this post, I walked through how to take a Kubernetes node running RKE1 and deploy Kubernetes via Nix and Nixpkgs instead. In my next post, I’ll show how to swap an Ubuntu server to NixOS using nixos-anywhere.
