kubernetes version: 1.26
Scenario: creating a Deployment
1. kube-apiserver
- The API handler receives the create request
- The object is written to etcd and the resource is updated
- kube-controller-manager picks up the Deployment change via watch (a minimal watch sketch follows)
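The watch in the last step is client-go's standard list/watch mechanism, which kube-controller-manager consumes through shared informers. A minimal sketch of watching Deployment changes the same way; the kubeconfig path is illustrative:
package main

import (
	"fmt"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// kubeconfig path is illustrative
	cfg, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config")
	if err != nil {
		panic(err)
	}
	factory := informers.NewSharedInformerFactory(kubernetes.NewForConfigOrDie(cfg), 30*time.Second)
	factory.Apps().V1().Deployments().Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			d := obj.(*appsv1.Deployment)
			fmt.Println("deployment added:", d.Namespace+"/"+d.Name)
		},
	})
	stop := make(chan struct{})
	factory.Start(stop) // starts the underlying list/watch against the apiserver
	factory.WaitForCacheSync(stop)
	<-stop
}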
2. kube-controller-manager
- Initializes the controllers (deployment, replicaset, ...)
- Enters the Deployment sync loop: k8s.io/kubernetes/pkg/controller/deployment/deployment_controller.go:syncDeployment
- Creates the ReplicaSet (a simplified sketch of this construction follows the list)
- Enters the ReplicaSet sync loop: k8s.io/kubernetes/pkg/controller/replicaset/replica_set.go:syncReplicaSet
- Creates the Pods
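The heart of that first create step is syncDeployment producing a ReplicaSet that carries an owner reference back to the Deployment. A simplified sketch of that construction, assuming the real controller's pod-template hash is passed in as hash (revision annotations, collision handling, etc. omitted):
import (
	appsv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Simplified: the real controller also manages revision annotations and name collisions.
func newReplicaSetFor(d *appsv1.Deployment, hash string) *appsv1.ReplicaSet {
	return &appsv1.ReplicaSet{
		ObjectMeta: metav1.ObjectMeta{
			Name:      d.Name + "-" + hash,
			Namespace: d.Namespace,
			// The controller ref is what lets the ReplicaSet controller and the
			// garbage collector tie this ReplicaSet back to its Deployment.
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(d, appsv1.SchemeGroupVersion.WithKind("Deployment")),
			},
			Labels: d.Spec.Template.Labels,
		},
		Spec: appsv1.ReplicaSetSpec{
			Replicas: d.Spec.Replicas,
			Selector: d.Spec.Selector,
			Template: d.Spec.Template,
		},
	}
}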
The full set of controllers:
func NewControllerInitializers(loopMode ControllerLoopMode) map[string]InitFunc {
	// omitted: the controllers map declaration and the register() helper that fills it
	register("endpoint", startEndpointController)
	register("endpointslice", startEndpointSliceController)
	register("endpointslicemirroring", startEndpointSliceMirroringController)
	register("replicationcontroller", startReplicationController)
	register("podgc", startPodGCController)
	register("resourcequota", startResourceQuotaController)
	register("namespace", startNamespaceController)
	register("serviceaccount", startServiceAccountController)
	register("garbagecollector", startGarbageCollectorController)
	register("daemonset", startDaemonSetController)
	register("job", startJobController)
	register("deployment", startDeploymentController)
	register("replicaset", startReplicaSetController)
	register("horizontalpodautoscaling", startHPAController)
	register("disruption", startDisruptionController)
	register("statefulset", startStatefulSetController)
	register("cronjob", startCronJobController)
	register("csrsigning", startCSRSigningController)
	register("csrapproving", startCSRApprovingController)
	register("csrcleaner", startCSRCleanerController)
	register("ttl", startTTLController)
	register("bootstrapsigner", startBootstrapSignerController)
	register("tokencleaner", startTokenCleanerController)
	register("nodeipam", startNodeIpamController)
	register("nodelifecycle", startNodeLifecycleController)
	if loopMode == IncludeCloudLoops {
		register("service", startServiceController)
		register("route", startRouteController)
		register("cloud-node-lifecycle", startCloudNodeLifecycleController)
		// TODO: volume controller into the IncludeCloudLoops only set.
	}
	register("persistentvolume-binder", startPersistentVolumeBinderController)
	register("attachdetach", startAttachDetachController)
	register("persistentvolume-expander", startVolumeExpandController)
	register("clusterrole-aggregation", startClusterRoleAggregrationController)
	register("pvc-protection", startPVCProtectionController)
	register("pv-protection", startPVProtectionController)
	register("ttl-after-finished", startTTLAfterFinishedController)
	register("root-ca-cert-publisher", startRootCACertPublisher)
	register("ephemeral-volume", startEphemeralVolumeController)
	if utilfeature.DefaultFeatureGate.Enabled(genericfeatures.APIServerIdentity) &&
		utilfeature.DefaultFeatureGate.Enabled(genericfeatures.StorageVersionAPI) {
		register("storage-version-gc", startStorageVersionGCController)
	}
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		controllers["resource-claim-controller"] = startResourceClaimController
	}
	return controllers
}
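Every controller registered above runs the same informer-plus-workqueue loop around its sync function (syncDeployment, syncReplicaSet, ...). A hedged, minimal sketch of that pattern; real controllers add resync, expectations tracking, and multiple workers:
import (
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
)

// sync is the per-key reconcile function, e.g. syncDeployment.
func runControllerLoop(informer cache.SharedIndexInformer, queue workqueue.RateLimitingInterface, sync func(key string) error) {
	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			if key, err := cache.MetaNamespaceKeyFunc(obj); err == nil {
				queue.Add(key) // enqueue namespace/name, not the object itself
			}
		},
	})
	for {
		key, shutdown := queue.Get()
		if shutdown {
			return
		}
		if err := sync(key.(string)); err != nil {
			queue.AddRateLimited(key) // retry with backoff on failure
		} else {
			queue.Forget(key)
		}
		queue.Done(key)
	}
}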
3. kube-scheduler
// During scheduling, nodes are filtered and scored through plugins; the default set is as follows
k8s.io/kubernetes/pkg/scheduler/apis/config/v1/default_plugins.go
func getDefaultPlugins() *v1.Plugins {
	plugins := &v1.Plugins{
		MultiPoint: v1.PluginSet{
			Enabled: []v1.Plugin{
				{Name: names.PrioritySort},
				{Name: names.NodeUnschedulable},
				{Name: names.NodeName},
				{Name: names.TaintToleration, Weight: pointer.Int32(3)},
				{Name: names.NodeAffinity, Weight: pointer.Int32(2)},
				{Name: names.NodePorts},
				{Name: names.NodeResourcesFit, Weight: pointer.Int32(1)},
				{Name: names.VolumeRestrictions},
				{Name: names.EBSLimits},
				{Name: names.GCEPDLimits},
				{Name: names.NodeVolumeLimits},
				{Name: names.AzureDiskLimits},
				{Name: names.VolumeBinding},
				{Name: names.VolumeZone},
				{Name: names.PodTopologySpread, Weight: pointer.Int32(2)},
				{Name: names.InterPodAffinity, Weight: pointer.Int32(2)},
				{Name: names.DefaultPreemption},
				{Name: names.NodeResourcesBalancedAllocation, Weight: pointer.Int32(1)},
				{Name: names.ImageLocality, Weight: pointer.Int32(1)},
				{Name: names.DefaultBinder},
			},
		},
	}
	applyFeatureGates(plugins)
	return plugins
}
// Scheduling entry point:
// k8s.io/kubernetes/pkg/scheduler/schedule_one.go
func (sched *Scheduler) scheduleOne(ctx context.Context) {
	podInfo := sched.NextPod()
	// pod could be nil when schedulerQueue is closed
	if podInfo == nil || podInfo.Pod == nil {
		return
	}
	pod := podInfo.Pod
	// Pick the framework (scheduler profile) for this pod
	fwk, err := sched.frameworkForPod(pod)
	// omitted ...
	// Run the scheduling cycle
	scheduleResult, assumedPodInfo, status := sched.schedulingCycle(schedulingCycleCtx, state, fwk, podInfo, start, podsToActivate)
	if !status.IsSuccess() {
		sched.FailureHandler(schedulingCycleCtx, fwk, assumedPodInfo, status, scheduleResult.nominatingInfo, start)
		return
	}
}
func (sched *Scheduler) schedulingCycle(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	podInfo *framework.QueuedPodInfo,
	start time.Time,
	podsToActivate *framework.PodsToActivate,
) (ScheduleResult, *framework.QueuedPodInfo, *framework.Status) {
	pod := podInfo.Pod
	// k8s.io/kubernetes/pkg/scheduler/schedule_one.go:schedulePod
	scheduleResult, err := sched.SchedulePod(ctx, fwk, state, pod)
	// omitted ...
	return scheduleResult, assumedPodInfo, nil
}
// The method that actually performs scheduling
func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	// Snapshot the nodes
	if err := sched.Cache.UpdateSnapshot(sched.nodeInfoSnapshot); err != nil {
		return result, err
	}
	// Check whether any node is available at all
	if sched.nodeInfoSnapshot.NumNodes() == 0 {
		return result, ErrNoNodesAvailable
	}
	// Filtering (predicates)
	feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod)
	if err != nil {
		return result, err
	}
	// Scoring (priorities)
	priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes)
	if err != nil {
		return result, err
	}
	// Pick the winning node from the scored list
	host, err := selectHost(priorityList)
	trace.Step("Prioritizing done")
	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap),
		FeasibleNodes:  len(feasibleNodes),
	}, err
}
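selectHost itself is a small scan over the scored list. A simplified sketch of its logic against 1.26's framework.NodeScoreList: keep the max score and break ties pseudo-randomly so equally scored nodes are picked evenly:
import (
	"fmt"
	"math/rand"

	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// Simplified sketch of selectHost's tie-breaking scan.
func selectHostSketch(list framework.NodeScoreList) (string, error) {
	if len(list) == 0 {
		return "", fmt.Errorf("empty priority list")
	}
	best := list[0]
	cntOfMax := 1
	for _, ns := range list[1:] {
		switch {
		case ns.Score > best.Score:
			best = ns
			cntOfMax = 1
		case ns.Score == best.Score:
			cntOfMax++
			// reservoir-style sampling: each max-score node is equally likely to win
			if rand.Intn(cntOfMax) == 0 {
				best = ns
			}
		}
	}
	return best.Name, nil
}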
// Scoring (priorities):
// k8s.io/kubernetes/pkg/scheduler/framework/interface.go
// Score plugins of various types implement this scoring interface
type ScorePlugin interface {
	Plugin
	// Score is called on each filtered node. It must return success and an integer
	// indicating the rank of the node. All scoring plugins must return success or
	// the pod will be rejected.
	Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)
	// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if it does not.
	ScoreExtensions() ScoreExtensions
}
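Out-of-tree schedulers implement this interface and register the plugin with the scheduler command. A minimal, hypothetical Score plugin; the name NodeNameLength and its scoring rule are illustrative only, and the plugin would still need to be enabled in a KubeSchedulerConfiguration profile:
package main

import (
	"context"
	"os"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/kubernetes/cmd/kube-scheduler/app"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

type NodeNameLength struct{}

var _ framework.ScorePlugin = &NodeNameLength{}

func (pl *NodeNameLength) Name() string { return "NodeNameLength" }

// Score ranks each feasible node; scoring by name length is purely illustrative.
func (pl *NodeNameLength) Score(ctx context.Context, state *framework.CycleState, p *v1.Pod, nodeName string) (int64, *framework.Status) {
	return int64(len(nodeName)), nil
}

// No normalization needed for this sketch.
func (pl *NodeNameLength) ScoreExtensions() framework.ScoreExtensions { return nil }

func newNodeNameLength(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) {
	return &NodeNameLength{}, nil
}

func main() {
	// app.WithPlugin registers the factory under the plugin's name.
	cmd := app.NewSchedulerCommand(app.WithPlugin("NodeNameLength", newNodeNameLength))
	if err := cmd.Execute(); err != nil {
		os.Exit(1)
	}
}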
4. kubelet
The pod update loop
- k8s.io/kubernetes/pkg/kubelet/kubelet.go:syncLoop
- k8s.io/kubernetes/pkg/kubelet/kubelet.go:syncLoopIteration
- k8s.io/kubernetes/pkg/kubelet/kubelet.go:handler.HandlePodAdditions(u.Pods) (handles newly added pods)
- k8s.io/kubernetes/pkg/kubelet/kubelet.go:dispatchWork (dispatches the creation work; podWorkers.UpdatePod spawns an async worker)
- k8s.io/kubernetes/pkg/kubelet/pod_workers.go:UpdatePod
- k8s.io/kubernetes/pkg/kubelet/pod_workers.go:managePodLoop(outCh) (if the pod is not yet running, enters the pod sync path)
- k8s.io/kubernetes/pkg/kubelet/pod_workers.go:p.syncPodFn
- k8s.io/kubernetes/pkg/kubelet/kubelet.go:klet.syncPod
- k8s.io/kubernetes/pkg/kubelet/kuberuntime/kuberuntime_manager.go:SyncPod
func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) {
	// Step 1: Compute sandbox and container changes.
	podContainerChanges := m.computePodActions(pod, podStatus)
	// ...
	// Step 2: Kill the pod if the sandbox has changed.
	if podContainerChanges.KillPod {
		if podContainerChanges.CreateSandbox {
			klog.V(4).InfoS("Stopping PodSandbox for pod, will start new one", "pod", klog.KObj(pod))
		} else {
			klog.V(4).InfoS("Stopping PodSandbox for pod, because all other containers are dead", "pod", klog.KObj(pod))
		}
	} else {
		// Step 3: kill any running containers in this pod which are not to keep.
		// Keep terminated init containers fairly aggressively controlled
		// This is an optimization because container removals are typically handled
		// by container garbage collector.
		// ...
	}
	// Step 4: Create a sandbox for the pod if necessary.
	// Calls the containerd API. The Kubernetes sandbox container is normally pause;
	// it exists to hold the netns, and the pod's other containers then share its
	// network and IPC namespaces.
	podSandboxID, msg, err = m.createPodSandbox(ctx, pod, podContainerChanges.Attempt)
	// m.runtimeService.RunPodSandbox(ctx, podSandboxConfig, runtimeHandler) is the actual call into containerd
	// ...
	// Step 5: start ephemeral containers
	// These are started "prior" to init containers to allow running ephemeral containers even when there
	// are errors starting an init container. In practice init containers will start first since ephemeral
	// containers cannot be specified on pod creation.
	for _, idx := range podContainerChanges.EphemeralContainersToStart {
		start(ctx, "ephemeral container", metrics.EphemeralContainer, ephemeralContainerStartSpec(&pod.Spec.EphemeralContainers[idx]))
	}
	// Step 6: start the init container.
	if container := podContainerChanges.NextInitContainerToStart; container != nil {
		// Start the next init container.
		if err := start(ctx, "init container", metrics.InitContainer, containerStartSpec(container)); err != nil {
			return
		}
		// Successfully started the container; clear the entry in the failure
		klog.V(4).InfoS("Completed init container for pod", "containerName", container.Name, "pod", klog.KObj(pod))
	}
	// Step 7: start containers in podContainerChanges.ContainersToStart.
	for _, idx := range podContainerChanges.ContainersToStart {
		start(ctx, "container", metrics.Container, containerStartSpec(&pod.Spec.Containers[idx]))
	}
	return
}
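Step 4 above bottoms out in a CRI gRPC call from kubelet's remote runtime client to containerd. A hedged sketch of that wire call via k8s.io/cri-api (connection setup elided; conn would point at containerd's CRI socket):
import (
	"context"

	"google.golang.org/grpc"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)

func runSandbox(ctx context.Context, conn *grpc.ClientConn, cfg *runtimeapi.PodSandboxConfig) (string, error) {
	client := runtimeapi.NewRuntimeServiceClient(conn)
	resp, err := client.RunPodSandbox(ctx, &runtimeapi.RunPodSandboxRequest{Config: cfg})
	if err != nil {
		return "", err
	}
	// The returned ID is the sandbox (pause container) ID used by later CRI calls.
	return resp.PodSandboxId, nil
}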
- k8s.io/kubernetes/pkg/kubelet/kuberuntime/kuberuntime_container.go:startContainer
- pull the image
- create the container
- start the container
- run the post start lifecycle hooks (if applicable)
func (m *kubeGenericRuntimeManager) startContainer(ctx context.Context, podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, spec *startSpec, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, podIPs []string) (string, error) {
	container := spec.container
	// Step 1: pull the image.
	imageRef, msg, err := m.imagePuller.EnsureImageExists(ctx, pod, container, pullSecrets, podSandboxConfig)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
		return msg, err
	}
	// Step 2: create the container.
	// For a new container, the RestartCount should be 0
	// ...
	containerID, err := m.runtimeService.CreateContainer(ctx, podSandboxID, containerConfig, podSandboxConfig)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
		return s.Message(), ErrCreateContainer
	}
	err = m.internalLifecycle.PreStartContainer(pod, container, containerID)
	// Step 3: start the container.
	err = m.runtimeService.StartContainer(ctx, containerID)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", s.Message())
		return s.Message(), kubecontainer.ErrRunContainer
	}
	// Symlink container logs to the legacy container log location for cluster logging
	// support.
	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
	// ...
	// Step 4: execute the post start hook.
	if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
		msg, handlerErr := m.runner.Run(ctx, kubeContainerID, pod, container, container.Lifecycle.PostStart)
		// ...
	}
	return "", nil
}
Pod admission (checked in HandlePodAdditions before the pod is dispatched):
- k8s.io/kubernetes/pkg/kubelet/kubelet.go:canAdmitPod
- Admission control: podAdmitHandler.Admit(attrs); every registered handler may reject the pod (a minimal handler sketch follows this list)
- Does the node satisfy the pod's affinity rules (noderesources/nodeport/nodeAffinity/nodename)?
- Does the node have enough resources to allocate to the pod?
- Does the pod's security context pass: if it requests HostNetwork or HostIPC, is that on the node's whitelist?
- Is the node currently under memory/disk pressure, etc.?
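All of these checks share the lifecycle.PodAdmitHandler shape. A minimal sketch of a handler in that shape; the pressure predicate is a stand-in, not the eviction manager's real logic:
import (
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)

type pressureAdmitHandler struct {
	underMemoryPressure func() bool // stand-in for the eviction manager's signal
}

func (h *pressureAdmitHandler) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
	if h.underMemoryPressure() {
		return lifecycle.PodAdmitResult{
			Admit:   false,
			Reason:  "NodeUnderMemoryPressure",
			Message: "node is under memory pressure; pod rejected",
		}
	}
	return lifecycle.PodAdmitResult{Admit: true}
}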
Lifecycle hooks
5. containerd
RunPodSandbox
docker/containerd/pkg/cri/server/sandbox_run.go:RunPodSandbox (initializes the pause container)
- Ensure the sandbox image exists
- Generate the container spec
- labels
- Initialize the fs
- Network initialization: docker/containerd/pkg/cri/server/sandbox_run.go:setupPodNetwork (a libcni sketch follows this list)
- /var/run/netns (the network namespace path)
- netPlugin.Setup (performs the setup through the CNI plugins)
- docker/containerd/vendor/github.com/containerd/go-cni/cni.go:Setup
- docker/containerd/vendor/github.com/containernetworking/cni/libcni/api.go:invoke.ExecPluginWithResult(ctx, pluginPath, newConf.Bytes, c.args("ADD", rt), c.exec) (execs the CNI plugin binary)
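netPlugin.Setup ultimately drives libcni, which execs each CNI binary with the ADD command. A hedged sketch of doing the same directly; the config and binary paths are illustrative:
import (
	"context"

	"github.com/containernetworking/cni/libcni"
)

func setupPodNetns(ctx context.Context, netnsPath, containerID string) error {
	cni := libcni.NewCNIConfig([]string{"/opt/cni/bin"}, nil) // nil exec: default exec'ing of plugin binaries
	conf, err := libcni.ConfListFromFile("/etc/cni/net.d/10-mynet.conflist")
	if err != nil {
		return err
	}
	rt := &libcni.RuntimeConf{
		ContainerID: containerID,
		NetNS:       netnsPath, // e.g. under /var/run/netns, as created by the sandbox
		IfName:      "eth0",
	}
	// AddNetworkList issues CNI ADD to every plugin in the chain, in order.
	_, err = cni.AddNetworkList(ctx, conf, rt)
	return err
}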
StartContainer
- pull image
- create container
- create task
- docker/containerd/services/tasks/local.go:getRuntime (resolves the runtime command, i.e. containerd-shim-runc-v2)
- docker/containerd/services/tasks/local.go:rtime.Create(ctx, r.ContainerID, opts) (container creation entry point; starts a shim instance)
- docker/containerd/runtime/v2/manager.go:m.manager.Start
- NewBundle (initializes the on-disk work directory, etc.)
- shimTask.Create(ctx, opts) (builds the shimTask and calls runc to start the container)
func NewContainer(ctx context.Context, platform stdio.Platform, r *task.CreateTaskRequest) (_ *Container, retErr error) {
	// Get the namespace
	ns, err := namespaces.NamespaceRequired(ctx)
	// omitted ...
	// Task creation config
	config := &process.CreateConfig{
		ID:               r.ID,
		Bundle:           r.Bundle,
		Runtime:          opts.BinaryName,
		Rootfs:           pmounts,
		Terminal:         r.Terminal,
		Stdin:            r.Stdin,
		Stdout:           r.Stdout,
		Stderr:           r.Stderr,
		Checkpoint:       r.Checkpoint,
		ParentCheckpoint: r.ParentCheckpoint,
		Options:          r.Options,
	}
	// Write options.json
	if err := WriteOptions(r.Bundle, opts); err != nil {
		return nil, err
	}
	// For historical reason, we write opts.BinaryName as well as the entire opts
	if err := WriteRuntime(r.Bundle, opts.BinaryName); err != nil {
		return nil, err
	}
	// omitted ...
	// Mount the rootfs
	if err := mount.All(mounts, rootfs); err != nil {
		return nil, fmt.Errorf("failed to mount rootfs component: %w", err)
	}
	// Build the runc init command
	p, err := newInit(
		ctx,
		r.Bundle,
		filepath.Join(r.Bundle, "work"),
		ns,
		platform,
		config,
		opts,
		rootfs,
	)
	// Create the process with the provided config
	// This is where runc is actually invoked
	if err := p.Create(ctx, config); err != nil {
		return nil, errdefs.ToGRPC(err)
	}
	container := &Container{
		ID:              r.ID,
		Bundle:          r.Bundle,
		process:         p,
		processes:       make(map[string]process.Process),
		reservedProcess: make(map[string]struct{}),
	}
	pid := p.Pid()
	// omitted ...
	return container, nil
}
Inspecting the container's root fs
docker/containerd/runtime/v1/linux/runtime.go:newBundle
ls /var/run/containerd/io.containerd.runtime.v2.task/example/redis-server
address config.json init.pid log log.json options.json rootfs runtime shim-binary-path work
6. containerd-shim-runc-v2
func run(ctx context.Context, manager Manager, initFunc Init, name string, config Config) error {
	// ... omitted: flag parsing; "action" below is the shim subcommand
	switch action {
	case "start":
		opts := StartOpts{
			Address:      addressFlag,
			TTRPCAddress: ttrpcAddress,
			Debug:        debugFlag,
		}
		// Startup entry point
		address, err := manager.Start(ctx, id, opts)
		if err != nil {
			return err
		}
		// The shim writes its ttrpc address to stdout for containerd to read
		if _, err := os.Stdout.WriteString(address); err != nil {
			return err
		}
		return nil
	}
	// ... omitted
}
func (s *service) StartShim(ctx context.Context, opts shim.StartOpts) (_ string, retErr error) {
	// Build the command template used to launch the shim daemon
	cmd, err := newCommand(ctx, opts.ID, opts.Address, opts.TTRPCAddress)
	// ... omitted
	return address, nil
}
Invocation sequence (a sketch of the first step follows this list):
- /usr/bin/containerd-shim-runc-v2 -namespace example -address /run/containerd/containerd.sock -publish-binary /usr/bin/containerd-shim-runc-v2 -id redis-server --debug start (containerd-shim-runc-v2 re-executes itself)
- /usr/bin/containerd-shim-runc-v2 -namespace example -id redis-server -address /run/containerd/containerd.sock -debug (starts the ttrpc service)
- runc create --bundle /tmp/redis-server (creates the container)
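A hedged sketch of the first step from containerd's side: exec the shim's start subcommand and read back the ttrpc address it prints on stdout (flags copied from the trace above; error handling trimmed):
import (
	"context"
	"os/exec"
	"strings"
)

func startShimProcess(ctx context.Context, bundle string) (string, error) {
	cmd := exec.CommandContext(ctx, "/usr/bin/containerd-shim-runc-v2",
		"-namespace", "example",
		"-address", "/run/containerd/containerd.sock",
		"-id", "redis-server",
		"start",
	)
	cmd.Dir = bundle // the bundle directory shown in the ls output above
	out, err := cmd.Output()
	if err != nil {
		return "", err
	}
	// StartShim wrote the ttrpc socket address to stdout (see run() above).
	return strings.TrimSpace(string(out)), nil
}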
7. runc
Local debugging
(dlv) print cm
github.com/opencontainers/runc/libcontainer/cgroups.Manager(*github.com/opencontainers/runc/libcontainer/cgroups/fs.Manager) *{
mu: sync.Mutex {state: 0, sema: 0},
cgroups: *github.com/opencontainers/runc/libcontainer/configs.Cgroup {
Name: "",
Parent: "",
Path: "/example/redis-server",
ScopePrefix: "",
Resources: *(*"github.com/opencontainers/runc/libcontainer/configs.Resources")(0xc000218180),
Systemd: false,
SystemdProps: []github.com/coreos/go-systemd/v22/dbus.Property len: 0, cap: 0, nil,
Rootless: false,
OwnerUID: *int nil,},
paths: map[string]string [
"memory": "/sys/fs/cgroup/memory/example/redis-server",
"pids": "/sys/fs/cgroup/pids/example/redis-server",
"perf_event": "/sys/fs/cgroup/perf_event/example/redis-server",
"freezer": "/sys/fs/cgroup/freezer/example/redis-server",
"name=systemd": "/sys/fs/cgroup/systemd/example/redis-server",
"cpuset": "/sys/fs/cgroup/cpuset/example/redis-server",
"devices": "/sys/fs/cgroup/devices/example/redis-server",
"cpu": "/sys/fs/cgroup/cpu,cpuacct/example/redis-server",
"cpuacct": "/sys/fs/cgroup/cpu,cpuacct/example/redis-server",
"blkio": "/sys/fs/cgroup/blkio/example/redis-server",
"hugetlb": "/sys/fs/cgroup/hugetlb/example/redis-server",
"net_cls": "/sys/fs/cgroup/net_cls,net_prio/example/redis-server",
"net_prio": "/sys/fs/cgroup/net_cls,net_prio/example/redis-server",
],}
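Those paths are exactly where the fs (cgroup v1) manager applies resource limits: each setting is a file write under the matching subsystem directory. A minimal sketch, assuming cgroup v1 and the paths printed above; the helper name is illustrative:
import (
	"os"
	"path/filepath"
	"strconv"
)

// Illustrative only: runc's fs manager does this (plus validation) for every subsystem.
func setMemoryLimitV1(memCgroupPath string, limitBytes int64) error {
	file := filepath.Join(memCgroupPath, "memory.limit_in_bytes")
	return os.WriteFile(file, []byte(strconv.FormatInt(limitBytes, 10)), 0o644)
}

// e.g. setMemoryLimitV1("/sys/fs/cgroup/memory/example/redis-server", 256<<20)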