How does each component respond when a Deployment is created?

kubernetes version: 1.26

Scenario: creating a Deployment

1. kube-apiserver

  1. The API endpoint receives the create request (a client-go sketch of this call follows the list)
  2. Writes the object to etcd and updates the resource
  3. kube-controller-manager is notified of the change through its watch on Deployment resources
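
Everything below is triggered by a single create call against the apiserver. A minimal client-go sketch of that call, assuming a local kubeconfig; namespace, name and image are placeholders and error handling is shortened:

package main

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// load ~/.kube/config and build a clientset
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	cs, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	replicas := int32(2)
	deploy := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{Name: "redis-server", Namespace: "default"},
		Spec: appsv1.DeploymentSpec{
			Replicas: &replicas,
			Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "redis-server"}},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "redis-server"}},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{Name: "redis", Image: "redis:7"}},
				},
			},
		},
	}

	// POST /apis/apps/v1/namespaces/default/deployments -> apiserver validates and persists to etcd
	_, err = cs.AppsV1().Deployments("default").Create(context.TODO(), deploy, metav1.CreateOptions{})
	if err != nil {
		panic(err)
	}
}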

2. kube-controller-manager

  1. Initializes the controllers (deployment, replicaset, ...); each watches its resources via informers (sketch below)
  2. Enters the Deployment sync loop: k8s.io/kubernetes/pkg/controller/deployment/deployment_controller.go:syncDeployment
  3. Creates the ReplicaSet
  4. Enters the ReplicaSet sync loop: k8s.io/kubernetes/pkg/controller/replicaset/replica_set.go:syncReplicaSet
  5. Creates the Pods
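
The "watch" is implemented with informers. A rough sketch of how a controller observes Deployment events with client-go; the handler body is illustrative (a real controller enqueues keys into a workqueue rather than printing):

package main

import (
	"fmt"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	cs, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	factory := informers.NewSharedInformerFactory(cs, 30*time.Second)
	inf := factory.Apps().V1().Deployments().Informer()
	inf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			d := obj.(*appsv1.Deployment)
			// a real controller enqueues namespace/name here and syncs in a worker
			fmt.Printf("deployment added: %s/%s\n", d.Namespace, d.Name)
		},
	})

	stop := make(chan struct{})
	defer close(stop)
	factory.Start(stop)
	cache.WaitForCacheSync(stop, inf.HasSynced)
	select {} // block forever; events arrive via the handler
}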

The controllers registered by kube-controller-manager:

func NewControllerInitializers(loopMode ControllerLoopMode) map[string]InitFunc {
    // ... omitted ...

	register("endpoint", startEndpointController)
	register("endpointslice", startEndpointSliceController)
	register("endpointslicemirroring", startEndpointSliceMirroringController)
	register("replicationcontroller", startReplicationController)
	register("podgc", startPodGCController)
	register("resourcequota", startResourceQuotaController)
	register("namespace", startNamespaceController)
	register("serviceaccount", startServiceAccountController)
	register("garbagecollector", startGarbageCollectorController)
	register("daemonset", startDaemonSetController)
	register("job", startJobController)
	register("deployment", startDeploymentController)
	register("replicaset", startReplicaSetController)
	register("horizontalpodautoscaling", startHPAController)
	register("disruption", startDisruptionController)
	register("statefulset", startStatefulSetController)
	register("cronjob", startCronJobController)
	register("csrsigning", startCSRSigningController)
	register("csrapproving", startCSRApprovingController)
	register("csrcleaner", startCSRCleanerController)
	register("ttl", startTTLController)
	register("bootstrapsigner", startBootstrapSignerController)
	register("tokencleaner", startTokenCleanerController)
	register("nodeipam", startNodeIpamController)
	register("nodelifecycle", startNodeLifecycleController)
	if loopMode == IncludeCloudLoops {
		register("service", startServiceController)
		register("route", startRouteController)
		register("cloud-node-lifecycle", startCloudNodeLifecycleController)
		// TODO: volume controller into the IncludeCloudLoops only set.
	}
	register("persistentvolume-binder", startPersistentVolumeBinderController)
	register("attachdetach", startAttachDetachController)
	register("persistentvolume-expander", startVolumeExpandController)
	register("clusterrole-aggregation", startClusterRoleAggregrationController)
	register("pvc-protection", startPVCProtectionController)
	register("pv-protection", startPVProtectionController)
	register("ttl-after-finished", startTTLAfterFinishedController)
	register("root-ca-cert-publisher", startRootCACertPublisher)
	register("ephemeral-volume", startEphemeralVolumeController)
	if utilfeature.DefaultFeatureGate.Enabled(genericfeatures.APIServerIdentity) &&
		utilfeature.DefaultFeatureGate.Enabled(genericfeatures.StorageVersionAPI) {
		register("storage-version-gc", startStorageVersionGCController)
	}
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		controllers["resource-claim-controller"] = startResourceClaimController
	}

	return controllers
}

3. kube-scheduler

// During scheduling, k8s filters and scores nodes through plugins; the defaults are listed below
k8s.io/kubernetes/pkg/scheduler/apis/config/v1/default_plugins.go


func getDefaultPlugins() *v1.Plugins {
	plugins := &v1.Plugins{
		MultiPoint: v1.PluginSet{
			Enabled: []v1.Plugin{
				{Name: names.PrioritySort},
				{Name: names.NodeUnschedulable},
				{Name: names.NodeName},
				{Name: names.TaintToleration, Weight: pointer.Int32(3)},
				{Name: names.NodeAffinity, Weight: pointer.Int32(2)},
				{Name: names.NodePorts},
				{Name: names.NodeResourcesFit, Weight: pointer.Int32(1)},
				{Name: names.VolumeRestrictions},
				{Name: names.EBSLimits},
				{Name: names.GCEPDLimits},
				{Name: names.NodeVolumeLimits},
				{Name: names.AzureDiskLimits},
				{Name: names.VolumeBinding},
				{Name: names.VolumeZone},
				{Name: names.PodTopologySpread, Weight: pointer.Int32(2)},
				{Name: names.InterPodAffinity, Weight: pointer.Int32(2)},
				{Name: names.DefaultPreemption},
				{Name: names.NodeResourcesBalancedAllocation, Weight: pointer.Int32(1)},
				{Name: names.ImageLocality, Weight: pointer.Int32(1)},
				{Name: names.DefaultBinder},
			},
		},
	}
	applyFeatureGates(plugins)

	return plugins
}

// Scheduling entry point:
// k8s.io/kubernetes/pkg/scheduler/schedule_one.go

func (sched *Scheduler) scheduleOne(ctx context.Context) {
	podInfo := sched.NextPod()
	// pod could be nil when schedulerQueue is closed
	if podInfo == nil || podInfo.Pod == nil {
		return
	}
	pod := podInfo.Pod
	// select the scheduling framework for this pod
	fwk, err := sched.frameworkForPod(pod)
	// ... omitted ...

	// run the scheduling cycle
	scheduleResult, assumedPodInfo, status := sched.schedulingCycle(schedulingCycleCtx, state, fwk, podInfo, start, podsToActivate)
	if !status.IsSuccess() {
		sched.FailureHandler(schedulingCycleCtx, fwk, assumedPodInfo, status, scheduleResult.nominatingInfo, start)
		return
	}
}
func (sched *Scheduler) schedulingCycle(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	podInfo *framework.QueuedPodInfo,
	start time.Time,
	podsToActivate *framework.PodsToActivate,
) (ScheduleResult, *framework.QueuedPodInfo, *framework.Status) {
	pod := podInfo.Pod
	// k8s.io/kubernetes/pkg/scheduler/schedule_one.go:schedulePod
	scheduleResult, err := sched.SchedulePod(ctx, fwk, state, pod)
	// ... omitted ...
	return scheduleResult, assumedPodInfo, nil
}

// The method that actually performs scheduling
func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	// take a snapshot of the nodes
	if err := sched.Cache.UpdateSnapshot(sched.nodeInfoSnapshot); err != nil {
		return result, err
	}
	// check whether any schedulable nodes exist
	if sched.nodeInfoSnapshot.NumNodes() == 0 {
		return result, ErrNoNodesAvailable
	}

	feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod)
	if err != nil {
		return result, err
	}
	// score (prioritize) the feasible nodes
	priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes)
	if err != nil {
		return result, err
	}
	// pick the node with the highest score from the prioritized list
	host, err := selectHost(priorityList)
	trace.Step("Prioritizing done")

	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap),
		FeasibleNodes:  len(feasibleNodes),
	}, err
}
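
selectHost simply picks the highest-scoring node and breaks ties randomly. A standalone toy version of that logic (not the real implementation, which operates on the framework's score list types):

package main

import (
	"errors"
	"fmt"
	"math/rand"
)

type nodeScore struct {
	Name  string
	Score int64
}

// selectHost picks the node with the maximum score; ties are broken uniformly
// at random via reservoir sampling.
func selectHost(list []nodeScore) (string, error) {
	if len(list) == 0 {
		return "", errors.New("empty priority list")
	}
	best := list[0]
	cntOfMax := 1
	for _, ns := range list[1:] {
		switch {
		case ns.Score > best.Score:
			best = ns
			cntOfMax = 1
		case ns.Score == best.Score:
			cntOfMax++
			if rand.Intn(cntOfMax) == 0 {
				best = ns // keeps the choice uniform among all tied nodes
			}
		}
	}
	return best.Name, nil
}

func main() {
	host, err := selectHost([]nodeScore{{"node-a", 58}, {"node-b", 61}, {"node-c", 61}})
	if err != nil {
		panic(err)
	}
	fmt.Println("selected:", host)
}
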
// Scoring:
// k8s.io/kubernetes/pkg/scheduler/framework/interface.go
// Score plugins of various kinds implement this scoring interface

type ScorePlugin interface {
	Plugin
	// Score is called on each filtered node. It must return success and an integer
	// indicating the rank of the node. All scoring plugins must return success or
	// the pod will be rejected.
	Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)

	// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if does not.
	ScoreExtensions() ScoreExtensions
}
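
As an illustration of the interface above, a minimal custom score plugin could look like the sketch below. The plugin name FewestPodsScore and its heuristic are made up; it assumes the in-tree framework package and would have to be wired into a scheduler build via a plugin registry:

package fewestpods

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// fewestPodsScore prefers nodes that currently run fewer pods.
type fewestPodsScore struct {
	handle framework.Handle
}

var _ framework.ScorePlugin = &fewestPodsScore{}

func (p *fewestPodsScore) Name() string { return "FewestPodsScore" }

func (p *fewestPodsScore) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
	nodeInfo, err := p.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
	if err != nil {
		return 0, framework.AsStatus(err)
	}
	// fewer pods -> higher score; clamp into [MinNodeScore, MaxNodeScore]
	score := framework.MaxNodeScore - int64(len(nodeInfo.Pods))
	if score < framework.MinNodeScore {
		score = framework.MinNodeScore
	}
	return score, nil
}

// ScoreExtensions is nil because the raw score is already in the valid range.
func (p *fewestPodsScore) ScoreExtensions() framework.ScoreExtensions { return nil }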

4. kubelet

Pod update loop

  1. k8s.io/kubernetes/pkg/kubelet/kubelet.go:syncLoop
  2. k8s.io/kubernetes/pkg/kubelet/kubelet.go:syncLoopIteration
    1. k8s.io/kubernetes/pkg/kubelet/kubelet.go:handler.HandlePodAdditions(u.Pods) handles the newly added pods
    2. k8s.io/kubernetes/pkg/kubelet/kubelet.go:dispatchWork dispatches the creation work; podWorkers.UpdatePod starts an asynchronous worker
      1. k8s.io/kubernetes/pkg/kubelet/pod_workers.go:UpdatePod
        1. k8s.io/kubernetes/pkg/kubelet/pod_workers.go:managePodLoop(outCh) sees that no pod state exists yet and enters the pod sync method
          1. k8s.io/kubernetes/pkg/kubelet/pod_workers.go:p.syncPodFn
          2. k8s.io/kubernetes/pkg/kubelet/kubelet.go:klet.syncPod
          3. k8s.io/kubernetes/pkg/kubelet/kuberuntime/kuberuntime_manager.go:SyncPod
          func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) {
          // Step 1: Compute sandbox and container changes.
          	podContainerChanges := m.computePodActions(pod, podStatus)
          // ...
          // Step 2: Kill the pod if the sandbox has changed.
          	if podContainerChanges.KillPod {
          		if podContainerChanges.CreateSandbox {
          			klog.V(4).InfoS("Stopping PodSandbox for pod, will start new one", "pod", klog.KObj(pod))
          		} else {
          			klog.V(4).InfoS("Stopping PodSandbox for pod, because all other containers are dead", "pod", klog.KObj(pod))
          		}
          	} else {
          // Step 3: kill any running containers in this pod which are not to keep.     
          // Keep terminated init containers fairly aggressively controlled
          // This is an optimization because container removals are typically handled
          // by container garbage collector.
          // ...
          // Step 4: Create a sandbox for the pod if necessary.
          // calls the container runtime (CRI); the k8s sandbox container is usually pause, which holds the netns; the pod's other containers then join and share its network (and IPC) namespaces
          		podSandboxID, msg, err = m.createPodSandbox(ctx, pod, podContainerChanges.Attempt)
          // m.runtimeService.RunPodSandbox(ctx, podSandboxConfig, runtimeHandler) calls containerd to create the sandbox
          //...
          // Step 5: start ephemeral containers
          // These are started "prior" to init containers to allow running ephemeral containers even when there
          // are errors starting an init container. In practice init containers will start first since ephemeral
          // containers cannot be specified on pod creation.
          	for _, idx := range podContainerChanges.EphemeralContainersToStart {
          		start(ctx, "ephemeral container", metrics.EphemeralContainer, ephemeralContainerStartSpec(&pod.Spec.EphemeralContainers[idx]))
          	}
          // Step 6: start the init container.
          	if container := podContainerChanges.NextInitContainerToStart; container != nil {
          // Start the next init container.
          		if err := start(ctx, "init container", metrics.InitContainer, containerStartSpec(container)); err != nil {
          			return
          		}
          // Successfully started the container; clear the entry in the failure
          		klog.V(4).InfoS("Completed init container for pod", "containerName", container.Name, "pod", klog.KObj(pod))
          	}
          // Step 7: start containers in podContainerChanges.ContainersToStart.
          	for _, idx := range podContainerChanges.ContainersToStart {
          		start(ctx, "container", metrics.Container, containerStartSpec(&pod.Spec.Containers[idx]))
          	}
          }
          
          1. k8s.io/kubernetes/pkg/kubelet/kuberuntime/kuberuntime_container.go:startContainer
            1. pull the image
            2. create the container
            3. start the container
            4. run the post start lifecycle hooks (if applicable)
            func (m *kubeGenericRuntimeManager) startContainer(ctx context.Context, podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, spec *startSpec, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, podIPs []string) (string, error) {
               container := spec.container
            // Step 1: pull the image.
            imageRef, msg, err := m.imagePuller.EnsureImageExists(ctx, pod, container, pullSecrets, podSandboxConfig)
            if err != nil {
            	s, _ := grpcstatus.FromError(err)
            	m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
            	return msg, err
            }
            // Step 2: create the container.
            // For a new container, the RestartCount should be 0
            // ...
            containerID, err := m.runtimeService.CreateContainer(ctx, podSandboxID, containerConfig, podSandboxConfig)
            if err != nil {
            	s, _ := grpcstatus.FromError(err)
            	m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
            	return s.Message(), ErrCreateContainer
            }
            err = m.internalLifecycle.PreStartContainer(pod, container, containerID)
            // Step 3: start the container.
            err = m.runtimeService.StartContainer(ctx, containerID)
            if err != nil {
            	s, _ := grpcstatus.FromError(err)
            	m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", s.Message())
            	return s.Message(), kubecontainer.ErrRunContainer
            }
            // Symlink container logs to the legacy container log location for cluster logging
            // support.
            // TODO(random-liu): Remove this after cluster logging supports CRI container log path.
            // ...
            // Step 4: execute the post start hook.
            if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
            	msg, handlerErr := m.runner.Run(ctx, kubeContainerID, pod, container, container.Lifecycle.PostStart)
            // ...
            }
            
    3. k8s.io/kubernetes/pkg/kubelet/kubelet.go:canAdmitPod
      1. Admission control: podAdmitHandler.Admit(attrs) (see the sketch after this list)
        1. Does the node satisfy the pod's placement rules (noderesources/nodeport/nodeAffinity/nodename)
        2. Does the node have enough resources to allocate to the pod
        3. Does the security context fit: if the pod requests HostNetwork or HostIPC, is that allowed (whitelisted) on this node
        4. Is the node currently under memory/disk pressure, etc.
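
To make the admission hook concrete, a kubelet-style admit handler has the shape below. This is a sketch against the lifecycle.PodAdmitHandler interface; the "reject hostNetwork" policy is invented for illustration:

package admit

import (
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)

// denyHostNetwork is an illustrative admit handler: it rejects pods that
// request hostNetwork; everything else is admitted.
type denyHostNetwork struct{}

var _ lifecycle.PodAdmitHandler = denyHostNetwork{}

func (denyHostNetwork) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
	if attrs.Pod.Spec.HostNetwork {
		return lifecycle.PodAdmitResult{
			Admit:   false,
			Reason:  "HostNetworkNotAllowed",
			Message: "this node does not admit hostNetwork pods",
		}
	}
	return lifecycle.PodAdmitResult{Admit: true}
}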

Lifecycle hooks
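
The heading above was left empty in the original notes. For reference, postStart/preStop hooks are declared on the container spec and executed by the kubelet right after start / before stop (the postStart case is Step 4 of startContainer above). A small sketch with the core/v1 types; the commands are illustrative:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	c := corev1.Container{
		Name:  "redis",
		Image: "redis:7",
		Lifecycle: &corev1.Lifecycle{
			// run inside the container right after it starts (kubelet's m.runner.Run in Step 4)
			PostStart: &corev1.LifecycleHandler{
				Exec: &corev1.ExecAction{Command: []string{"sh", "-c", "echo started > /tmp/ready"}},
			},
			// run before the container is stopped, ahead of SIGTERM
			PreStop: &corev1.LifecycleHandler{
				Exec: &corev1.ExecAction{Command: []string{"redis-cli", "shutdown", "nosave"}},
			},
		},
	}
	fmt.Println(c.Name, "has postStart hook:", c.Lifecycle.PostStart != nil)
}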

5. containerd

RunPodSandbox

docker/containerd/pkg/cri/server/sandbox_run.go:RunPodSandbox initializes the pause (sandbox) container

  1. Ensure the sandbox image exists
  2. Generate the container (sandbox) spec
  3. Apply labels
  4. Initialize the filesystem/snapshot
  5. Network initialization: docker/containerd/pkg/cri/server/sandbox_run.go:setupPodNetwork (see the libcni sketch below)
    1. /var/run/netns is the network namespace path
    2. netPlugin.Setup initializes the network via the CNI plugin
      1. docker/containerd/vendor/github.com/containerd/go-cni/cni.go:Setup
      2. docker/containerd/vendor/github.com/containernetworking/cni/libcni/api.go:invoke.ExecPluginWithResult(ctx, pluginPath, newConf.Bytes, c.args("ADD", rt), c.exec) invokes the CNI plugin binary
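
What netPlugin.Setup ultimately does is drive the CNI plugin binaries through libcni. A standalone sketch of that call path; the config path, network name and netns path are assumptions:

package main

import (
	"context"
	"fmt"

	"github.com/containernetworking/cni/libcni"
)

func main() {
	ctx := context.Background()

	// load a CNI network config list (path/name are assumptions)
	netConf, err := libcni.ConfListFromFile("/etc/cni/net.d/10-mynet.conflist")
	if err != nil {
		panic(err)
	}

	// plugin binaries conventionally live in /opt/cni/bin
	cniCfg := libcni.NewCNIConfig([]string{"/opt/cni/bin"}, nil)

	rt := &libcni.RuntimeConf{
		ContainerID: "redis-server",
		NetNS:       "/var/run/netns/cni-demo", // the netns created for the sandbox
		IfName:      "eth0",
	}

	// ADD: execs the plugin binary with CNI_COMMAND=ADD (ExecPluginWithResult underneath)
	result, err := cniCfg.AddNetworkList(ctx, netConf, rt)
	if err != nil {
		panic(err)
	}
	fmt.Println(result)
}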

StartContainer

  1. pull image
  2. create container
  3. create task
    // get the runtime command
    1. docker/containerd/services/tasks/local.go:getRuntime => resolves containerd-shim-runc-v2
      // container creation entry point
    2. docker/containerd/services/tasks/local.go:rtime.Create(ctx, r.ContainerID, opts) starts the shim instance
      1. docker/containerd/runtime/v2/manager.go:m.manager.Start
        1. NewBundle initializes the on-disk bundle/work directories, etc.
        2. shimTask.Create(ctx, opts) builds the shimTask and invokes runc to start the container
          func NewContainer(ctx context.Context, platform stdio.Platform, r *task.CreateTaskRequest) (_ *Container, retErr error) {
          // get the namespace
          	ns, err := namespaces.NamespaceRequired(ctx)
          // ... omitted ...
          // task creation config
              config := &process.CreateConfig{
          		ID:               r.ID,
          		Bundle:           r.Bundle,
          		Runtime:          opts.BinaryName,
          		Rootfs:           pmounts,
          		Terminal:         r.Terminal,
          		Stdin:            r.Stdin,
          		Stdout:           r.Stdout,
          		Stderr:           r.Stderr,
          		Checkpoint:       r.Checkpoint,
          		ParentCheckpoint: r.ParentCheckpoint,
          		Options:          r.Options,
          	}
          // write options.json
          	if err := WriteOptions(r.Bundle, opts); err != nil {
          		return nil, err
          	}
          // For historical reason, we write opts.BinaryName as well as the entire opts
          	if err := WriteRuntime(r.Bundle, opts.BinaryName); err != nil {
          		return nil, err
          	}
          // ... omitted ...
          // mount the rootfs
          	if err := mount.All(mounts, rootfs); err != nil {
          		return nil, fmt.Errorf("failed to mount rootfs component: %w", err)
          	}
              // initialize the runc init process (builds the runc command)
          	p, err := newInit(
          		ctx,
          		r.Bundle,
          		filepath.Join(r.Bundle, "work"),
          		ns,
          		platform,
          		config,
          		opts,
          		rootfs,
          	)
              // Create the process with the provided config
              // this is where the runc command is actually executed (runc create)
          	if err := p.Create(ctx, config); err != nil {
          		return nil, errdefs.ToGRPC(err)
          	}
          	container := &Container{
          		ID:              r.ID,
          		Bundle:          r.Bundle,
          		process:         p,
          		processes:       make(map[string]process.Process),
          		reservedProcess: make(map[string]struct{}),
          	}
          	pid := p.Pid()
          // ... omitted ...
          	return container, nil
          }
                          
          

Inspecting the container's bundle / root fs

docker/containerd/runtime/v1/linux/runtime.go:newBundle

ls /var/run/containerd/io.containerd.runtime.v2.task/example/redis-server
address  config.json  init.pid  log  log.json  options.json  rootfs  runtime  shim-binary-path  work
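
config.json in that bundle is the OCI runtime spec that runc consumes. A small sketch to peek at it with the runtime-spec Go types; the path follows the listing above:

package main

import (
	"encoding/json"
	"fmt"
	"os"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
	data, err := os.ReadFile("/var/run/containerd/io.containerd.runtime.v2.task/example/redis-server/config.json")
	if err != nil {
		panic(err)
	}
	var spec specs.Spec
	if err := json.Unmarshal(data, &spec); err != nil {
		panic(err)
	}
	fmt.Println("OCI version:", spec.Version)
	if spec.Root != nil {
		fmt.Println("rootfs:", spec.Root.Path)
	}
	if spec.Process != nil {
		fmt.Println("init args:", spec.Process.Args)
	}
}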

6. containerd-shim-runc-v2


func run(ctx context.Context, manager Manager, initFunc Init, name string, config Config) error {
	// ... omitted ...
	case "start":
		opts := StartOpts{
			Address:      addressFlag,
			TTRPCAddress: ttrpcAddress,
			Debug:        debugFlag,
		}
		// start entry point
		address, err := manager.Start(ctx, id, opts)
		if err != nil {
			return err
		}
		if _, err := os.Stdout.WriteString(address); err != nil {
			return err
		}
		return nil
	// ... other subcommands omitted ...
	}
}

func (s *service) StartShim(ctx context.Context, opts shim.StartOpts) (_ string, retErr error) {
	// build the command used to start the shim
	cmd, err := newCommand(ctx, opts.ID, opts.Address, opts.TTRPCAddress)
	// ... omitted ...
	return address, nil
}

Invocation sequence:

  1. /usr/bin/containerd-shim-runc-v2 -namespace example -address /run/containerd/containerd.sock -publish-binary /usr/bin/containerd-shim-runc-v2 -id redis-server --debug start (containerd-shim-runc-v2 re-executes itself)
  2. /usr/bin/containerd-shim-runc-v2 -namespace example -id redis-server -address /run/containerd/containerd.sock -debug (starts the ttrpc service)
  3. runc create --bundle /tmp/redis-server (creates the container; see the go-runc sketch below)
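
The shim drives runc through the go-runc wrapper rather than shelling out by hand. A rough equivalent of the last step above, using github.com/containerd/go-runc; the id and bundle follow the example, I/O setup is omitted, so treat it as a sketch:

package main

import (
	"context"
	"fmt"

	runc "github.com/containerd/go-runc"
)

func main() {
	ctx := context.Background()
	r := &runc.Runc{Command: "runc"}

	// roughly: runc create --bundle /tmp/redis-server redis-server
	if err := r.Create(ctx, "redis-server", "/tmp/redis-server", &runc.CreateOpts{}); err != nil {
		panic(err)
	}
	// roughly: runc start redis-server
	if err := r.Start(ctx, "redis-server"); err != nil {
		panic(err)
	}

	containers, err := r.List(ctx)
	if err != nil {
		panic(err)
	}
	for _, c := range containers {
		fmt.Println(c.ID, c.Status, c.Pid)
	}
}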

7. runc

Local debugging

(dlv) print cm
github.com/opencontainers/runc/libcontainer/cgroups.Manager(*github.com/opencontainers/runc/libcontainer/cgroups/fs.Manager) *{
	mu: sync.Mutex {state: 0, sema: 0},
	cgroups: *github.com/opencontainers/runc/libcontainer/configs.Cgroup {
		Name: "",
		Parent: "",
		Path: "/example/redis-server",
		ScopePrefix: "",
		Resources: *(*"github.com/opencontainers/runc/libcontainer/configs.Resources")(0xc000218180),
		Systemd: false,
		SystemdProps: []github.com/coreos/go-systemd/v22/dbus.Property len: 0, cap: 0, nil,
		Rootless: false,
		OwnerUID: *int nil,},
	paths: map[string]string [
		"memory": "/sys/fs/cgroup/memory/example/redis-server",
		"pids": "/sys/fs/cgroup/pids/example/redis-server",
		"perf_event": "/sys/fs/cgroup/perf_event/example/redis-server",
		"freezer": "/sys/fs/cgroup/freezer/example/redis-server",
		"name=systemd": "/sys/fs/cgroup/systemd/example/redis-server",
		"cpuset": "/sys/fs/cgroup/cpuset/example/redis-server",
		"devices": "/sys/fs/cgroup/devices/example/redis-server",
		"cpu": "/sys/fs/cgroup/cpu,cpuacct/example/redis-server",
		"cpuacct": "/sys/fs/cgroup/cpu,cpuacct/example/redis-server",
		"blkio": "/sys/fs/cgroup/blkio/example/redis-server",
		"hugetlb": "/sys/fs/cgroup/hugetlb/example/redis-server",
		"net_cls": "/sys/fs/cgroup/net_cls,net_prio/example/redis-server",
		"net_prio": "/sys/fs/cgroup/net_cls,net_prio/example/redis-server",
	],}
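
Those cgroup v1 paths can be cross-checked directly on the filesystem. A tiny sketch that lists the PIDs in the container's memory cgroup; the path is taken from the dlv output above:

package main

import (
	"fmt"
	"os"
)

func main() {
	// cgroup.procs holds one member PID per line
	data, err := os.ReadFile("/sys/fs/cgroup/memory/example/redis-server/cgroup.procs")
	if err != nil {
		panic(err)
	}
	fmt.Print(string(data))
}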