Skip to content

AlmaLinux 10 Kubernetes 集群部署指南

本指南详细介绍如何在 AlmaLinux 10 上部署生产级 Kubernetes 集群,包括 kubeadm、containerd 配置和企业级最佳实践。

部署概述

Kubernetes 技术栈

yaml
容器运行时: containerd 1.7+ (推荐)
网络插件: Calico / Flannel / Cilium
存储: Longhorn / Rook-Ceph / NFS-CSI
负载均衡: MetalLB / HAProxy + Keepalived
监控: Prometheus + Grafana
日志: ELK Stack / Loki
镜像仓库: Harbor / 阿里云 ACR

集群规划

| 组件类型 | 最小配置 | 推荐配置 | 高可用配置 |
| --- | --- | --- | --- |
| Master 节点 | 1C2G | 2C4G | 3节点 x 4C8G |
| Worker 节点 | 2C4G | 4C8G | 5+节点 x 8C16G |
| etcd 存储 | 20GB SSD | 50GB SSD | 独立3节点 x 100GB |
| 容器镜像存储 | 50GB | 200GB | 500GB+ |
| 网络带宽 | 1Gbps | 10Gbps | 25Gbps+ |

第一阶段:系统环境准备

1.1 系统基础配置

主机名和网络设置

bash
# Set the hostname — run ONLY the line that matches the node you are on.
# (Running all three in sequence, as a copy-paste would, leaves every node
# named k8s-worker02.)
hostnamectl set-hostname k8s-master01    # on the master node
# hostnamectl set-hostname k8s-worker01  # on worker 1
# hostnamectl set-hostname k8s-worker02  # on worker 2

# Append cluster node entries to /etc/hosts (guarded so re-running the
# script does not create duplicate entries).
grep -q 'k8s-master01' /etc/hosts || cat >> /etc/hosts << 'EOF'
# Kubernetes 集群节点
192.168.100.10 k8s-master01
192.168.100.20 k8s-worker01
192.168.100.21 k8s-worker02
EOF

# Disable swap (kubelet refuses to start with swap enabled by default)
swapoff -a
# Comment out swap entries instead of deleting them — the original
# `sed '/swap/d'` would also remove unrelated lines that merely contain
# the word "swap" (e.g. a mount path).
sed -ri 's/^([^#].*[[:space:]]swap[[:space:]].*)$/#\1/' /etc/fstab

# Verify swap is disabled (Swap line should show 0B)
free -h | grep -i swap

内核模块和网络配置

bash
# Persist the kernel modules Kubernetes networking depends on so they are
# loaded again after every reboot.
cat > /etc/modules-load.d/kubernetes.conf << 'EOF'
overlay
br_netfilter
EOF

# Load the same modules immediately for the current boot.
for kmod in overlay br_netfilter; do
  modprobe "$kmod"
done

# Bridge/forwarding settings required by kube-proxy and CNI plugins,
# plus a few generic performance tunables.
cat > /etc/sysctl.d/99-kubernetes.conf << 'EOF'
# Kubernetes 网络配置
net.bridge.bridge-nf-call-iptables  = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward                 = 1

# 性能优化
vm.max_map_count = 262144
fs.file-max = 2097152
net.core.somaxconn = 32768
net.ipv4.tcp_max_syn_backlog = 8192
EOF

# Reload every sysctl fragment, including the file written above.
sysctl --system

1.2 容器运行时安装

安装和配置 containerd

bash
# Add the upstream Docker repository (it ships the containerd.io package)
dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo

# Install containerd
dnf install -y containerd.io

mkdir -p /etc/containerd

# Start from the full default config and patch it. The original script
# generated this file and then immediately overwrote it with a minimal
# hand-written config.toml, discarding the defaults — the generate+sed
# steps were dead code and required plugin defaults were lost.
containerd config default > /etc/containerd/config.toml

# kubelet uses the systemd cgroup driver; containerd's runc runtime must match.
sed -i 's/SystemdCgroup = false/SystemdCgroup = true/' /etc/containerd/config.toml

# Pull the pause (sandbox) image from a mirror reachable from mainland China.
sed -i 's|registry.k8s.io/pause:[0-9.]*|registry.aliyuncs.com/google_containers/pause:3.9|' /etc/containerd/config.toml

# Registry mirrors: the inline `registry.mirrors` TOML table is deprecated
# in containerd 1.7 — use the config_path / hosts.toml mechanism instead.
sed -i 's|config_path = ""|config_path = "/etc/containerd/certs.d"|' /etc/containerd/config.toml

mkdir -p /etc/containerd/certs.d/docker.io /etc/containerd/certs.d/registry.k8s.io

cat > /etc/containerd/certs.d/docker.io/hosts.toml << 'EOF'
server = "https://docker.io"
[host."https://docker.mirrors.ustc.edu.cn"]
  capabilities = ["pull", "resolve"]
EOF

cat > /etc/containerd/certs.d/registry.k8s.io/hosts.toml << 'EOF'
server = "https://registry.k8s.io"
[host."https://registry.aliyuncs.com/google_containers"]
  capabilities = ["pull", "resolve"]
EOF

# Start containerd and verify it is running
systemctl enable --now containerd
systemctl status containerd

第二阶段:Kubernetes 组件安装

2.1 安装 kubeadm、kubelet、kubectl

配置 Kubernetes 软件源

bash
# Kubernetes package repository. The legacy yum repos
# (…/kubernetes-el7-x86_64, yum-key.gpg) were deprecated in 2023 and have
# been removed — use the community-owned pkgs.k8s.io layout (here via the
# Aliyun mirror). Note the repo is versioned per minor release: bump the
# v1.28 path when upgrading the cluster.
cat > /etc/yum.repos.d/kubernetes.repo << 'EOF'
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.28/rpm/
enabled=1
gpgcheck=1
gpgkey=https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.28/rpm/repodata/repomd.xml.key
exclude=kubelet kubeadm kubectl cri-tools kubernetes-cni
EOF

# Install the components. --disableexcludes bypasses the exclude= line
# above for this one transaction only.
dnf install -y kubelet kubeadm kubectl --disableexcludes=kubernetes

# Version pinning is handled by the `exclude=` line in the repo file, so a
# plain `dnf update` never upgrades these packages accidentally.
# (The original `dnf mark kubelet kubeadm kubectl` is invalid syntax —
# `dnf mark` needs a subcommand — and would not have locked versions anyway.)

# Enable kubelet. It will crash-loop until `kubeadm init`/`join` runs;
# that is expected.
systemctl enable kubelet

2.2 Master 节点初始化

集群初始化配置

bash
# Cluster initialisation config for kubeadm.
# NOTE: the original file placed advertiseAddress/bindPort under
# ClusterConfiguration.apiServer — those are not valid fields there
# (kubeadm's strict decoding rejects them); they belong only in
# InitConfiguration.localAPIEndpoint below. The kubeletExtraArgs
# (cgroup-driver, container-runtime-endpoint) were also dropped: the
# KubeletConfiguration document already sets both.
cat > /root/kubeadm-config.yaml << 'EOF'
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: v1.28.0
clusterName: almalinux-k8s
controlPlaneEndpoint: "k8s-master01:6443"
imageRepository: registry.aliyuncs.com/google_containers
networking:
  serviceSubnet: "10.96.0.0/12"
  podSubnet: "10.244.0.0/16"
  dnsDomain: "cluster.local"
etcd:
  local:
    dataDir: "/var/lib/etcd"
controllerManager: {}
scheduler: {}
---
apiVersion: kubeadm.k8s.io/v1beta3
kind: InitConfiguration
localAPIEndpoint:
  advertiseAddress: "192.168.100.10"  # 修改为实际 Master IP
  bindPort: 6443
nodeRegistration:
  criSocket: unix:///var/run/containerd/containerd.sock
---
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
cgroupDriver: systemd
containerRuntimeEndpoint: unix:///var/run/containerd/containerd.sock
EOF

# Pre-pull control-plane images so init is faster and pull failures
# surface early.
kubeadm config images pull --config=/root/kubeadm-config.yaml

# Initialise the cluster
kubeadm init --config=/root/kubeadm-config.yaml

# Configure kubectl for the current user (quote expansions — $HOME or the
# id output could in principle contain spaces)
mkdir -p "$HOME/.kube"
cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config"
chown "$(id -u):$(id -g)" "$HOME/.kube/config"

# Verify the control plane came up
kubectl get nodes
kubectl get pods -n kube-system

2.3 网络插件安装

安装 Calico 网络插件 (推荐)

bash
# Download a pinned Calico manifest. docs.projectcalico.org no longer
# serves manifests — fetch from the project's GitHub release instead, and
# pin the version for reproducibility.
curl -fsSL -o calico.yaml https://raw.githubusercontent.com/projectcalico/calico/v3.26.1/manifests/calico.yaml

# Uncomment CALICO_IPV4POOL_CIDR and set it to the cluster's podSubnet.
# The replacements must preserve YAML indentation: the original seds added
# two extra leading spaces, mis-aligning the env list items relative to
# their siblings and producing an invalid manifest.
sed -i 's|# - name: CALICO_IPV4POOL_CIDR|- name: CALICO_IPV4POOL_CIDR|' calico.yaml
sed -i 's|#   value: "192.168.0.0/16"|  value: "10.244.0.0/16"|' calico.yaml

# Apply Calico
kubectl apply -f calico.yaml

# Wait for every calico-node pod to become Ready
kubectl wait --for=condition=Ready pods -l k8s-app=calico-node -n kube-system --timeout=300s

# Verify the CNI pods
kubectl get pods -n kube-system | grep calico

第三阶段:Worker 节点加入

3.1 Worker 节点配置

获取加入令牌

bash
# Run on the MASTER node: prints a ready-to-use `kubeadm join` command
# containing a fresh bootstrap token and the discovery CA cert hash.
# Tokens expire after 24h by default — re-run this if a worker joins later.
kubeadm token create --print-join-command

# Example output:
# kubeadm join k8s-master01:6443 --token abcdef.1234567890abcdef \
#     --discovery-token-ca-cert-hash sha256:1234567890abcdef...

Worker 节点加入集群

bash
# Run on EACH worker node — substitute the actual join command printed by
# `kubeadm token create --print-join-command` on the master.
kubeadm join k8s-master01:6443 --token abcdef.1234567890abcdef \
    --discovery-token-ca-cert-hash sha256:1234567890abcdef...

# Back on the master: confirm every worker registered and shows its IP/OS
kubectl get nodes -o wide

3.2 集群验证和测试

部署测试应用

bash
# Smoke-test manifest: a 3-replica nginx Deployment (with resource
# requests/limits) fronted by a ClusterIP Service.
cat > /tmp/nginx-test.yaml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-test
  labels:
    app: nginx-test
spec:
  replicas: 3
  selector:
    matchLabels:
      app: nginx-test
  template:
    metadata:
      labels:
        app: nginx-test
    spec:
      containers:
      - name: nginx
        image: nginx:1.20
        ports:
        - containerPort: 80
        resources:
          requests:
            memory: "64Mi"
            cpu: "100m"
          limits:
            memory: "128Mi"
            cpu: "200m"
---
apiVersion: v1
kind: Service
metadata:
  name: nginx-test-service
spec:
  selector:
    app: nginx-test
  ports:
    - protocol: TCP
      port: 80
      targetPort: 80
  type: ClusterIP
EOF

# Deploy the test application
kubectl apply -f /tmp/nginx-test.yaml

# Verify pods and service
kubectl get pods -l app=nginx-test
kubectl get services nginx-test-service

# In-cluster connectivity check. --rm deletes the curl pod automatically
# when it exits.
kubectl run curl-test --image=curlimages/curl --rm -it --restart=Never -- \
  curl http://nginx-test-service.default.svc.cluster.local

# Clean up. (The original also ran `kubectl delete pod curl-test`, which
# always errors: --rm above has already removed that pod.)
kubectl delete -f /tmp/nginx-test.yaml

第四阶段:生产级别配置

4.1 存储解决方案

部署 Longhorn 分布式存储

bash
# Longhorn requires the iSCSI initiator on every node
dnf install -y iscsi-initiator-utils

# Start the iSCSI daemon now and on boot
systemctl enable --now iscsid

# Install Longhorn (pinned release)
kubectl apply -f https://raw.githubusercontent.com/longhorn/longhorn/v1.5.1/deploy/longhorn.yaml

# Wait for the Longhorn pods. `--all` is required: `kubectl wait pods`
# with no pod name, label selector or --all exits with an error instead
# of waiting, so the original command never worked.
kubectl wait --for=condition=Ready pods --all -n longhorn-system --timeout=600s

# Optional: expose the Longhorn UI on localhost:8080
kubectl port-forward -n longhorn-system svc/longhorn-frontend 8080:80 &

# Default StorageClass backed by Longhorn: 2 replicas per volume,
# Retain reclaim policy, expandable.
cat > /tmp/longhorn-storageclass.yaml << 'EOF'
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: longhorn-fast
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
provisioner: driver.longhorn.io
allowVolumeExpansion: true
reclaimPolicy: Retain
volumeBindingMode: Immediate
parameters:
  numberOfReplicas: "2"
  staleReplicaTimeout: "2880"
  fromBackup: ""
  fsType: "ext4"
EOF

kubectl apply -f /tmp/longhorn-storageclass.yaml

4.2 监控系统部署

安装 Prometheus + Grafana

bash
# Install Helm 3 if not already present. (The original comment mislabeled
# this step as "add the Prometheus repo".) Download the installer to a
# file first instead of piping curl straight into bash, so it can be
# inspected and the download failure is not half-executed.
if ! command -v helm >/dev/null 2>&1; then
  curl -fsSL -o /tmp/get-helm-3.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
  bash /tmp/get-helm-3.sh
fi

# Add the prometheus-community chart repository
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update

# Create the monitoring namespace
kubectl create namespace monitoring

# Install kube-prometheus-stack (Prometheus + Alertmanager + Grafana).
# NOTE(review): the Grafana admin password is a secret — in production,
# pass it via --set-file / values file rather than on the command line.
helm install prometheus prometheus-community/kube-prometheus-stack \
  --namespace monitoring \
  --set grafana.adminPassword=admin123 \
  --set prometheus.prometheusSpec.retention=30d \
  --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.storageClassName=longhorn-fast \
  --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi

# Verify the monitoring components
kubectl get pods -n monitoring
kubectl get svc -n monitoring

# Access Grafana locally via port-forward
kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 &
# 浏览器访问: http://localhost:3000 (admin/admin123)

4.3 日志收集系统

部署 ELK Stack

bash
# Single-node Elasticsearch (StatefulSet + Service) used as the log sink
# for the Fluent Bit collector installed in the next step. Storage comes
# from the longhorn-fast StorageClass created earlier.
# NOTE(review): kube-system is an unusual namespace for a workload like
# this — a dedicated "logging" namespace would be cleaner; verify before
# reusing in production.
cat > /tmp/elasticsearch.yaml << 'EOF'
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: elasticsearch
  namespace: kube-system
spec:
  serviceName: elasticsearch
  replicas: 1
  selector:
    matchLabels:
      app: elasticsearch
  template:
    metadata:
      labels:
        app: elasticsearch
    spec:
      containers:
      - name: elasticsearch
        image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
        env:
        - name: discovery.type
          value: single-node
        - name: ES_JAVA_OPTS
          value: "-Xms512m -Xmx512m"
        ports:
        - containerPort: 9200
        volumeMounts:
        - name: elasticsearch-storage
          mountPath: /usr/share/elasticsearch/data
        resources:
          requests:
            memory: "1Gi"
            cpu: "500m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
  volumeClaimTemplates:
  - metadata:
      name: elasticsearch-storage
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: longhorn-fast
      resources:
        requests:
          storage: 20Gi
---
apiVersion: v1
kind: Service
metadata:
  name: elasticsearch
  namespace: kube-system
spec:
  selector:
    app: elasticsearch
  ports:
  - port: 9200
    targetPort: 9200
EOF

kubectl apply -f /tmp/elasticsearch.yaml

# Install the Fluent Bit log collector. The output config must be passed
# through a values file: `helm --set` does not interpret "\n" escapes, so
# the original command handed the chart a literal backslash-n string and
# the generated fluent-bit.conf was broken.
helm repo add fluent https://fluent.github.io/helm-charts

cat > /tmp/fluent-bit-values.yaml << 'EOF'
config:
  outputs: |
    [OUTPUT]
        Name es
        Match *
        Host elasticsearch.kube-system.svc.cluster.local
        Port 9200
        Index kubernetes
        Type _doc
EOF

helm install fluent-bit fluent/fluent-bit \
  --namespace kube-system \
  -f /tmp/fluent-bit-values.yaml

第五阶段:高可用和安全配置

5.1 Master 节点高可用

配置多 Master 集群

bash
# Upload control-plane certificates ONCE and capture the certificate key.
# The original ran `upload-certs` twice (once standalone, once nested in
# the token command), uploading two different keys — the key it printed
# did not necessarily match the one stored in the cluster.
CERT_KEY=$(kubeadm init phase upload-certs --upload-certs 2>/dev/null | tail -1)

# Print the full join command for an additional control-plane node
kubeadm token create --print-join-command --certificate-key "$CERT_KEY"

# On the NEW master node, run the printed command (values substituted):
kubeadm join k8s-master01:6443 --token abcdef.1234567890abcdef \
    --discovery-token-ca-cert-hash sha256:1234567890abcdef... \
    --control-plane --certificate-key 1234567890abcdef...

# Load balancer for the API server (HAProxy + Keepalived)
dnf install -y haproxy keepalived

# HAProxy config. NOTE: when HAProxy runs ON the master nodes it cannot
# bind :6443 — kube-apiserver already listens there, so the original
# `bind *:6443` fails with "Address already in use". Bind :16443 and use
# <VIP>:16443 as the controlPlaneEndpoint instead (on dedicated LB nodes,
# :6443 is fine).
cat > /etc/haproxy/haproxy.cfg << 'EOF'
global
    log stdout local0
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660 level admin
    stats timeout 30s
    user haproxy
    group haproxy
    daemon

defaults
    mode tcp
    log global
    option tcplog
    option dontlognull
    option redispatch
    retries 3
    timeout queue 1m
    timeout connect 10s
    timeout client 1m
    timeout server 1m
    timeout check 10s
    maxconn 3000

frontend k8s-api
    bind *:16443
    default_backend k8s-masters

backend k8s-masters
    balance roundrobin
    server master1 192.168.100.10:6443 check
    server master2 192.168.100.11:6443 check
    server master3 192.168.100.12:6443 check
EOF

systemctl enable --now haproxy
# TODO: keepalived is installed above but never configured — write
# /etc/keepalived/keepalived.conf (VRRP instance + virtual IP) and
# `systemctl enable --now keepalived` to actually get a floating VIP.

5.2 集群安全加固

RBAC 权限控制

bash
# Create a cluster-wide read-only identity:
#  - ServiceAccount "readonly-user" in the default namespace
#  - ClusterRole "readonly" granting get/list/watch on all resources in
#    the core ("") and "apps" API groups
#  - ClusterRoleBinding tying the two together
cat > /tmp/readonly-user.yaml << 'EOF'
apiVersion: v1
kind: ServiceAccount
metadata:
  name: readonly-user
  namespace: default
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: readonly
rules:
- apiGroups: [""]
  resources: ["*"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
  resources: ["*"]
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: readonly-user
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: readonly
subjects:
- kind: ServiceAccount
  name: readonly-user
  namespace: default
EOF

kubectl apply -f /tmp/readonly-user.yaml

Pod 安全策略

bash
# Enforce the "restricted" Pod Security Standard on a namespace via the
# built-in pod-security.kubernetes.io labels: enforce rejects violating
# pods, audit records violations, warn surfaces them to clients.
cat > /tmp/pod-security-policy.yaml << 'EOF'
apiVersion: v1
kind: Namespace
metadata:
  name: secure-namespace
  labels:
    pod-security.kubernetes.io/enforce: restricted
    pod-security.kubernetes.io/audit: restricted
    pod-security.kubernetes.io/warn: restricted
EOF

kubectl apply -f /tmp/pod-security-policy.yaml

故障排查和维护

常见问题解决

集群状态检查

bash
# Install a cluster health-check helper to /usr/local/bin and run it once.
# The generated script prints node status, system pods, the CNI pods,
# StorageClasses/PVs, core services, and (when metrics-server is
# installed) node/pod resource usage.
cat > /usr/local/bin/k8s-health-check.sh << 'EOF'
#!/bin/bash

echo "=== Kubernetes 集群健康检查 ==="

# 检查节点状态
echo "1. 节点状态:"
kubectl get nodes

# 检查系统 Pod
echo -e "\n2. 系统 Pod 状态:"
kubectl get pods -n kube-system

# 检查网络插件
echo -e "\n3. 网络插件状态:"
kubectl get pods -n kube-system | grep -E "(calico|flannel|cilium)"

# 检查存储状态
echo -e "\n4. 存储状态:"
kubectl get storageclass
kubectl get pv

# 检查服务状态
echo -e "\n5. 重要服务状态:"
kubectl get svc -A | grep -E "(kubernetes|kube-dns|coredns)"

# 检查资源使用
echo -e "\n6. 资源使用情况:"
kubectl top nodes 2>/dev/null || echo "Metrics server 未安装"
kubectl top pods -A 2>/dev/null || echo "Metrics server 未安装"

echo -e "\n=== 健康检查完成 ==="
EOF

chmod +x /usr/local/bin/k8s-health-check.sh
/usr/local/bin/k8s-health-check.sh

集群备份和恢复

bash
# Install an etcd backup script (run on a control-plane node; it uses the
# kubeadm-generated healthcheck client certificate to authenticate).
cat > /usr/local/bin/etcd-backup.sh << 'EOF'
#!/bin/bash
set -euo pipefail

BACKUP_DIR="/backup/etcd/$(date +%Y%m%d_%H%M%S)"
mkdir -p "$BACKUP_DIR"

# Snapshot etcd through the local client port
ETCDCTL_API=3 etcdctl --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key \
  snapshot save "$BACKUP_DIR/etcd-snapshot.db"

# Also keep a copy of the static manifests, PKI and kubeconfigs
cp -r /etc/kubernetes "$BACKUP_DIR/"

echo "etcd 备份完成: $BACKUP_DIR"
EOF

chmod +x /usr/local/bin/etcd-backup.sh

# Schedule a daily 02:00 backup by APPENDING to the current crontab.
# The original `echo "..." | crontab -` REPLACES the entire crontab,
# silently wiping any existing cron entries on the node.
( crontab -l 2>/dev/null; echo "0 2 * * * /usr/local/bin/etcd-backup.sh" ) | crontab -

总结: 本指南提供了在 AlmaLinux 10 上部署生产级 Kubernetes 集群的完整流程,包括容器运行时配置、网络插件安装、存储解决方案、监控系统和安全加固。通过遵循这些最佳实践,可以构建一个稳定、可扩展的 Kubernetes 平台。

相关文档:

基于 MIT 许可发布