跳过正文
  1. 博客文章/

CoreDNS 完整部署与管理指南

·4292 字·21 分钟·
DNS 网络服务 系统管理 Coredns Dns Systemd 网络服务 服务器管理 容器化
Zayn
作者
Zayn
专注 Kubernetes、CI/CD、可观测性等云原生技术栈,记录生产环境中的实战经验与踩坑复盘。
目录

CoreDNS 是现代化的 DNS 服务器,以其插件化架构、高性能和易配置性成为 Kubernetes 默认 DNS 解决方案。本指南将从基础部署到企业级应用,全面介绍 CoreDNS 的部署、配置、优化和管理。

CoreDNS 概述
#

什么是 CoreDNS
#

CoreDNS 是用 Go 语言编写的现代化 DNS 服务器,具有以下核心特性:

核心优势
#

  • 插件化架构: 基于 Caddy 框架,支持丰富的插件生态
  • 云原生设计: Kubernetes 1.13+ 默认 DNS 服务器
  • 高性能: Go 语言编写,支持高并发处理
  • 配置简单: 使用 Corefile DSL 语法,易于理解和维护
  • 服务发现: 支持多种服务发现机制
  • 可观测性: 内置监控和日志功能

应用场景
#

graph TD
    A["CoreDNS 应用场景"] --> B["企业内网 DNS"]
    A --> C["Kubernetes 集群 DNS"]
    A --> D["服务发现"]
    A --> E["负载均衡"]
    A --> F["DNS 代理/转发"]
    A --> G["广告拦截"]
    A --> H["安全过滤"]

    B --> B1["内网域名解析"]
    B --> B2["主机名管理"]
    B --> B3["服务注册"]

    C --> C1["Pod 域名解析"]
    C --> C2["Service 发现"]
    C --> C3["跨命名空间通信"]

    D --> D1["微服务注册"]
    D --> D2["动态配置"]
    D --> D3["健康检查"]

插件生态系统
#

CoreDNS 的强大之处在于其丰富的插件系统:

核心插件
#

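| 插件类别 | 插件名称 | 功能描述 |
| --- | --- | --- |
| 基础功能 | hosts | 静态主机记录 |
| | file | 区域文件解析 |
| | forward | DNS 转发 |
| | cache | DNS 缓存 |
| 服务发现 | kubernetes | K8s 服务发现 |
| | etcd | etcd 服务发现 |
| | consul | Consul 服务发现 |
| 负载均衡 | loadbalance | 负载均衡 |
| | health | 健康检查 |
| 监控日志 | log | 访问日志 |
| | metrics | Prometheus 指标(Corefile 中的指令名为 prometheus) |
| | trace | 链路追踪 |
| 安全功能 | acl | 访问控制 |
| | dnssec | DNSSEC 支持 |
| | blocklist | 域名黑名单 |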

环境准备
#

系统要求
#

# 支持的操作系统
- Linux (推荐 Ubuntu 20.04+, CentOS 8+)
- Windows Server 2019+
- macOS 10.15+

# 硬件要求
- CPU: 1 核心以上
- 内存: 512MB 以上(推荐 2GB+)
- 磁盘: 100MB 以上
- 网络: 支持 UDP/TCP 53 端口

# 依赖软件
- systemd (Linux 服务管理)
- Docker (容器化部署)
- Kubernetes (集群部署)

网络规划
#

# 示例环境配置
主服务器: 192.168.1.10 (主 DNS)
备用服务器: 192.168.1.11 (备 DNS)
管理网段: 192.168.1.0/24
服务网段: 10.0.0.0/16

二进制部署方式
#

自动化安装脚本
#

#!/bin/bash
# CoreDNS 自动化安装脚本

set -euo pipefail

# 配置变量
COREDNS_VERSION="1.11.1"
INSTALL_DIR="/usr/local/bin"
CONFIG_DIR="/etc/coredns"
DATA_DIR="/var/lib/coredns"
LOG_DIR="/var/log/coredns"
USER="coredns"
GROUP="coredns"

# 颜色输出
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Print a green [INFO] message to stdout.
# Arguments: $1 - message text
log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

# Print a yellow [WARN] message to stderr.
# Diagnostics go to stderr so they are never captured by callers that read
# a helper's stdout through command substitution.
# Arguments: $1 - message text
log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1" >&2
}

# Print a red [ERROR] message to stderr (see log_warn for the reason).
# Arguments: $1 - message text
log_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}

# Detect the CPU architecture and print the matching release-asset suffix.
# Outputs: "amd64", "arm64" or "arm" on stdout.
# Exits:   1 for any other `uname -m` value. Callers run this inside $(...),
#          so the error text is forced to stderr; otherwise it would be
#          captured as if it were the architecture name.
detect_arch() {
    local arch
    arch=$(uname -m)
    case "$arch" in
        x86_64)
            echo "amd64"
            ;;
        aarch64|arm64)
            echo "arm64"
            ;;
        armv7l)
            echo "arm"
            ;;
        *)
            log_error "Unsupported architecture: $arch" >&2
            exit 1
            ;;
    esac
}

# Detect the operating system and print the matching release-asset name.
# Globals: OSTYPE (read)
# Outputs: "linux" or "darwin" on stdout.
# Exits:   1 for any other OSTYPE; the message goes to stderr because
#          callers capture stdout through command substitution.
detect_os() {
    case "${OSTYPE:-}" in
        # linux* also covers non-glibc values such as linux-musl.
        linux*)
            echo "linux"
            ;;
        darwin*)
            echo "darwin"
            ;;
        *)
            log_error "Unsupported OS: ${OSTYPE:-unknown}" >&2
            exit 1
            ;;
    esac
}

# Download the CoreDNS release archive and install the binary.
# Globals: COREDNS_VERSION, INSTALL_DIR (read)
# Exits:   1 when OS/arch detection, the download, extraction or the
#          install step fails. The temporary directory is removed on every
#          path.
download_coredns() {
    local os arch download_url temp_dir archive
    local status=0

    # Assign separately from `local`: `local x=$(cmd)` always succeeds and
    # would hide a failing detector.
    os=$(detect_os) || exit 1
    arch=$(detect_arch) || exit 1
    download_url="https://github.com/coredns/coredns/releases/download/v${COREDNS_VERSION}/coredns_${COREDNS_VERSION}_${os}_${arch}.tgz"

    log_info "Downloading CoreDNS v${COREDNS_VERSION} for ${os}/${arch}..."

    temp_dir=$(mktemp -d) || {
        log_error "Failed to create temporary directory"
        exit 1
    }
    archive="$temp_dir/coredns.tgz"

    if command -v wget >/dev/null 2>&1; then
        wget -q "$download_url" -O "$archive" || status=$?
    elif command -v curl >/dev/null 2>&1; then
        # -f makes curl fail on HTTP errors instead of saving the error page.
        curl -fsSL "$download_url" -o "$archive" || status=$?
    else
        rm -rf -- "$temp_dir"
        log_error "Neither wget nor curl found. Please install one of them."
        exit 1
    fi

    if [[ $status -eq 0 ]]; then
        tar -xzf "$archive" -C "$temp_dir" || status=$?
    fi

    if [[ $status -eq 0 ]]; then
        # install(1) run through sudo leaves the binary root-owned and 0755,
        # rather than owned by whoever ran the script.
        sudo install -m 0755 "$temp_dir/coredns" "$INSTALL_DIR/coredns" || status=$?
    fi

    rm -rf -- "$temp_dir"

    if [[ $status -ne 0 ]]; then
        log_error "Failed to download or install CoreDNS from $download_url"
        exit 1
    fi

    log_info "CoreDNS binary installed to $INSTALL_DIR/coredns"
}

# Create the service group, service user and the config/data/log directories.
# Globals: USER, GROUP, CONFIG_DIR, DATA_DIR, LOG_DIR (read)
setup_user_and_dirs() {
    log_info "Creating user and directories..."

    # Create the group explicitly: whether useradd also creates a same-named
    # group depends on the distribution's defaults, and the chown below
    # fails if the group is missing.
    if ! getent group "$GROUP" >/dev/null 2>&1; then
        sudo groupadd --system "$GROUP"
        log_info "Created group: $GROUP"
    fi

    if ! id "$USER" >/dev/null 2>&1; then
        sudo useradd --system --no-create-home --gid "$GROUP" \
            --shell /sbin/nologin "$USER"
        log_info "Created user: $USER"
    fi

    sudo mkdir -p "$CONFIG_DIR" "$DATA_DIR" "$LOG_DIR"
    sudo chown "$USER:$GROUP" "$CONFIG_DIR" "$DATA_DIR" "$LOG_DIR"
    sudo chmod 755 "$CONFIG_DIR" "$DATA_DIR" "$LOG_DIR"

    log_info "Created directories: $CONFIG_DIR, $DATA_DIR, $LOG_DIR"
}

# 生成配置文件
generate_config() {
    log_info "Generating CoreDNS configuration..."

    sudo tee "$CONFIG_DIR/Corefile" > /dev/null << 'EOF'
# CoreDNS 主配置文件
# 监听所有接口的 53 端口
.:53 {
    # 绑定地址
    bind 0.0.0.0

    # 静态主机记录
    hosts {
        # 自定义主机记录
        ttl 60
        reload 1m
        fallthrough
    }

    # DNS 转发
    forward . 8.8.8.8 8.8.4.4 1.1.1.1 {
        max_fails 3
        expire 10s
        health_check 5s
        policy sequential
    }

    # DNS 缓存
    cache {
        success 65536 3600 300
        denial 8192 600 60
        prefetch 1 60m 10%
    }

    # 自动重载配置
    reload 6s

    # 负载均衡
    loadbalance round_robin

    # 日志记录
    log {
        class error
    }

    # 错误处理
    errors

    # 健康检查
    health :8080

    # Prometheus 监控
    prometheus :9153
}

# 内网域名解析
local.lan:53 {
    bind 0.0.0.0

    file /etc/coredns/zones/local.lan.zone

    log {
        class all
    }

    errors
}
EOF

    sudo chown "$USER:$GROUP" "$CONFIG_DIR/Corefile"
    log_info "Generated Corefile at $CONFIG_DIR/Corefile"
}

# 创建区域文件
create_zone_files() {
    log_info "Creating zone files..."

    sudo mkdir -p "$CONFIG_DIR/zones"

    # 创建本地域名区域文件
    sudo tee "$CONFIG_DIR/zones/local.lan.zone" > /dev/null << 'EOF'
$ORIGIN local.lan.
$TTL 300

@   IN  SOA ns1.local.lan. admin.local.lan. (
    2023120101  ; Serial
    3600        ; Refresh
    1800        ; Retry
    604800      ; Expire
    300         ; Minimum TTL
)

@           IN  NS      ns1.local.lan.
ns1         IN  A       192.168.1.10

; 示例主机记录
router      IN  A       192.168.1.1
server1     IN  A       192.168.1.10
server2     IN  A       192.168.1.11
EOF

    sudo chown -R "$USER:$GROUP" "$CONFIG_DIR/zones"
    log_info "Created zone files in $CONFIG_DIR/zones/"
}

# Entry point: verify sudo access, then run every installation step in order.
# Exits: 1 when the caller is neither root nor able to use sudo without a
#        password prompt.
main() {
    log_info "Starting CoreDNS installation..."

    # Every privileged step goes through sudo, so fail early if it cannot
    # be used non-interactively.
    if [[ $EUID -ne 0 ]] && ! sudo -n true 2>/dev/null; then
        log_error "This script requires sudo privileges"
        exit 1
    fi

    download_coredns
    setup_user_and_dirs
    generate_config
    create_zone_files

    log_info "CoreDNS installation completed!"
    log_info "Next steps:"
    log_info "1. Review configuration: $CONFIG_DIR/Corefile"
    # This script does not install a unit file, so `systemctl enable` alone
    # would fail; point the user at the missing step.
    log_info "2. Create /etc/systemd/system/coredns.service, then run: sudo systemctl daemon-reload && sudo systemctl enable coredns"
    log_info "3. Start service: sudo systemctl start coredns"
}

# 执行主函数
main "$@"

手动安装步骤
#

1. 下载和安装二进制文件
#

# 设置版本和架构
COREDNS_VERSION="1.11.1"
ARCH=$(uname -m)

# 根据架构设置下载链接
case $ARCH in
    x86_64)
        DOWNLOAD_ARCH="amd64"
        ;;
    aarch64|arm64)
        DOWNLOAD_ARCH="arm64"
        ;;
    armv7l)
        DOWNLOAD_ARCH="arm"
        ;;
    *)
        echo "Unsupported architecture: $ARCH"
        exit 1
        ;;
esac

# 下载 CoreDNS
wget "https://github.com/coredns/coredns/releases/download/v${COREDNS_VERSION}/coredns_${COREDNS_VERSION}_linux_${DOWNLOAD_ARCH}.tgz"

# 解压和安装
tar -xzf "coredns_${COREDNS_VERSION}_linux_${DOWNLOAD_ARCH}.tgz"
sudo mv coredns /usr/local/bin/
sudo chmod +x /usr/local/bin/coredns

# 验证安装
/usr/local/bin/coredns -version

2. 创建用户和目录结构
#

# 创建系统用户
sudo useradd --system --no-create-home --shell /sbin/nologin coredns

# 创建目录结构
sudo mkdir -p /etc/coredns/{zones,conf.d}
sudo mkdir -p /var/lib/coredns
sudo mkdir -p /var/log/coredns

# 设置权限
sudo chown -R coredns:coredns /etc/coredns /var/lib/coredns /var/log/coredns
sudo chmod 755 /etc/coredns /var/lib/coredns /var/log/coredns

3. 高级配置文件
#

# Write the main Corefile.
# The CoreDNS `log` plugin only writes to standard output and accepts no
# `file` property; a `file ...` line inside a log block is a configuration
# error. Query logs therefore end up in the journal when CoreDNS runs
# under systemd.
sudo tee /etc/coredns/Corefile > /dev/null << 'EOF'
# CoreDNS 主配置文件
# 全局配置块
.:53 {
    # 绑定地址和端口
    bind 0.0.0.0

    # 静态主机记录
    hosts /etc/coredns/hosts {
        ttl 60
        reload 1m
        fallthrough
    }

    # 区域文件
    file /etc/coredns/zones/local.lan.zone local.lan

    # DNS 转发配置
    forward . 8.8.8.8 8.8.4.4 1.1.1.1 {
        max_fails 3
        expire 10s
        health_check 5s
        policy sequential
        prefer_udp
    }

    # 缓存配置
    cache {
        success 65536 3600 300
        denial 8192 600 60
        prefetch 1 60m 10%
        serve_stale
    }

    # 负载均衡
    loadbalance round_robin

    # 自动重载
    reload 6s

    # 日志配置 (log writes to stdout only; read it with journalctl -u coredns)
    log {
        class error
    }

    # 错误处理
    errors {
        consolidate 5m ".* i/o timeout"
        consolidate 30s ".*"
    }

    # 健康检查端点
    health :8080 {
        lameduck 5s
    }

    # Prometheus 监控
    prometheus :9153

    # 访问控制
    acl {
        allow net 192.168.0.0/16
        allow net 10.0.0.0/8
        allow net 172.16.0.0/12
        block net 0.0.0.0/0
    }
}

# 内网域名配置
internal.local:53 {
    bind 0.0.0.0

    file /etc/coredns/zones/internal.local.zone

    log {
        class all
    }

    errors
}

# 反向解析配置
1.168.192.in-addr.arpa:53 {
    bind 0.0.0.0

    file /etc/coredns/zones/192.168.1.rev

    log {
        class all
    }

    errors
}
EOF

# 创建主机记录文件
sudo tee /etc/coredns/hosts > /dev/null << 'EOF'
# 静态主机记录
192.168.1.1     router.local gateway.local
192.168.1.10    dns1.local ns1.local
192.168.1.11    dns2.local ns2.local
192.168.1.20    web.local www.local
192.168.1.21    api.local
192.168.1.22    db.local database.local
EOF

# 设置权限
sudo chown coredns:coredns /etc/coredns/Corefile /etc/coredns/hosts

4. 创建 systemd 服务
#

# 创建现代化的 systemd 服务文件
sudo tee /etc/systemd/system/coredns.service > /dev/null << 'EOF'
[Unit]
Description=CoreDNS DNS Server
Documentation=https://coredns.io/manual/toc/
After=network-online.target
Wants=network-online.target
AssertFileIsExecutable=/usr/local/bin/coredns

[Service]
Type=simple
User=coredns
Group=coredns

# 安全配置
NoNewPrivileges=true
PrivateTmp=true
PrivateDevices=true
ProtectHome=true
ProtectSystem=strict
ReadWritePaths=/var/lib/coredns /var/log/coredns /etc/coredns

# 能力配置
CapabilityBoundingSet=CAP_NET_BIND_SERVICE CAP_SETGID CAP_SETUID
AmbientCapabilities=CAP_NET_BIND_SERVICE CAP_SETGID CAP_SETUID

# 资源限制
LimitNOFILE=1048576
LimitNPROC=1048576
LimitCORE=infinity

# 工作目录
WorkingDirectory=/etc/coredns

# 启动命令
ExecStart=/usr/local/bin/coredns -conf=/etc/coredns/Corefile
ExecReload=/bin/kill -SIGUSR1 $MAINPID

# 重启策略
Restart=on-failure
RestartSec=5
KillMode=mixed
KillSignal=SIGINT

# 日志配置
StandardOutput=journal
StandardError=journal
SyslogIdentifier=coredns

[Install]
WantedBy=multi-user.target
EOF

# 创建服务覆盖目录
sudo mkdir -p /etc/systemd/system/coredns.service.d

# 创建环境变量配置
sudo tee /etc/systemd/system/coredns.service.d/environment.conf > /dev/null << 'EOF'
[Service]
Environment="GOMAXPROCS=2"
Environment="GODEBUG=madvdontneed=1"
EOF

# 重载 systemd 配置
sudo systemctl daemon-reload

5. 日志轮转配置
#

# 创建 logrotate 配置
sudo tee /etc/logrotate.d/coredns > /dev/null << 'EOF'
/var/log/coredns/*.log {
    daily
    missingok
    rotate 30
    compress
    delaycompress
    notifempty
    create 644 coredns coredns
    postrotate
        /bin/systemctl reload coredns.service > /dev/null 2>&1 || true
    endscript
}
EOF

6. 启动和管理服务
#

# 启动服务
sudo systemctl daemon-reload
sudo systemctl enable coredns.service
sudo systemctl start coredns.service

# 检查服务状态
sudo systemctl status coredns.service

# 查看日志
sudo journalctl -u coredns.service -f

# 重载配置
sudo systemctl reload coredns.service

# 重启服务
sudo systemctl restart coredns.service

容器化部署方式
#

Docker 部署
#

基础 Docker 部署
#

# 创建配置目录
mkdir -p ~/coredns/{config,zones,logs}

# Create the Dockerfile.
# The official coredns/coredns image contains no shell or user-management
# tools, so RUN steps such as adduser/chown cannot execute in it. A numeric
# USER is used instead, which needs no /etc/passwd entry.
# NOTE(review): binding port 53 as a non-root UID relies on the container
# runtime allowing unprivileged low ports (or on a file capability on the
# binary) - confirm on the target Docker version.
cat > ~/coredns/Dockerfile << 'EOF'
FROM coredns/coredns:1.11.1

# 添加自定义配置 (paths are relative to the build context ~/coredns)
COPY config/Corefile /etc/coredns/Corefile
COPY zones/ /etc/coredns/zones/

# Run as an unprivileged numeric UID:GID
USER 65532:65532

EXPOSE 53/udp 53/tcp 8080/tcp 9153/tcp

ENTRYPOINT ["/coredns"]
CMD ["-conf", "/etc/coredns/Corefile"]
EOF

# 创建配置文件
cat > ~/coredns/config/Corefile << 'EOF'
.:53 {
    bind 0.0.0.0

    hosts {
        ttl 60
        reload 1m
        fallthrough
    }

    forward . 8.8.8.8 8.8.4.4 {
        max_fails 3
        expire 10s
        health_check 5s
    }

    cache {
        success 65536 3600 300
        denial 8192 600 60
        prefetch 1 60m 10%
    }

    reload 6s
    log
    errors
    health :8080
    prometheus :9153
}
EOF

# 构建镜像
cd ~/coredns
docker build -t custom-coredns:latest .

# 运行容器
docker run -d \
    --name coredns \
    --restart unless-stopped \
    -p 53:53/udp \
    -p 53:53/tcp \
    -p 8080:8080/tcp \
    -p 9153:9153/tcp \
    -v $(pwd)/config:/etc/coredns:ro \
    -v $(pwd)/logs:/var/log/coredns \
    custom-coredns:latest

Docker Compose 部署
#

# docker-compose.yml
version: '3.8'

services:
  coredns:
    image: coredns/coredns:1.11.1
    container_name: coredns
    restart: unless-stopped
    ports:
      - "53:53/udp"
      - "53:53/tcp"
      - "8080:8080/tcp"  # Health check
      - "9153:9153/tcp"  # Metrics
    volumes:
      - ./config/Corefile:/etc/coredns/Corefile:ro
      - ./config/zones:/etc/coredns/zones:ro
      - ./logs:/var/log/coredns
    command: ["-conf", "/etc/coredns/Corefile"]
    networks:
      - dns-network
    # No container-level healthcheck: the coredns image ships only the
    # CoreDNS binary (no shell, curl or wget), so a `curl` test can never
    # succeed and would leave the container permanently "unhealthy".
    # Probe http://<host>:8080/health from outside via the published port.
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"

  # 可选:添加 Prometheus 监控
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    networks:
      - dns-network

  # 可选:添加 Grafana 可视化
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    environment:
      # Do not hard-code the admin password. Compose aborts with this
      # message when GRAFANA_ADMIN_PASSWORD is missing from the shell
      # environment or the .env file.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD}
    volumes:
      - grafana-data:/var/lib/grafana
      - ./monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./monitoring/grafana/datasources:/etc/grafana/provisioning/datasources:ro
    networks:
      - dns-network

networks:
  dns-network:
    driver: bridge

volumes:
  prometheus-data:
  grafana-data:

Kubernetes 部署
#

基础 Kubernetes 部署
#

# coredns-namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: coredns-system
  labels:
    name: coredns-system

---
# coredns-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: coredns-config
  namespace: coredns-system
data:
  # The `ready` plugin is enabled because the Deployment's readinessProbe
  # queries /ready on port 8181, which is the endpoint that plugin serves.
  # Without it the Pods never become Ready.
  Corefile: |
    .:53 {
        bind 0.0.0.0

        kubernetes cluster.local in-addr.arpa ip6.arpa {
            pods insecure
            fallthrough in-addr.arpa ip6.arpa
            ttl 30
        }

        hosts {
            ttl 60
            reload 1m
            fallthrough
        }

        forward . 8.8.8.8 8.8.4.4 {
            max_fails 3
            expire 10s
            health_check 5s
            policy sequential
        }

        cache {
            success 65536 3600 300
            denial 8192 600 60
            prefetch 1 60m 10%
        }

        reload 6s
        log
        errors
        health :8080
        ready :8181
        prometheus :9153
    }

---
# coredns-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: coredns
  namespace: coredns-system
  labels:
    app: coredns
spec:
  replicas: 2
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
      maxSurge: 1
  selector:
    matchLabels:
      app: coredns
  template:
    metadata:
      labels:
        app: coredns
    spec:
      serviceAccountName: coredns
      containers:
      - name: coredns
        image: coredns/coredns:1.11.1
        imagePullPolicy: IfNotPresent
        args: ["-conf", "/etc/coredns/Corefile"]
        ports:
        - containerPort: 53
          name: dns
          protocol: UDP
        - containerPort: 53
          name: dns-tcp
          protocol: TCP
        - containerPort: 8080
          name: health
          protocol: TCP
        - containerPort: 9153
          name: metrics
          protocol: TCP
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 60
          timeoutSeconds: 5
          successThreshold: 1
          failureThreshold: 5
        readinessProbe:
          httpGet:
            path: /ready
            port: 8181
          initialDelaySeconds: 10
          timeoutSeconds: 5
          successThreshold: 1
          failureThreshold: 3
        resources:
          limits:
            memory: 170Mi
            cpu: 100m
          requests:
            memory: 70Mi
            cpu: 50m
        volumeMounts:
        - name: config-volume
          mountPath: /etc/coredns
          readOnly: true
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            add:
            - NET_BIND_SERVICE
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsNonRoot: true
          runAsUser: 1000
      volumes:
      - name: config-volume
        configMap:
          name: coredns-config
          items:
          - key: Corefile
            path: Corefile
      dnsPolicy: Default

---
# coredns-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: coredns
  namespace: coredns-system
  labels:
    app: coredns
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "9153"
spec:
  type: ClusterIP
  clusterIP: 10.96.0.10  # 固定 ClusterIP
  ports:
  - name: dns
    port: 53
    targetPort: 53
    protocol: UDP
  - name: dns-tcp
    port: 53
    targetPort: 53
    protocol: TCP
  - name: metrics
    port: 9153
    targetPort: 9153
    protocol: TCP
  selector:
    app: coredns

---
# coredns-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: coredns
  namespace: coredns-system

---
# coredns-clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: coredns
rules:
- apiGroups:
  - ""
  resources:
  - endpoints
  - services
  - pods
  - namespaces
  verbs:
  - list
  - watch
- apiGroups:
  - discovery.k8s.io
  resources:
  - endpointslices
  verbs:
  - list
  - watch

---
# coredns-clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: coredns
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: coredns
subjects:
- kind: ServiceAccount
  name: coredns
  namespace: coredns-system

高可用配置
#

主从架构部署
#

主服务器配置 (192.168.1.10)
#

# Primary server Corefile.
# Outgoing zone transfers are configured only through the `transfer`
# plugin: the `transfer to` sub-directive of `file` was removed from
# CoreDNS and is a configuration error in current releases. The zone is
# named explicitly so that only local.lan is offered to the secondary.
# Written through `sudo tee` because /etc/coredns is not writable by
# ordinary users.
sudo tee /etc/coredns/Corefile > /dev/null << 'EOF'
.:53 {
    bind 0.0.0.0

    # 区域传输配置 (192.168.1.11 = 从服务器 IP)
    transfer local.lan {
        to 192.168.1.11
    }

    # 主区域文件
    file /etc/coredns/zones/local.lan.zone local.lan

    hosts {
        ttl 60
        reload 1m
        fallthrough
    }

    forward . 8.8.8.8 8.8.4.4 {
        max_fails 3
        expire 10s
        health_check 5s
        policy sequential
    }

    cache {
        success 65536 3600 300
        denial 8192 600 60
        prefetch 1 60m 10%
    }

    reload 6s
    log
    errors
    health :8080
    prometheus :9153
}
EOF

从服务器配置 (192.168.1.11)
#

# Secondary server Corefile.
# `secondary` only pulls the zone from the primary. The former
# `transfer to *` line is dropped: it is no longer accepted by the
# `secondary` plugin and would have offered the zone (AXFR) to any client.
# Written through `sudo tee` because /etc/coredns is not writable by
# ordinary users.
sudo tee /etc/coredns/Corefile > /dev/null << 'EOF'
.:53 {
    bind 0.0.0.0

    # 从区域配置
    secondary local.lan {
        transfer from 192.168.1.10
    }

    hosts {
        ttl 60
        reload 1m
        fallthrough
    }

    forward . 8.8.8.8 8.8.4.4 {
        max_fails 3
        expire 10s
        health_check 5s
        policy sequential
    }

    cache {
        success 65536 3600 300
        denial 8192 600 60
        prefetch 1 60m 10%
    }

    reload 6s
    log
    errors
    health :8080
    prometheus :9153
}
EOF

负载均衡配置
#

HAProxy 配置
#

# 安装 HAProxy
sudo apt update && sudo apt install -y haproxy

# Write the HAProxy configuration.
# - HAProxy proxies TCP only, so this balances DNS-over-TCP; UDP queries
#   need a different balancer or the Keepalived VIP setup.
# - The backend check issues a real HTTP request to the CoreDNS health
#   endpoint. A bare tcp-check "expect" with nothing sent would wait for
#   data an HTTP server never volunteers.
# - The stats admin interface is limited to local connections.
sudo tee /etc/haproxy/haproxy.cfg > /dev/null << 'EOF'
global
    daemon
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660 level admin
    stats timeout 30s
    user haproxy
    group haproxy

defaults
    mode tcp
    timeout connect 5000ms
    timeout client 50000ms
    timeout server 50000ms
    option tcplog

# DNS 负载均衡 (TCP only)
frontend dns_frontend
    bind *:53
    mode tcp
    default_backend dns_backend

backend dns_backend
    mode tcp
    balance roundrobin
    option httpchk GET /health
    http-check expect status 200

    server dns1 192.168.1.10:53 check port 8080 inter 5s rise 2 fall 3
    server dns2 192.168.1.11:53 check port 8080 inter 5s rise 2 fall 3

# 统计页面
frontend stats
    bind *:8404
    mode http
    stats enable
    stats uri /stats
    stats refresh 30s
    stats admin if LOCALHOST
EOF

# 启动 HAProxy
sudo systemctl enable haproxy
sudo systemctl start haproxy

Keepalived 高可用
#

# Keepalived configuration for the MASTER node.
# - weight must exceed the priority gap between MASTER (110) and BACKUP
#   (100). With -2 a failed check only lowers the master to 108, which
#   still wins the election, so the VIP never moves. -20 drops it to 90.
# - VRRP PASS authentication uses at most 8 characters, so the placeholder
#   is kept to 8. Replace it, using the same value on both nodes.
sudo tee /etc/keepalived/keepalived.conf > /dev/null << 'EOF'
vrrp_script chk_coredns {
    script "/usr/local/bin/check_coredns.sh"
    interval 2
    weight -20
    fall 3
    rise 2
}

vrrp_instance VI_1 {
    state MASTER
    interface eth0
    virtual_router_id 51
    priority 110
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass ChangeMe
    }
    virtual_ipaddress {
        192.168.1.100/24
    }
    track_script {
        chk_coredns
    }
}
EOF

# Create the health-check script used by Keepalived.
# Keepalived runs it every 2 seconds, so curl is capped at 1 second: a hung
# CoreDNS must register as a failed check rather than stall the tracker.
sudo tee /usr/local/bin/check_coredns.sh > /dev/null << 'EOF'
#!/bin/bash
curl -fsS --max-time 1 http://localhost:8080/health > /dev/null 2>&1
exit $?
EOF

sudo chmod +x /usr/local/bin/check_coredns.sh

# 从服务器配置(priority 改为 100)
# 启动 Keepalived
sudo systemctl enable keepalived
sudo systemctl start keepalived

监控和告警
#

Prometheus 监控配置
#

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "coredns_rules.yml"

scrape_configs:
  - job_name: 'coredns'
    static_configs:
      - targets: ['192.168.1.10:9153', '192.168.1.11:9153']
    scrape_interval: 15s
    metrics_path: /metrics

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

告警规则配置
#

# coredns_rules.yml
groups:
- name: coredns
  rules:
  - alert: CoreDNSDown
    expr: up{job="coredns"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "CoreDNS instance is down"
      description: "CoreDNS instance {{ $labels.instance }} has been down for more than 5 minutes."

  # Aggregated per instance so the threshold applies to the total query
  # rate, not to each label combination separately.
  - alert: CoreDNSHighQueryRate
    expr: sum by (instance) (rate(coredns_dns_requests_total[5m])) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High DNS query rate"
      description: "CoreDNS instance {{ $labels.instance }} is receiving {{ $value }} queries per second."

  # Both sides are summed per instance. Dividing the raw vectors matches
  # each series with itself (same rcode label), giving a constant 1.
  # Only SERVFAIL counts as an error; NXDOMAIN is a normal answer.
  - alert: CoreDNSHighErrorRate
    expr: |
      sum by (instance) (rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]))
        /
      sum by (instance) (rate(coredns_dns_responses_total[5m])) > 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High DNS error rate"
      description: "CoreDNS instance {{ $labels.instance }} has error rate of {{ $value | humanizePercentage }}."

  # Hits and misses carry different label sets, so they are summed per
  # instance before being combined; otherwise the operands never match.
  - alert: CoreDNSCacheHitRateLow
    expr: |
      sum by (instance) (rate(coredns_cache_hits_total[5m]))
        /
      (
        sum by (instance) (rate(coredns_cache_hits_total[5m]))
        + sum by (instance) (rate(coredns_cache_misses_total[5m]))
      ) < 0.8
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Low DNS cache hit rate"
      description: "CoreDNS instance {{ $labels.instance }} has cache hit rate of {{ $value | humanizePercentage }}."

Grafana 仪表板
#

{
  "dashboard": {
    "id": null,
    "title": "CoreDNS Dashboard",
    "tags": ["coredns", "dns"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "DNS Queries per Second",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(coredns_dns_requests_total[5m])",
            "legendFormat": "{{ instance }}"
          }
        ]
      },
      {
        "id": 2,
        "title": "DNS Response Codes",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(coredns_dns_responses_total[5m])",
            "legendFormat": "{{ rcode }}"
          }
        ]
      },
      {
        "id": 3,
        "title": "Cache Hit Rate",
        "type": "singlestat",
        "targets": [
          {
            "expr": "rate(coredns_cache_hits_total[5m]) / (rate(coredns_cache_hits_total[5m]) + rate(coredns_cache_misses_total[5m]))"
          }
        ]
      }
    ]
  }
}

管理和维护
#

日常管理脚本
#

#!/bin/bash
# CoreDNS 管理脚本

COREDNS_CONFIG="/etc/coredns/Corefile"
HOSTS_FILE="/etc/coredns/hosts"
ZONES_DIR="/etc/coredns/zones"

# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Print a green [INFO] message to stdout.
# Arguments: $1 - message text
log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

# Print a yellow [WARN] message to stderr, keeping diagnostics out of any
# stdout that a caller pipes or captures.
# Arguments: $1 - message text
log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1" >&2
}

# Print a red [ERROR] message to stderr.
# Arguments: $1 - message text
log_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}

# Append a host record to the hosts file and reload CoreDNS.
# Globals:   HOSTS_FILE (read; the file is appended to)
# Arguments: $1 - IPv4 address, $2 - hostname
# Returns:   0 on success; 1 on missing/invalid arguments or when the
#            hostname is already present as an exact, case-insensitive name.
add_host() {
    local ip="${1:-}"
    local hostname="${2:-}"
    local octet

    if [[ -z "$ip" || -z "$hostname" ]]; then
        log_error "Usage: add_host <ip> <hostname>"
        return 1
    fi

    # Check both the dotted-quad shape and that every octet is <= 255.
    if ! [[ "$ip" =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]]; then
        log_error "Invalid IP address format: $ip"
        return 1
    fi
    for octet in "${BASH_REMATCH[@]:1}"; do
        # 10# forces base 10 so "08" is not parsed as invalid octal.
        if (( 10#$octet > 255 )); then
            log_error "Invalid IP address format: $ip"
            return 1
        fi
    done

    # Reject whitespace, '#' and other characters that would corrupt the file.
    if ! [[ "$hostname" =~ ^[A-Za-z0-9_]([A-Za-z0-9_.-]*[A-Za-z0-9_])?$ ]]; then
        log_error "Invalid hostname: $hostname"
        return 1
    fi

    # Exact, case-insensitive match against the name fields only. A regex
    # grep would treat '.' as a wildcard and match substrings of other names.
    if [[ -f "$HOSTS_FILE" ]] && awk -v name="$hostname" '
        BEGIN { name = tolower(name) }
        /^[[:space:]]*#/ { next }
        {
            for (i = 2; i <= NF; i++) {
                if ($i ~ /^#/) break
                if (tolower($i) == name) found = 1
            }
        }
        END { exit !found }
    ' "$HOSTS_FILE"; then
        log_warn "Hostname $hostname already exists"
        return 1
    fi

    # Make sure the new record starts on its own line.
    if [[ -s "$HOSTS_FILE" && -n "$(tail -c 1 -- "$HOSTS_FILE")" ]]; then
        echo >> "$HOSTS_FILE"
    fi

    echo "$ip    $hostname" >> "$HOSTS_FILE"
    log_info "Added host record: $ip -> $hostname"

    reload_config
}

# Remove a hostname from the hosts file and reload CoreDNS.
# Only the exact (case-insensitive) name is removed. Other names on the
# same line are kept, and a line is dropped only when no name remains.
# Comment and blank lines are preserved.
# Globals:   HOSTS_FILE (read; the file is rewritten in place)
# Arguments: $1 - hostname to remove
# Returns:   0 when a record was removed; 1 on a missing argument, an
#            unknown hostname or an I/O failure.
remove_host() {
    local hostname="${1:-}"
    local tmp_file status

    if [[ -z "$hostname" ]]; then
        log_error "Usage: remove_host <hostname>"
        return 1
    fi

    if [[ ! -f "$HOSTS_FILE" ]]; then
        log_error "Failed to remove host record: $hostname"
        return 1
    fi

    tmp_file=$(mktemp) || {
        log_error "Failed to remove host record: $hostname"
        return 1
    }

    # The name is passed with -v rather than interpolated into a sed
    # pattern, so '.', '/' and other metacharacters are plain text.
    # awk exits 0 when something was removed and 3 when nothing matched.
    awk -v name="$hostname" '
        BEGIN { name = tolower(name) }
        /^[[:space:]]*(#|$)/ { print; next }
        {
            out = $1; names = 0; hit = 0; in_comment = 0
            for (i = 2; i <= NF; i++) {
                if (!in_comment && $i ~ /^#/) in_comment = 1
                if (!in_comment && tolower($i) == name) { hit = 1; continue }
                if (!in_comment) names++
                out = out " " $i
            }
            if (!hit) { print; next }
            removed = 1
            if (names > 0) print out
        }
        END { exit (removed ? 0 : 3) }
    ' "$HOSTS_FILE" > "$tmp_file"
    status=$?

    if [[ $status -eq 3 ]]; then
        rm -f -- "$tmp_file"
        log_warn "Hostname $hostname not found"
        return 1
    fi

    # Overwrite in place (not mv) so ownership and permissions are kept.
    if [[ $status -ne 0 ]] || ! cat -- "$tmp_file" > "$HOSTS_FILE"; then
        rm -f -- "$tmp_file"
        log_error "Failed to remove host record: $hostname"
        return 1
    fi
    rm -f -- "$tmp_file"

    log_info "Removed host record: $hostname"
    reload_config
}

# Print every host record, skipping lines that start with '#' and empty
# lines.
# Globals: HOSTS_FILE (read)
list_hosts() {
    log_info "Current host records:"

    if [[ ! -f "$HOSTS_FILE" ]]; then
        log_warn "Hosts file not found: $HOSTS_FILE"
        return
    fi

    grep -v '^#' "$HOSTS_FILE" | grep -v '^$'
}

# Ask systemd to reload CoreDNS when the service is currently active.
# Returns: 1 when the reload command fails; an inactive service only
#          produces a warning.
reload_config() {
    if ! systemctl is-active --quiet coredns; then
        log_warn "CoreDNS service is not running"
        return
    fi

    if ! systemctl reload coredns; then
        log_error "Failed to reload CoreDNS configuration"
        return 1
    fi

    log_info "CoreDNS configuration reloaded"
}

# Report the systemd status, the HTTP health endpoint and a live DNS query.
# The network probes are time-bounded so a hung server cannot block the
# command. The DNS test inspects the response code, because dig exits 0
# for any reply it receives, including SERVFAIL and NXDOMAIN.
check_status() {
    log_info "CoreDNS Service Status:"
    systemctl status coredns --no-pager

    log_info "Health Check:"
    if curl -fsS --max-time 5 http://localhost:8080/health >/dev/null 2>&1; then
        log_info "Health check: PASSED"
    else
        log_error "Health check: FAILED"
    fi

    log_info "DNS Query Test:"
    if dig @localhost google.com +time=3 +tries=1 +noall +comments 2>/dev/null \
        | grep -q 'status: NOERROR'; then
        log_info "DNS query test: PASSED"
    else
        log_error "DNS query test: FAILED"
    fi
}

# Show the most recent CoreDNS journal entries.
# Arguments: $1 - number of lines to show (50 when omitted or empty)
view_logs() {
    local count=50
    if [[ -n "${1:-}" ]]; then
        count="$1"
    fi

    log_info "CoreDNS Logs (last $count lines):"
    journalctl -u coredns -n "$count" --no-pager
}

# Archive /etc/coredns into a timestamped tarball under /var/backups/coredns.
# Returns: 0 on success; 1 when the backup directory cannot be created or
#          tar fails (a partial archive is removed and no success message
#          is logged).
backup_config() {
    local backup_dir="/var/backups/coredns"
    local timestamp archive
    timestamp=$(date +%Y%m%d_%H%M%S)
    archive="$backup_dir/coredns_config_$timestamp.tar.gz"

    if ! mkdir -p "$backup_dir"; then
        log_error "Failed to create backup directory: $backup_dir"
        return 1
    fi

    if ! tar -czf "$archive" -C /etc coredns/; then
        rm -f -- "$archive"
        log_error "Configuration backup failed: $archive"
        return 1
    fi

    log_info "Configuration backed up to: $archive"
}

# Print the request, response and cache-hit counters from the local
# Prometheus metrics endpoint.
show_stats() {
    log_info "CoreDNS Performance Statistics:"

    if ! command -v curl >/dev/null 2>&1; then
        log_warn "curl not found, cannot fetch metrics"
        return
    fi

    echo "Prometheus Metrics:"
    curl -s http://localhost:9153/metrics \
        | grep -E "(coredns_dns_requests_total|coredns_dns_responses_total|coredns_cache_hits_total)"
}

# Command dispatcher: route the first argument to the matching helper.
# An unknown or missing command prints the usage text and exits with 1.
main() {
    local action="${1:-}"

    case "$action" in
        add-host)    add_host "$2" "$3" ;;
        remove-host) remove_host "$2" ;;
        list-hosts)  list_hosts ;;
        reload)      reload_config ;;
        status)      check_status ;;
        logs)        view_logs "$2" ;;
        backup)      backup_config ;;
        stats)       show_stats ;;
        *)
            printf '%s\n' \
                "Usage: $0 {add-host|remove-host|list-hosts|reload|status|logs|backup|stats}" \
                "" \
                "Commands:" \
                "  add-host <ip> <hostname>  - Add a host record" \
                "  remove-host <hostname>    - Remove a host record" \
                "  list-hosts               - List all host records" \
                "  reload                   - Reload CoreDNS configuration" \
                "  status                   - Check CoreDNS status" \
                "  logs [lines]             - View CoreDNS logs" \
                "  backup                   - Backup configuration" \
                "  stats                    - Show performance statistics"
            exit 1
            ;;
    esac
}

main "$@"

自动化运维脚本
#

#!/bin/bash
# CoreDNS 自动化运维脚本

# Probe the health endpoint; on failure restart CoreDNS, up to max_retries
# times, then probe once more so a successful final restart is not
# reported as an outage.
# Returns: 0 once a probe succeeds; 1 (after sending an alert) when the
#          service is still unhealthy after the last restart.
health_check_and_recovery() {
    local max_retries=3
    local attempt

    for (( attempt = 1; attempt <= max_retries; attempt++ )); do
        if curl -fsS --max-time 5 http://localhost:8080/health >/dev/null 2>&1; then
            log_info "Health check passed"
            return 0
        fi

        log_warn "Health check failed, attempt $attempt/$max_retries"

        # Try to recover by restarting, then give the service time to start.
        systemctl restart coredns
        sleep 10
    done

    # Check the result of the last restart before declaring failure.
    if curl -fsS --max-time 5 http://localhost:8080/health >/dev/null 2>&1; then
        log_info "Health check passed"
        return 0
    fi

    log_error "Health check failed after $max_retries attempts"
    send_alert "CoreDNS health check failed after $max_retries attempts"
    return 1
}

# Log the total CPU and memory percentage of all coredns processes and
# raise an alert when either exceeds 80%.
# Returns: 0 normally; 1 when no coredns process is running.
performance_monitor() {
    local pids cpu_usage memory_usage

    # -x matches the process name exactly, so helper scripts whose names
    # merely contain "coredns" are not counted. PIDs are joined with commas
    # into the list form `ps -p` accepts.
    pids=$(pgrep -x coredns | paste -sd, -)
    if [[ -z "$pids" ]]; then
        log_warn "CoreDNS process not found"
        return 1
    fi

    # A trailing '=' on the column name suppresses the header line.
    cpu_usage=$(ps -o %cpu= -p "$pids" | awk '{sum+=$1} END {printf "%.1f", sum}')
    memory_usage=$(ps -o %mem= -p "$pids" | awk '{sum+=$1} END {printf "%.1f", sum}')

    log_info "CoreDNS Performance: CPU: ${cpu_usage}%, Memory: ${memory_usage}%"

    # awk performs the floating-point comparison, so bc is not required.
    if awk -v value="$cpu_usage" 'BEGIN { exit !(value > 80) }'; then
        log_warn "High CPU usage detected: ${cpu_usage}%"
        send_alert "CoreDNS high CPU usage: ${cpu_usage}%"
    fi

    if awk -v value="$memory_usage" 'BEGIN { exit !(value > 80) }'; then
        log_warn "High memory usage detected: ${memory_usage}%"
        send_alert "CoreDNS high memory usage: ${memory_usage}%"
    fi

    return 0
}

# Send an alert to the optional webhook and always record it in syslog.
# Globals:   COREDNS_ALERT_WEBHOOK_URL (read) - webhook endpoint; when it is
#            unset or empty only syslog is used. An environment variable
#            replaces the former in-code placeholder, which was non-empty
#            and made every alert POST to an invalid URL.
# Arguments: $1 - alert text
send_alert() {
    local message="$1"
    local webhook_url="${COREDNS_ALERT_WEBHOOK_URL:-}"

    if [[ -n "$webhook_url" ]]; then
        # Escape backslashes, double quotes and newlines so the message
        # cannot break out of the JSON string.
        local escaped=${message//\\/\\\\}
        escaped=${escaped//\"/\\\"}
        escaped=${escaped//$'\n'/\\n}

        curl -fsS --max-time 10 -X POST -H 'Content-type: application/json' \
            --data "{\"text\":\"🚨 CoreDNS Alert: $escaped\"}" \
            "$webhook_url" >/dev/null \
            || logger -t coredns-monitor "Failed to deliver alert to webhook"
    fi

    # Record the alert in the system log.
    logger -t coredns-monitor "$message"
}

# Install the monitoring cron entries for the current user.
# Idempotent: an entry already present verbatim is not added again, so
# repeated runs do not pile up duplicates.
# NOTE(review): the entries call `coredns-manage.sh health-check` and
# `coredns-manage.sh performance`; confirm that the installed management
# script actually implements those subcommands.
setup_cron_jobs() {
    local -a entries=(
        "*/5 * * * * /usr/local/bin/coredns-manage.sh health-check"
        "0 2 * * * /usr/local/bin/coredns-manage.sh backup"
        "*/10 * * * * /usr/local/bin/coredns-manage.sh performance"
    )
    local current entry

    # A user without a crontab makes `crontab -l` fail; treat that as empty.
    current=$(crontab -l 2>/dev/null || true)

    for entry in "${entries[@]}"; do
        if ! grep -qxF -- "$entry" <<<"$current"; then
            current+="${current:+$'\n'}$entry"
        fi
    done

    printf '%s\n' "$current" | crontab -

    log_info "Cron jobs configured for automated monitoring"
}

测试和验证
#

功能测试
#

#!/bin/bash
# CoreDNS 功能测试脚本

# 测试配置
DNS_SERVER="192.168.1.10"
TEST_DOMAINS=("google.com" "github.com" "local.lan")
INTERNAL_DOMAINS=("server1.local.lan" "api.internal.local")

# Check that every domain in TEST_DOMAINS resolves with status NOERROR.
# dig exits 0 for any reply it receives (including NXDOMAIN and SERVFAIL),
# so the response header is inspected rather than the exit status.
# Globals: DNS_SERVER, TEST_DOMAINS (read)
# Returns: 0 when all domains pass; 1 when at least one fails (every
#          domain is still tested and reported).
test_dns_resolution() {
    log_info "Testing DNS resolution..."

    local domain
    local failed=0

    for domain in "${TEST_DOMAINS[@]}"; do
        if dig @"$DNS_SERVER" "$domain" +time=3 +tries=1 +noall +comments 2>/dev/null \
            | grep -q 'status: NOERROR'; then
            log_info "✓ DNS resolution test passed for $domain"
        else
            log_error "✗ DNS resolution test failed for $domain"
            failed=1
        fi
    done

    return "$failed"
}

# Internal domain test
#
# Resolves every name in INTERNAL_DOMAINS through DNS_SERVER. A failure is
# only reported as a warning because these names may not exist in every
# environment.
#
# Globals:   DNS_SERVER (read), INTERNAL_DOMAINS (read)
# Arguments: none
# Returns:   0
test_internal_domains() {
    local domain answer

    log_info "Testing internal domain resolution..."

    for domain in "${INTERNAL_DOMAINS[@]}"; do
        # dig exits 0 even for NXDOMAIN, so an empty +short answer is what
        # actually tells us the name did not resolve.
        if answer=$(dig @"$DNS_SERVER" "$domain" +short 2>/dev/null) \
            && [[ -n "$answer" ]]; then
            log_info "✓ Internal domain test passed for $domain"
        else
            log_warn "⚠ Internal domain test failed for $domain (may be expected)"
        fi
    done

    return 0
}

# Performance test
#
# Runs dnsperf (when installed) against DNS_SERVER for 10 seconds with 10
# clients, using a small generated query file.
#
# Globals:   DNS_SERVER (read), TMPDIR (read, optional)
# Arguments: none
# Returns:   0 when dnsperf succeeded or is not installed; 1 when the query
#            file could not be created; otherwise the exit status of dnsperf
test_performance() {
    local query_file status=0

    log_info "Running performance test..."

    # dnsperf is optional tooling: skip quietly when it is missing
    if ! command -v dnsperf >/dev/null 2>&1; then
        log_warn "dnsperf not available, skipping performance test"
        return 0
    fi

    # mktemp instead of a fixed /tmp name: no clashes between parallel runs
    # and no predictable path another user could pre-create
    if ! query_file=$(mktemp "${TMPDIR:-/tmp}/coredns_queries.XXXXXX"); then
        log_warn "Unable to create temporary query file, skipping performance test"
        return 1
    fi

    printf '%s\n' \
        "google.com A" \
        "github.com A" \
        "stackoverflow.com A" > "$query_file"

    # Capture the status instead of letting a failure skip the cleanup
    dnsperf -s "$DNS_SERVER" -d "$query_file" -l 10 -c 10 || status=$?
    rm -f -- "$query_file"

    if (( status != 0 )); then
        log_warn "dnsperf exited with status $status"
    fi
    return "$status"
}

# Cache test
#
# Queries the same name twice and compares the "Query time" values reported
# by dig; a faster second answer suggests it was served from the cache.
#
# Globals:   DNS_SERVER (read)
# Arguments: none
# Returns:   0 (the result is informational only)
test_cache() {
    local time1 time2

    log_info "Testing DNS cache..."

    # First query
    time1=$(dig @"$DNS_SERVER" google.com 2>/dev/null \
        | awk '/Query time/ {print $4}') || true

    # Second query (should be answered from the cache)
    time2=$(dig @"$DNS_SERVER" google.com 2>/dev/null \
        | awk '/Query time/ {print $4}') || true

    # Without two numeric timings (server unreachable, unexpected dig output)
    # there is nothing to compare, and -lt would fail on the empty strings.
    if [[ ! "$time1" =~ ^[0-9]+$ || ! "$time2" =~ ^[0-9]+$ ]]; then
        log_warn "⚠ Cache test inconclusive (no query time reported)"
        return 0
    fi

    log_info "First query: ${time1}ms, Second query: ${time2}ms"

    if (( time2 < time1 )); then
        log_info "✓ Cache test passed (second query faster)"
    else
        log_warn "⚠ Cache test inconclusive"
    fi

    return 0
}

# Load test
#
# Fires 50 dig queries at DNS_SERVER in parallel and reports how many of
# them failed.
#
# Globals:   DNS_SERVER (read)
# Arguments: none
# Returns:   0 when every query got a reply, 1 when at least one failed
test_load() {
    local -r total=50
    local -a pids=()
    local i pid failed=0

    log_info "Running load test..."

    # Concurrent query test
    for (( i = 1; i <= total; i++ )); do
        dig @"$DNS_SERVER" "test$i.google.com" >/dev/null 2>&1 &
        pids+=("$!")
    done

    # Wait per PID: a bare `wait` would discard every exit status and would
    # also block on unrelated background jobs of the calling shell.
    for pid in "${pids[@]}"; do
        wait "$pid" || failed=$(( failed + 1 ))
    done

    if (( failed > 0 )); then
        log_warn "⚠ Load test completed with $failed of $total queries failing"
        return 1
    fi

    log_info "✓ Load test completed (50 concurrent queries)"
}

# Run every test
#
# Executes the individual test functions one after another, in a fixed
# order, framed by a start and an end log line.
run_all_tests() {
    local suite

    log_info "Starting CoreDNS comprehensive tests..."

    for suite in \
        test_dns_resolution \
        test_internal_domains \
        test_cache \
        test_load \
        test_performance; do
        "$suite"
    done

    log_info "All tests completed!"
}

# Script entry point: run the whole test suite
run_all_tests

故障排除指南
#

常见问题和解决方案
#

# 1. Service fails to start
#
# Checks the usual reasons for a failed start: unreadable configuration,
# invalid configuration, and port 53 already being taken.
#
# Globals:   COREDNS_BIN (read, optional)      - binary, default /usr/local/bin/coredns
#            COREDNS_COREFILE (read, optional) - config, default /etc/coredns/Corefile
# Arguments: none
# Outputs:   log lines; the listeners found on port 53 are printed to stdout
# Returns:   0 when the configuration is readable and valid, 1 otherwise
troubleshoot_startup() {
    local coredns_bin="${COREDNS_BIN:-/usr/local/bin/coredns}"
    local corefile="${COREDNS_COREFILE:-/etc/coredns/Corefile}"
    local listeners

    log_info "Troubleshooting startup issues..."

    # Check permissions first: an unreadable file would otherwise be reported
    # as a syntax error and the real cause would never be shown.
    if [[ -r "$corefile" ]]; then
        log_info "✓ Configuration file is readable"
    else
        log_error "✗ Configuration file permission issue"
        return 1
    fi

    # Check configuration syntax
    # NOTE(review): the upstream CoreDNS manual lists -conf, -dns.port,
    # -pidfile, -plugins, -quiet and -version but no -validate flag; confirm
    # that the installed build accepts it, otherwise this check always fails.
    if "$coredns_bin" -conf="$corefile" -validate; then
        log_info "✓ Configuration syntax is valid"
    else
        log_error "✗ Configuration syntax error"
        return 1
    fi

    # Check port usage. The trailing whitespace in the pattern anchors the
    # port number, so 5353 (mDNS), 5300 and similar do not match.
    listeners=$(netstat -tulpn 2>/dev/null | grep -E ':53[[:space:]]' || true)
    if [[ -n "$listeners" ]]; then
        log_warn "Port 53 is already in use:"
        printf '%s\n' "$listeners"
    fi

    return 0
}

# 2. DNS resolution fails
#
# Verifies that the public upstream resolvers answer and that the local
# server on 127.0.0.1 actually returns records.
#
# Globals:   none
# Arguments: none
# Returns:   status of the last log call
troubleshoot_resolution() {
    local upstream answer

    log_info "Troubleshooting DNS resolution..."

    # Check upstream DNS. One try with a short timeout keeps an unreachable
    # upstream from stalling the run for dig's default retry cycle.
    for upstream in 8.8.8.8 8.8.4.4 1.1.1.1; do
        if dig @"$upstream" google.com +short +time=2 +tries=1 >/dev/null 2>&1; then
            log_info "✓ Upstream DNS $upstream is reachable"
        else
            log_error "✗ Upstream DNS $upstream is unreachable"
        fi
    done

    # Check local resolution. dig exits 0 for SERVFAIL/NXDOMAIN replies too,
    # so "working" additionally requires a non-empty answer.
    if answer=$(dig @127.0.0.1 google.com +short +time=2 +tries=1 2>/dev/null) \
        && [[ -n "$answer" ]]; then
        log_info "✓ Local DNS resolution working"
    else
        log_error "✗ Local DNS resolution failed"
    fi
}

# 3. Performance problems
#
# Logs system-wide CPU and memory usage and the combined CPU/memory share of
# all CoreDNS processes.
#
# Globals:   none
# Arguments: none
# Returns:   0
troubleshoot_performance() {
    local cpu_usage memory_usage
    local coredns_pids coredns_stats coredns_cpu coredns_mem

    log_info "Troubleshooting performance issues..."

    # Check system resources.
    # Take the first number after "Cpu(s):" so both "%Cpu(s):  1.2 us," and
    # the older "Cpu(s):  1.2%us," layouts of top are parsed.
    cpu_usage=$(top -bn1 | awk '/Cpu\(s\)/ {
        sub(/^[^:]*:[[:space:]]*/, "")
        sub(/[^0-9.].*/, "")
        print
        exit
    }') || true
    memory_usage=$(free | awk '/^Mem/ && $2 > 0 {printf "%.1f", $3 / $2 * 100.0}') || true

    log_info "System CPU usage: ${cpu_usage}%"
    log_info "System memory usage: ${memory_usage}%"

    # Check the CoreDNS processes.
    # -x matches the exact process name (not scripts containing "coredns");
    # -d, yields the comma-separated list ps needs when several PIDs exist.
    coredns_pids=$(pgrep -x -d, coredns) || coredns_pids=""
    if [[ -n "$coredns_pids" ]]; then
        coredns_stats=$(ps -o %cpu,%mem --no-headers -p "$coredns_pids" \
            | awk '{cpu += $1; mem += $2} END {print cpu + 0, mem + 0}') || true
        read -r coredns_cpu coredns_mem <<<"$coredns_stats"
        log_info "CoreDNS CPU usage: ${coredns_cpu}%"
        log_info "CoreDNS memory usage: ${coredns_mem}%"
    else
        log_warn "CoreDNS process not found"
    fi

    return 0
}

# 4. Network connectivity problems
#
# Checks that CoreDNS is listening on port 53 and, when ufw is installed,
# that a rule allowing port 53 exists.
#
# Globals:   none
# Arguments: none
# Returns:   status of the last log call
troubleshoot_network() {
    log_info "Troubleshooting network connectivity..."

    # Check the listening port. The whitespace after ":53" anchors the port so
    # that 5353, 5300 and similar are not mistaken for 53.
    # NOTE(review): netstat prints process names only for sockets the caller
    # may inspect; presumably this needs to run as root — confirm.
    if netstat -tulpn 2>/dev/null | grep -E ':53[[:space:]].*coredns' >/dev/null 2>&1; then
        log_info "✓ CoreDNS is listening on port 53"
    else
        log_error "✗ CoreDNS is not listening on port 53"
    fi

    # Check the firewall. Match "53", "53/udp" or "53/tcp" as a whole field
    # so that rules for 5353, 8053 or an address ending in .53 do not count.
    if command -v ufw >/dev/null 2>&1; then
        if ufw status | grep -E '(^|[[:space:]])53(/(tcp|udp))?[[:space:]].*ALLOW' >/dev/null 2>&1; then
            log_info "✓ Firewall allows DNS traffic"
        else
            log_warn "⚠ Firewall may be blocking DNS traffic"
        fi
    fi
}

# Full troubleshooting run
#
# Walks through the four troubleshooting stages in order and finishes by
# writing the diagnostic report.
comprehensive_troubleshoot() {
    local stage

    log_info "Running comprehensive troubleshooting..."

    for stage in startup resolution performance network; do
        "troubleshoot_${stage}"
    done

    # Produce the diagnostic report
    generate_diagnostic_report
}

# Generate the diagnostic report
#
# Collects system, service, configuration, log, network and process
# information into a freshly created report file and logs its path.
#
# Globals:   TMPDIR (read, optional)
# Arguments: none
# Outputs:   the report file; its path is announced through log_info
# Returns:   0 on success, 1 when the report file could not be created
generate_diagnostic_report() {
    local report_file

    # mktemp creates the file atomically under an unpredictable name and with
    # owner-only permissions; the report embeds the full Corefile.
    if ! report_file=$(mktemp "${TMPDIR:-/tmp}/coredns_diagnostic_$(date +%Y%m%d_%H%M%S).XXXXXX"); then
        log_error "Failed to create diagnostic report file"
        return 1
    fi

    # Every probe is allowed to fail: a stopped service or a missing tool is
    # itself diagnostic information, so stderr is captured in the report and
    # a fallback line replaces silent gaps.
    {
        echo "CoreDNS Diagnostic Report"
        echo "========================"
        echo "Generated: $(date)"
        echo ""

        echo "System Information:"
        uname -a || echo "(unavailable)"
        echo ""

        echo "CoreDNS Version:"
        /usr/local/bin/coredns -version || echo "(unavailable)"
        echo ""

        echo "Service Status:"
        # systemctl status exits non-zero for inactive/failed units
        systemctl status coredns --no-pager || true
        echo ""

        echo "Configuration:"
        cat /etc/coredns/Corefile || echo "(unavailable)"
        echo ""

        echo "Recent Logs:"
        journalctl -u coredns -n 50 --no-pager || echo "(unavailable)"
        echo ""

        echo "Network Status:"
        # whitespace after :53 keeps ports such as 5353 out of the listing
        netstat -tulpn | grep -E ':53[[:space:]]' || echo "(nothing listening on port 53)"
        echo ""

        echo "Process Information:"
        # [c]oredns keeps the grep process itself out of the result
        ps aux | grep '[c]oredns' || echo "(no coredns process found)"
        echo ""

    } > "$report_file" 2>&1

    log_info "Diagnostic report generated: $report_file"
}

性能优化
#

配置优化
#

# High-performance Corefile configuration
# Writes the tuned configuration to /etc/coredns/Corefile.optimized. The
# heredoc delimiter is quoted ('EOF'), so the content is written literally
# with no shell expansion.
cat > /etc/coredns/Corefile.optimized << 'EOF'
.:53 {
    bind 0.0.0.0

    # 优化的缓存配置
    cache {
        success 65536 7200 300    # 增大缓存大小和 TTL
        denial 16384 1800 60      # 增大否定缓存
        prefetch 2 60m 20%        # 增强预取
        serve_stale               # 提供过期缓存
    }

    # 优化的转发配置
    forward . 8.8.8.8 8.8.4.4 1.1.1.1 1.0.0.1 {
        max_fails 2
        expire 5s
        health_check 3s
        policy sequential
        prefer_udp
        max_concurrent 1000
    }

    # 负载均衡
    loadbalance round_robin

    # 减少日志级别
    log {
        class error
    }

    # 错误处理优化
    errors {
        consolidate 5m ".* i/o timeout"
        consolidate 30s ".*"
    }

    # 健康检查
    health :8080 {
        lameduck 5s
    }

    # 监控
    prometheus :9153

    # 自动重载
    reload 30s  # 减少重载频率
}
EOF

系统优化
#

# Kernel parameter tuning
# Writes a sysctl drop-in to /etc/sysctl.d/99-coredns.conf (literal heredoc,
# no shell expansion) with network buffer, file-descriptor and PID limits.
# NOTE(review): net.ipv4.udp_mem is presumably counted in memory pages rather
# than bytes — confirm that the 16777216 upper bound is intended.
cat > /etc/sysctl.d/99-coredns.conf << 'EOF'
# 网络优化
net.core.rmem_default = 262144
net.core.rmem_max = 16777216
net.core.wmem_default = 262144
net.core.wmem_max = 16777216
net.core.netdev_max_backlog = 5000
net.ipv4.udp_mem = 102400 873800 16777216
net.ipv4.udp_rmem_min = 8192
net.ipv4.udp_wmem_min = 8192

# 文件描述符
fs.file-max = 1048576

# 进程限制
kernel.pid_max = 4194304
EOF

# Apply the settings from the drop-in right away (no reboot needed)
sysctl -p /etc/sysctl.d/99-coredns.conf

# Per-user resource limits
# Raises the open-file (nofile) and process (nproc) limits, soft and hard,
# for the coredns user via /etc/security/limits.d/coredns.conf.
# NOTE(review): limits.d entries are presumably applied by pam_limits at
# session login; a systemd-managed service may need LimitNOFILE=/LimitNPROC=
# in its unit file instead — verify against how CoreDNS is started.
cat > /etc/security/limits.d/coredns.conf << 'EOF'
coredns soft nofile 1048576
coredns hard nofile 1048576
coredns soft nproc 1048576
coredns hard nproc 1048576
EOF

总结
#

部署方式对比
#

| 部署方式 | 优势 | 劣势 | 适用场景 |
| --- | --- | --- | --- |
| 二进制部署 | 性能最优、资源占用少、配置灵活 | 管理复杂、更新麻烦 | 生产环境、高性能要求 |
| Docker 部署 | 部署简单、环境隔离、易于管理 | 性能略低、资源开销 | 开发测试、快速部署 |
| Kubernetes 部署 | 高可用、自动扩缩容、服务发现 | 复杂度高、资源要求高 | 云原生环境、大规模集群 |

最佳实践总结
#

  1. 配置管理

    • 使用版本控制管理配置文件
    • 实施配置验证和测试
    • 建立配置变更流程
  2. 监控告警

    • 部署 Prometheus + Grafana 监控
    • 配置关键指标告警
    • 建立故障响应流程
  3. 高可用设计

    • 部署多实例负载均衡
    • 实施健康检查和自动故障转移
    • 定期备份配置和数据
  4. 性能优化

    • 合理配置缓存策略
    • 优化系统参数
    • 监控性能指标
  5. 安全加固

    • 实施访问控制
    • 定期更新版本
    • 监控安全事件

进阶学习资源
#

通过本指南的学习和实践,您将能够成功部署和管理企业级的 CoreDNS 服务,为您的基础设施提供稳定、高效的 DNS 解析服务。

相关文章

Coredns 出现间断性无法正常解析域名问题
·576 字·3 分钟
Kubernetes Coredns Dns
Kubernetes 集群控制平面组件频繁重启问题排查记录
·244 字·2 分钟
Kubernetes Coredns Flannel Fix
企业级 Jenkins CI/CD 平台部署与配置完整指南
·4484 字·22 分钟
DevOps Jenkins CI/CD Devops Automation Pipeline