CoreDNS 是现代化的 DNS 服务器,以其插件化架构、高性能和易配置性成为 Kubernetes 默认 DNS 解决方案。本指南将从基础部署到企业级应用,全面介绍 CoreDNS 的部署、配置、优化和管理。
CoreDNS 概述#
什么是 CoreDNS#
CoreDNS 是用 Go 语言编写的现代化 DNS 服务器,具有以下核心特性:
核心优势#
- 插件化架构: 基于 Caddy 框架,支持丰富的插件生态
- 云原生设计: Kubernetes 1.13+ 默认 DNS 服务器
- 高性能: Go 语言编写,支持高并发处理
- 配置简单: 使用 Corefile DSL 语法,易于理解和维护
- 服务发现: 支持多种服务发现机制
- 可观测性: 内置监控和日志功能
应用场景#
graph TD
A["CoreDNS 应用场景"] --> B["企业内网 DNS"]
A --> C["Kubernetes 集群 DNS"]
A --> D["服务发现"]
A --> E["负载均衡"]
A --> F["DNS 代理/转发"]
A --> G["广告拦截"]
A --> H["安全过滤"]
B --> B1["内网域名解析"]
B --> B2["主机名管理"]
B --> B3["服务注册"]
C --> C1["Pod 域名解析"]
C --> C2["Service 发现"]
C --> C3["跨命名空间通信"]
D --> D1["微服务注册"]
D --> D2["动态配置"]
D --> D3["健康检查"]
插件生态系统#
CoreDNS 的强大之处在于其丰富的插件系统:
核心插件#
| 插件类别 | 插件名称 | 功能描述 |
|---|---|---|
| 基础功能 | hosts | 静态主机记录 |
| 基础功能 | file | 区域文件解析 |
| 基础功能 | forward | DNS 转发 |
| 基础功能 | cache | DNS 缓存 |
| 服务发现 | kubernetes | K8s 服务发现 |
| 服务发现 | etcd | etcd 服务发现 |
| 服务发现 | consul | Consul 服务发现 |
| 负载均衡 | loadbalance | 负载均衡 |
| 负载均衡 | health | 健康检查 |
| 监控日志 | log | 访问日志 |
| 监控日志 | metrics(Corefile 中的指令名为 prometheus) | Prometheus 指标 |
| 监控日志 | trace | 链路追踪 |
| 安全功能 | acl | 访问控制 |
| 安全功能 | dnssec | DNSSEC 支持 |
| 安全功能 | blocklist | 域名黑名单 |
环境准备#
系统要求#
# 支持的操作系统
- Linux (推荐 Ubuntu 20.04+, CentOS 8+)
- Windows Server 2019+
- macOS 10.15+
# 硬件要求
- CPU: 1 核心以上
- 内存: 512MB 以上(推荐 2GB+)
- 磁盘: 100MB 以上
- 网络: 支持 UDP/TCP 53 端口
# 依赖软件
- systemd (Linux 服务管理)
- Docker (容器化部署)
- Kubernetes (集群部署)
网络规划#
# 示例环境配置
主服务器: 192.168.1.10 (主 DNS)
备用服务器: 192.168.1.11 (备 DNS)
管理网段: 192.168.1.0/24
服务网段: 10.0.0.0/16
二进制部署方式#
自动化安装脚本#
#!/bin/bash
# CoreDNS 自动化安装脚本
set -euo pipefail
# 配置变量
COREDNS_VERSION="1.11.1"
INSTALL_DIR="/usr/local/bin"
CONFIG_DIR="/etc/coredns"
DATA_DIR="/var/lib/coredns"
LOG_DIR="/var/log/coredns"
USER="coredns"
GROUP="coredns"
# 颜色输出
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Logging helpers. Each takes the message as $1 and prefixes it with a
# coloured severity tag built from the GREEN/YELLOW/RED/NC globals.
# Warnings and errors are written to stderr so that they are not swallowed
# when a function that logs is called inside a command substitution.
# The message is printed with %s, so backslashes in it are emitted literally.
log_info() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}
log_warn() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1" >&2
}
log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}
# Detect the CPU architecture and print the matching CoreDNS release suffix
# (amd64, arm64 or arm) on stdout.
# Outputs: the suffix on stdout; the error message on stderr.
# Exits:   1 when `uname -m` reports an architecture that is not mapped here.
detect_arch() {
  local machine
  machine=$(uname -m)
  case "$machine" in
    x86_64 | amd64)
      echo "amd64"
      ;;
    aarch64 | arm64)
      echo "arm64"
      ;;
    armv7l)
      echo "arm"
      ;;
    *)
      # Callers capture stdout, so the diagnostic must go to stderr or it
      # would end up inside the captured "architecture" value.
      log_error "Unsupported architecture: $machine" >&2
      exit 1
      ;;
  esac
}
# Detect the operating system from bash's $OSTYPE and print the CoreDNS
# release OS name (linux or darwin) on stdout.
# Outputs: the OS name on stdout; the error message on stderr.
# Exits:   1 for any other $OSTYPE.
detect_os() {
  case "$OSTYPE" in
    linux-gnu* | linux-musl*)
      # linux-musl is what bash reports on musl-based distributions.
      echo "linux"
      ;;
    darwin*)
      echo "darwin"
      ;;
    *)
      # Callers capture stdout, so keep the diagnostic on stderr.
      log_error "Unsupported OS: $OSTYPE" >&2
      exit 1
      ;;
  esac
}
# Download the CoreDNS release tarball for this OS/architecture and install
# the binary as $INSTALL_DIR/coredns.
# Globals: COREDNS_VERSION, INSTALL_DIR (read)
# Outputs: progress through log_info, failures through log_error
# Exits:   1 if detection, download, extraction or installation fails
download_coredns() {
  local os arch download_url temp_dir archive rc=0

  # Assign separately from `local`: `local v=$(cmd)` always succeeds and
  # would hide a failing detector.
  os=$(detect_os) || exit 1
  arch=$(detect_arch) || exit 1
  download_url="https://github.com/coredns/coredns/releases/download/v${COREDNS_VERSION}/coredns_${COREDNS_VERSION}_${os}_${arch}.tgz"
  log_info "Downloading CoreDNS v${COREDNS_VERSION} for ${os}/${arch}..."

  temp_dir=$(mktemp -d) || {
    log_error "Failed to create a temporary directory"
    exit 1
  }
  archive="${temp_dir}/coredns.tgz"

  # Run every step in one guarded group so the temporary directory is
  # removed on failure as well as on success. The working directory is
  # left untouched.
  {
    if command -v wget >/dev/null 2>&1; then
      wget -q "$download_url" -O "$archive"
    elif command -v curl >/dev/null 2>&1; then
      # -f makes curl fail on HTTP errors instead of saving the error page.
      curl -fsSL "$download_url" -o "$archive"
    else
      log_error "Neither wget nor curl found. Please install one of them."
      false
    fi &&
      tar -xzf "$archive" -C "$temp_dir" &&
      # `install` copies with the target owner and mode in one step, so the
      # binary does not keep the ownership of the invoking user.
      sudo install -m 0755 "${temp_dir}/coredns" "${INSTALL_DIR}/coredns"
  } || rc=$?

  rm -rf -- "$temp_dir"

  if ((rc != 0)); then
    log_error "Failed to download or install CoreDNS from $download_url"
    exit 1
  fi
  log_info "CoreDNS binary installed to $INSTALL_DIR/coredns"
}
# Create the service account (when it does not exist yet) and the
# configuration, data and log directories, owned by $USER:$GROUP, mode 755.
# Globals: USER, GROUP, CONFIG_DIR, DATA_DIR, LOG_DIR (read)
setup_user_and_dirs() {
  local -a service_dirs=("$CONFIG_DIR" "$DATA_DIR" "$LOG_DIR")

  log_info "Creating user and directories..."

  # Only add the account when `id` cannot find it.
  id "$USER" >/dev/null 2>&1 || {
    sudo useradd --system --no-create-home --shell /sbin/nologin "$USER"
    log_info "Created user: $USER"
  }

  sudo mkdir -p "${service_dirs[@]}"
  sudo chown "$USER:$GROUP" "${service_dirs[@]}"
  sudo chmod 755 "${service_dirs[@]}"
  log_info "Created directories: $CONFIG_DIR, $DATA_DIR, $LOG_DIR"
}
# Write the default Corefile to $CONFIG_DIR/Corefile and hand it to
# $USER:$GROUP. The heredoc delimiter is quoted, so the content is written
# literally with no shell expansion.
# Globals: CONFIG_DIR, USER, GROUP (read)
generate_config() {
  local corefile="$CONFIG_DIR/Corefile"

  log_info "Generating CoreDNS configuration..."
  sudo tee "$corefile" > /dev/null << 'EOF'
# CoreDNS 主配置文件
# 监听所有接口的 53 端口
.:53 {
# 绑定地址
bind 0.0.0.0
# 静态主机记录
hosts {
# 自定义主机记录
ttl 60
reload 1m
fallthrough
}
# DNS 转发
forward . 8.8.8.8 8.8.4.4 1.1.1.1 {
max_fails 3
expire 10s
health_check 5s
policy sequential
}
# DNS 缓存
cache {
success 65536 3600 300
denial 8192 600 60
prefetch 1 60m 10%
}
# 自动重载配置
reload 6s
# 负载均衡
loadbalance round_robin
# 日志记录
log {
class error
}
# 错误处理
errors
# 健康检查
health :8080
# Prometheus 监控
prometheus :9153
}
# 内网域名解析
local.lan:53 {
bind 0.0.0.0
file /etc/coredns/zones/local.lan.zone
log {
class all
}
errors
}
EOF
  sudo chown "$USER:$GROUP" "$corefile"
  log_info "Generated Corefile at $corefile"
}
# Create $CONFIG_DIR/zones and write the example local.lan zone file into
# it, then give the whole directory to $USER:$GROUP.
# The zone lines stay at column 0: in a zone file a line that begins with
# whitespace inherits the previous owner name.
# Globals: CONFIG_DIR, USER, GROUP (read)
create_zone_files() {
  local zones_dir="$CONFIG_DIR/zones"

  log_info "Creating zone files..."
  sudo mkdir -p "$zones_dir"

  # Example zone for the local.lan domain.
  sudo tee "$zones_dir/local.lan.zone" > /dev/null << 'EOF'
$ORIGIN local.lan.
$TTL 300
@ IN SOA ns1.local.lan. admin.local.lan. (
2023120101 ; Serial
3600 ; Refresh
1800 ; Retry
604800 ; Expire
300 ; Minimum TTL
)
@ IN NS ns1.local.lan.
ns1 IN A 192.168.1.10
; 示例主机记录
router IN A 192.168.1.1
server1 IN A 192.168.1.10
server2 IN A 192.168.1.11
EOF
  sudo chown -R "$USER:$GROUP" "$zones_dir"
  log_info "Created zone files in $CONFIG_DIR/zones/"
}
# Installer entry point: verify that root or passwordless sudo is
# available, run the installation steps in order, then print the follow-up
# instructions.
main() {
  local step

  log_info "Starting CoreDNS installation..."

  # Proceed only when already root or when sudo works non-interactively.
  if ! { [[ $EUID -eq 0 ]] || sudo -n true 2>/dev/null; }; then
    log_error "This script requires sudo privileges"
    exit 1
  fi

  for step in download_coredns setup_user_and_dirs generate_config create_zone_files; do
    "$step"
  done

  log_info "CoreDNS installation completed!"
  log_info "Next steps:"
  log_info "1. Review configuration: $CONFIG_DIR/Corefile"
  log_info "2. Create systemd service: systemctl enable coredns"
  log_info "3. Start service: systemctl start coredns"
}
# 执行主函数
main "$@"
手动安装步骤#
1. 下载和安装二进制文件#
# 设置版本和架构
COREDNS_VERSION="1.11.1"
ARCH=$(uname -m)
# 根据架构设置下载链接
case $ARCH in
x86_64)
DOWNLOAD_ARCH="amd64"
;;
aarch64|arm64)
DOWNLOAD_ARCH="arm64"
;;
armv7l)
DOWNLOAD_ARCH="arm"
;;
*)
echo "Unsupported architecture: $ARCH"
exit 1
;;
esac
# 下载 CoreDNS
wget "https://github.com/coredns/coredns/releases/download/v${COREDNS_VERSION}/coredns_${COREDNS_VERSION}_linux_${DOWNLOAD_ARCH}.tgz"
# 解压和安装
tar -xzf "coredns_${COREDNS_VERSION}_linux_${DOWNLOAD_ARCH}.tgz"
sudo mv coredns /usr/local/bin/
sudo chmod +x /usr/local/bin/coredns
# 验证安装
/usr/local/bin/coredns -version
2. 创建用户和目录结构#
# 创建系统用户
sudo useradd --system --no-create-home --shell /sbin/nologin coredns
# 创建目录结构
sudo mkdir -p /etc/coredns/{zones,conf.d}
sudo mkdir -p /var/lib/coredns
sudo mkdir -p /var/log/coredns
# 设置权限
sudo chown -R coredns:coredns /etc/coredns /var/lib/coredns /var/log/coredns
sudo chmod 755 /etc/coredns /var/lib/coredns /var/log/coredns
3. 高级配置文件#
# Write the advanced Corefile.
# The `log` plugin only accepts the `class` property and always writes to
# standard output, so the former `file /var/log/coredns/...` lines inside
# the log blocks were removed: they would make CoreDNS reject the Corefile.
# Under systemd the output is collected by the journal.
sudo tee /etc/coredns/Corefile > /dev/null << 'EOF'
# CoreDNS 主配置文件
# 全局配置块
.:53 {
    # 绑定地址和端口
    bind 0.0.0.0
    # 静态主机记录
    hosts /etc/coredns/hosts {
        ttl 60
        reload 1m
        fallthrough
    }
    # 区域文件
    file /etc/coredns/zones/local.lan.zone local.lan
    # DNS 转发配置
    forward . 8.8.8.8 8.8.4.4 1.1.1.1 {
        max_fails 3
        expire 10s
        health_check 5s
        policy sequential
        prefer_udp
    }
    # 缓存配置
    cache {
        success 65536 3600 300
        denial 8192 600 60
        prefetch 1 60m 10%
        serve_stale
    }
    # 负载均衡
    loadbalance round_robin
    # 自动重载
    reload 6s
    # 日志配置
    log {
        class error
    }
    # 错误处理
    errors {
        consolidate 5m ".* i/o timeout"
        consolidate 30s ".*"
    }
    # 健康检查端点
    health :8080 {
        lameduck 5s
    }
    # Prometheus 监控
    prometheus :9153
    # 访问控制
    acl {
        allow net 192.168.0.0/16
        allow net 10.0.0.0/8
        allow net 172.16.0.0/12
        block net 0.0.0.0/0
    }
}
# 内网域名配置
internal.local:53 {
    bind 0.0.0.0
    file /etc/coredns/zones/internal.local.zone
    log {
        class all
    }
    errors
}
# 反向解析配置
1.168.192.in-addr.arpa:53 {
    bind 0.0.0.0
    file /etc/coredns/zones/192.168.1.rev
    log {
        class all
    }
    errors
}
EOF
# 创建主机记录文件
sudo tee /etc/coredns/hosts > /dev/null << 'EOF'
# 静态主机记录
192.168.1.1 router.local gateway.local
192.168.1.10 dns1.local ns1.local
192.168.1.11 dns2.local ns2.local
192.168.1.20 web.local www.local
192.168.1.21 api.local
192.168.1.22 db.local database.local
EOF
# 设置权限
sudo chown coredns:coredns /etc/coredns/Corefile /etc/coredns/hosts
4. 创建 systemd 服务#
# 创建现代化的 systemd 服务文件
sudo tee /etc/systemd/system/coredns.service > /dev/null << 'EOF'
[Unit]
Description=CoreDNS DNS Server
Documentation=https://coredns.io/manual/toc/
After=network-online.target
Wants=network-online.target
AssertFileIsExecutable=/usr/local/bin/coredns
[Service]
Type=simple
User=coredns
Group=coredns
# 安全配置
NoNewPrivileges=true
PrivateTmp=true
PrivateDevices=true
ProtectHome=true
ProtectSystem=strict
ReadWritePaths=/var/lib/coredns /var/log/coredns /etc/coredns
# 能力配置
CapabilityBoundingSet=CAP_NET_BIND_SERVICE CAP_SETGID CAP_SETUID
AmbientCapabilities=CAP_NET_BIND_SERVICE CAP_SETGID CAP_SETUID
# 资源限制
LimitNOFILE=1048576
LimitNPROC=1048576
LimitCORE=infinity
# 工作目录
WorkingDirectory=/etc/coredns
# 启动命令
ExecStart=/usr/local/bin/coredns -conf=/etc/coredns/Corefile
ExecReload=/bin/kill -SIGUSR1 $MAINPID
# 重启策略
Restart=on-failure
RestartSec=5
KillMode=mixed
KillSignal=SIGINT
# 日志配置
StandardOutput=journal
StandardError=journal
SyslogIdentifier=coredns
[Install]
WantedBy=multi-user.target
EOF
# 创建服务覆盖目录
sudo mkdir -p /etc/systemd/system/coredns.service.d
# 创建环境变量配置
sudo tee /etc/systemd/system/coredns.service.d/environment.conf > /dev/null << 'EOF'
[Service]
Environment="GOMAXPROCS=2"
Environment="GODEBUG=madvdontneed=1"
EOF
# 重载 systemd 配置
sudo systemctl daemon-reload
5. 日志轮转配置#
# 创建 logrotate 配置
sudo tee /etc/logrotate.d/coredns > /dev/null << 'EOF'
/var/log/coredns/*.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
create 644 coredns coredns
postrotate
/bin/systemctl reload coredns.service > /dev/null 2>&1 || true
endscript
}
EOF
6. 启动和管理服务#
# 启动服务
sudo systemctl daemon-reload
sudo systemctl enable coredns.service
sudo systemctl start coredns.service
# 检查服务状态
sudo systemctl status coredns.service
# 查看日志
sudo journalctl -u coredns.service -f
# 重载配置
sudo systemctl reload coredns.service
# 重启服务
sudo systemctl restart coredns.service
容器化部署方式#
Docker 部署#
基础 Docker 部署#
# 创建配置目录
mkdir -p ~/coredns/{config,zones,logs}
# Create the Dockerfile.
# NOTE(review): the official coredns/coredns image appears to ship without a
# shell or package tools, so `RUN adduser ...` / `RUN chown ...` cannot run
# during the build and were removed; the image's own default user is kept.
# Confirm against the image you actually pull.
# The Corefile is created under config/ in the build context, so it is
# copied from there.
cat > ~/coredns/Dockerfile << 'EOF'
FROM coredns/coredns:1.11.1
# Bake in the custom configuration
COPY config/Corefile /etc/coredns/Corefile
COPY zones/ /etc/coredns/zones/
EXPOSE 53/udp 53/tcp 8080/tcp 9153/tcp
ENTRYPOINT ["/coredns"]
CMD ["-conf", "/etc/coredns/Corefile"]
EOF
# 创建配置文件
cat > ~/coredns/config/Corefile << 'EOF'
.:53 {
bind 0.0.0.0
hosts {
ttl 60
reload 1m
fallthrough
}
forward . 8.8.8.8 8.8.4.4 {
max_fails 3
expire 10s
health_check 5s
}
cache {
success 65536 3600 300
denial 8192 600 60
prefetch 1 60m 10%
}
reload 6s
log
errors
health :8080
prometheus :9153
}
EOF
# 构建镜像
cd ~/coredns
docker build -t custom-coredns:latest .
# Run the container. The bind-mount arguments are quoted so that a working
# directory containing spaces does not split them into several words.
docker run -d \
  --name coredns \
  --restart unless-stopped \
  -p 53:53/udp \
  -p 53:53/tcp \
  -p 8080:8080/tcp \
  -p 9153:9153/tcp \
  -v "$(pwd)/config:/etc/coredns:ro" \
  -v "$(pwd)/logs:/var/log/coredns" \
  custom-coredns:latest
Docker Compose 部署#
# docker-compose.yml
version: '3.8'
services:
coredns:
image: coredns/coredns:1.11.1
container_name: coredns
restart: unless-stopped
ports:
- "53:53/udp"
- "53:53/tcp"
- "8080:8080/tcp" # Health check
- "9153:9153/tcp" # Metrics
volumes:
- ./config/Corefile:/etc/coredns/Corefile:ro
- ./config/zones:/etc/coredns/zones:ro
- ./logs:/var/log/coredns
command: ["-conf", "/etc/coredns/Corefile"]
networks:
- dns-network
# NOTE(review): the official coredns/coredns image appears to contain neither
# a shell nor curl, so a curl-based in-container probe would always fail and
# mark the container unhealthy. The container healthcheck is disabled here;
# probe http://<host>:8080/health from outside instead (port 8080 is
# published above).
healthcheck:
  disable: true
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
# 可选:添加 Prometheus 监控
prometheus:
image: prom/prometheus:latest
container_name: prometheus
restart: unless-stopped
ports:
- "9090:9090"
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus-data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
networks:
- dns-network
# 可选:添加 Grafana 可视化
grafana:
image: grafana/grafana:latest
container_name: grafana
restart: unless-stopped
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin123
volumes:
- grafana-data:/var/lib/grafana
- ./monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
- ./monitoring/grafana/datasources:/etc/grafana/provisioning/datasources:ro
networks:
- dns-network
networks:
dns-network:
driver: bridge
volumes:
prometheus-data:
grafana-data:
Kubernetes 部署#
基础 Kubernetes 部署#
# coredns-namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: coredns-system
labels:
name: coredns-system
---
# coredns-configmap.yaml
# The Corefile enables the `ready` plugin on :8181 because the Deployment's
# readinessProbe requests /ready on port 8181; without it the pods would
# never be reported Ready.
apiVersion: v1
kind: ConfigMap
metadata:
  name: coredns-config
  namespace: coredns-system
data:
  Corefile: |
    .:53 {
        bind 0.0.0.0
        kubernetes cluster.local in-addr.arpa ip6.arpa {
            pods insecure
            fallthrough in-addr.arpa ip6.arpa
            ttl 30
        }
        hosts {
            ttl 60
            reload 1m
            fallthrough
        }
        forward . 8.8.8.8 8.8.4.4 {
            max_fails 3
            expire 10s
            health_check 5s
            policy sequential
        }
        cache {
            success 65536 3600 300
            denial 8192 600 60
            prefetch 1 60m 10%
        }
        reload 6s
        log
        errors
        health :8080
        ready :8181
        prometheus :9153
    }
---
# coredns-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: coredns
namespace: coredns-system
labels:
app: coredns
spec:
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
maxSurge: 1
selector:
matchLabels:
app: coredns
template:
metadata:
labels:
app: coredns
spec:
serviceAccountName: coredns
containers:
- name: coredns
image: coredns/coredns:1.11.1
imagePullPolicy: IfNotPresent
args: ["-conf", "/etc/coredns/Corefile"]
ports:
- containerPort: 53
name: dns
protocol: UDP
- containerPort: 53
name: dns-tcp
protocol: TCP
- containerPort: 8080
name: health
protocol: TCP
- containerPort: 9153
name: metrics
protocol: TCP
livenessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 60
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 5
readinessProbe:
httpGet:
path: /ready
port: 8181
initialDelaySeconds: 10
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 3
resources:
limits:
memory: 170Mi
cpu: 100m
requests:
memory: 70Mi
cpu: 50m
volumeMounts:
- name: config-volume
mountPath: /etc/coredns
readOnly: true
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- NET_BIND_SERVICE
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
volumes:
- name: config-volume
configMap:
name: coredns-config
items:
- key: Corefile
path: Corefile
dnsPolicy: Default
---
# coredns-service.yaml
apiVersion: v1
kind: Service
metadata:
name: coredns
namespace: coredns-system
labels:
app: coredns
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9153"
spec:
type: ClusterIP
clusterIP: 10.96.0.10 # 固定 ClusterIP
ports:
- name: dns
port: 53
targetPort: 53
protocol: UDP
- name: dns-tcp
port: 53
targetPort: 53
protocol: TCP
- name: metrics
port: 9153
targetPort: 9153
protocol: TCP
selector:
app: coredns
---
# coredns-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: coredns
namespace: coredns-system
---
# coredns-clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: coredns
rules:
- apiGroups:
- ""
resources:
- endpoints
- services
- pods
- namespaces
verbs:
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
---
# coredns-clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: coredns
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: coredns
subjects:
- kind: ServiceAccount
name: coredns
namespace: coredns-system
高可用配置#
主从架构部署#
主服务器配置 (192.168.1.10)#
# Primary server Corefile.
# Outgoing zone transfers are configured with the standalone `transfer`
# plugin, which names the zone explicitly. The `file` plugin no longer
# accepts a nested `transfer to ...` property, so that block was removed.
cat > /etc/coredns/Corefile << 'EOF'
.:53 {
    bind 0.0.0.0
    # 主区域文件
    file /etc/coredns/zones/local.lan.zone local.lan
    # 区域传输配置(to 后为从服务器 IP)
    transfer local.lan {
        to 192.168.1.11
    }
    hosts {
        ttl 60
        reload 1m
        fallthrough
    }
    forward . 8.8.8.8 8.8.4.4 {
        max_fails 3
        expire 10s
        health_check 5s
        policy sequential
    }
    cache {
        success 65536 3600 300
        denial 8192 600 60
        prefetch 1 60m 10%
    }
    reload 6s
    log
    errors
    health :8080
    prometheus :9153
}
EOF
从服务器配置 (192.168.1.11)#
# Secondary server Corefile.
# The `secondary` plugin only takes `transfer from <primary>`. The former
# `transfer to *` line is not part of its syntax, and it would also have
# offered the zone to any client, so it was removed.
cat > /etc/coredns/Corefile << 'EOF'
.:53 {
    bind 0.0.0.0
    # 从区域配置
    secondary local.lan {
        transfer from 192.168.1.10
    }
    hosts {
        ttl 60
        reload 1m
        fallthrough
    }
    forward . 8.8.8.8 8.8.4.4 {
        max_fails 3
        expire 10s
        health_check 5s
        policy sequential
    }
    cache {
        success 65536 3600 300
        denial 8192 600 60
        prefetch 1 60m 10%
    }
    reload 6s
    log
    errors
    health :8080
    prometheus :9153
}
EOF
负载均衡配置#
HAProxy 配置#
# 安装 HAProxy
sudo apt update && sudo apt install -y haproxy
# Write the HAProxy configuration.
# The backend health check now performs a real HTTP request against the
# CoreDNS health endpoint on port 8080. The previous tcp-check waited for
# the string "OK" without ever sending a request, so no server could pass.
# NOTE(review): this frontend is `mode tcp`, so it balances DNS-over-TCP
# only; UDP queries on port 53 are not handled by this configuration.
sudo tee /etc/haproxy/haproxy.cfg > /dev/null << 'EOF'
global
    daemon
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660 level admin
    stats timeout 30s
    user haproxy
    group haproxy

defaults
    mode tcp
    timeout connect 5000ms
    timeout client 50000ms
    timeout server 50000ms
    option tcplog

# DNS 负载均衡
frontend dns_frontend
    bind *:53
    mode tcp
    default_backend dns_backend

backend dns_backend
    mode tcp
    balance roundrobin
    option httpchk GET /health
    http-check expect status 200
    server dns1 192.168.1.10:53 check port 8080 inter 5s rise 2 fall 3
    server dns2 192.168.1.11:53 check port 8080 inter 5s rise 2 fall 3

# 统计页面
frontend stats
    bind *:8404
    mode http
    stats enable
    stats uri /stats
    stats refresh 30s
    stats admin if TRUE
EOF
# 启动 HAProxy
sudo systemctl enable haproxy
sudo systemctl start haproxy
Keepalived 高可用#
# Keepalived configuration for the MASTER node.
# The tracked script's weight must lower the master's priority (110) below
# the backup's (100) when CoreDNS is unhealthy. The previous weight of -2
# only reached 108, so the virtual IP would never move; -20 gives 90.
sudo tee /etc/keepalived/keepalived.conf > /dev/null << 'EOF'
vrrp_script chk_coredns {
    script "/usr/local/bin/check_coredns.sh"
    interval 2
    weight -20
    fall 3
    rise 2
}
vrrp_instance VI_1 {
    state MASTER
    interface eth0
    virtual_router_id 51
    priority 110
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass coredns123
    }
    virtual_ipaddress {
        192.168.1.100/24
    }
    track_script {
        chk_coredns
    }
}
EOF
# 创建健康检查脚本
sudo tee /usr/local/bin/check_coredns.sh > /dev/null << 'EOF'
#!/bin/bash
curl -f http://localhost:8080/health > /dev/null 2>&1
exit $?
EOF
sudo chmod +x /usr/local/bin/check_coredns.sh
# 从服务器配置(priority 改为 100)
# 启动 Keepalived
sudo systemctl enable keepalived
sudo systemctl start keepalived
监控和告警#
Prometheus 监控配置#
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "coredns_rules.yml"
scrape_configs:
- job_name: 'coredns'
static_configs:
- targets: ['192.168.1.10:9153', '192.168.1.11:9153']
scrape_interval: 15s
metrics_path: /metrics
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
告警规则配置#
# coredns_rules.yml
# Every ratio is aggregated with `sum by (instance)` on both sides. Without
# aggregation the two operands carry different label sets (for example
# `rcode`, or the cache `type` label), so the division either matches each
# series only with itself or matches nothing at all.
groups:
  - name: coredns
    rules:
      - alert: CoreDNSDown
        expr: up{job="coredns"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "CoreDNS instance is down"
          description: "CoreDNS instance {{ $labels.instance }} has been down for more than 5 minutes."
      - alert: CoreDNSHighQueryRate
        expr: sum by (instance) (rate(coredns_dns_requests_total[5m])) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High DNS query rate"
          description: "CoreDNS instance {{ $labels.instance }} is receiving {{ $value }} queries per second."
      # NOTE(review): rcode!="NOERROR" also counts NXDOMAIN answers; narrow
      # the selector (e.g. rcode="SERVFAIL") if this proves too noisy.
      - alert: CoreDNSHighErrorRate
        expr: >-
          sum by (instance) (rate(coredns_dns_responses_total{rcode!="NOERROR"}[5m]))
          /
          sum by (instance) (rate(coredns_dns_responses_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High DNS error rate"
          description: "CoreDNS instance {{ $labels.instance }} has error rate of {{ $value | humanizePercentage }}."
      - alert: CoreDNSCacheHitRateLow
        expr: >-
          sum by (instance) (rate(coredns_cache_hits_total[5m]))
          /
          (
            sum by (instance) (rate(coredns_cache_hits_total[5m]))
            + sum by (instance) (rate(coredns_cache_misses_total[5m]))
          )
          < 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low DNS cache hit rate"
          description: "CoreDNS instance {{ $labels.instance }} has cache hit rate of {{ $value | humanizePercentage }}."
Grafana 仪表板#
{
"dashboard": {
"id": null,
"title": "CoreDNS Dashboard",
"tags": ["coredns", "dns"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "DNS Queries per Second",
"type": "graph",
"targets": [
{
"expr": "rate(coredns_dns_requests_total[5m])",
"legendFormat": "{{ instance }}"
}
]
},
{
"id": 2,
"title": "DNS Response Codes",
"type": "graph",
"targets": [
{
"expr": "rate(coredns_dns_responses_total[5m])",
"legendFormat": "{{ rcode }}"
}
]
},
{
"id": 3,
"title": "Cache Hit Rate",
"type": "singlestat",
"targets": [
{
"expr": "rate(coredns_cache_hits_total[5m]) / (rate(coredns_cache_hits_total[5m]) + rate(coredns_cache_misses_total[5m]))"
}
]
}
]
}
}
管理和维护#
日常管理脚本#
#!/bin/bash
# CoreDNS 管理脚本
COREDNS_CONFIG="/etc/coredns/Corefile"
HOSTS_FILE="/etc/coredns/hosts"
ZONES_DIR="/etc/coredns/zones"
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Logging helpers: print $1 on stdout behind a coloured severity tag built
# from the GREEN/YELLOW/RED/NC globals. Backslash escapes in the whole line
# are expanded.
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Append "<ip> <hostname>" to $HOSTS_FILE and reload CoreDNS.
# Arguments: $1 - IPv4 address, $2 - hostname
# Globals:   HOSTS_FILE (read, appended to)
# Returns:   0 on success; 1 on missing/invalid arguments, when the hostname
#            is already present, or when the file cannot be written.
add_host() {
  local ip="${1:-}"
  local hostname="${2:-}"
  local octet

  if [[ -z "$ip" || -z "$hostname" ]]; then
    log_error "Usage: add_host <ip> <hostname>"
    return 1
  fi

  # Check the dotted-quad shape, then the range of every octet
  # (the shape alone would accept 999.1.1.1).
  if ! [[ "$ip" =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]]; then
    log_error "Invalid IP address format: $ip"
    return 1
  fi
  for octet in "${BASH_REMATCH[@]:1}"; do
    # 10# forces base 10 so that "08" is not read as invalid octal.
    if ((10#$octet > 255)); then
      log_error "Invalid IP address format: $ip"
      return 1
    fi
  done

  # Whitespace or other odd characters would corrupt the hosts file.
  if ! [[ "$hostname" =~ ^[A-Za-z0-9._-]+$ ]]; then
    log_error "Invalid hostname: $hostname"
    return 1
  fi

  # Compare against whole name fields: a plain grep treats "." as a
  # wildcard and also matches substrings (api.local vs myapi.local).
  if [[ -f "$HOSTS_FILE" ]] && awk -v h="$hostname" '
      /^[[:space:]]*#/ { next }
      {
        for (i = 2; i <= NF; i++) {
          if ($i ~ /^#/) break
          if ($i == h) found = 1
        }
      }
      END { exit (found ? 0 : 1) }
    ' "$HOSTS_FILE"; then
    log_warn "Hostname $hostname already exists"
    return 1
  fi

  if ! printf '%s %s\n' "$ip" "$hostname" >> "$HOSTS_FILE"; then
    log_error "Failed to write to $HOSTS_FILE"
    return 1
  fi
  log_info "Added host record: $ip -> $hostname"

  reload_config
}
# Remove every record line of $HOSTS_FILE that lists the given hostname as
# one of its names, then reload CoreDNS.
# Arguments: $1 - hostname to remove
# Globals:   HOSTS_FILE (read, rewritten)
# Returns:   0 when at least one line was removed; 1 on a missing argument,
#            when the hostname is not present, or on I/O failure.
remove_host() {
  local hostname="${1:-}"
  local filtered rc=0

  if [[ -z "$hostname" ]]; then
    log_error "Usage: remove_host <hostname>"
    return 1
  fi
  if [[ ! -f "$HOSTS_FILE" ]]; then
    log_error "Failed to remove host record: $hostname"
    return 1
  fi

  filtered=$(mktemp) || {
    log_error "Failed to remove host record: $hostname"
    return 1
  }

  # Match whole name fields with awk instead of interpolating the hostname
  # into a sed regex: that matched substrings (db.local vs mydb.local),
  # treated "." as a wildcard and broke on "/". Comment lines are kept.
  # Exit status 3 means "nothing matched".
  awk -v h="$hostname" '
    /^[[:space:]]*#/ { print; next }
    {
      hit = 0
      for (i = 2; i <= NF; i++) {
        if ($i ~ /^#/) break
        if ($i == h) { hit = 1; break }
      }
      if (hit) { removed = 1; next }
      print
    }
    END { exit (removed ? 0 : 3) }
  ' "$HOSTS_FILE" > "$filtered" || rc=$?

  if ((rc == 3)); then
    rm -f -- "$filtered"
    log_warn "Hostname $hostname not found"
    return 1
  fi
  # Write back through redirection so the hosts file keeps its owner and mode.
  if ((rc != 0)) || ! cat -- "$filtered" > "$HOSTS_FILE"; then
    rm -f -- "$filtered"
    log_error "Failed to remove host record: $hostname"
    return 1
  fi
  rm -f -- "$filtered"

  log_info "Removed host record: $hostname"
  reload_config
}
# Print the records in $HOSTS_FILE, leaving out lines that start with '#'
# and empty lines. Warns when the file does not exist.
list_hosts() {
  log_info "Current host records:"

  if [[ ! -f "$HOSTS_FILE" ]]; then
    log_warn "Hosts file not found: $HOSTS_FILE"
    return
  fi

  grep -v '^#' "$HOSTS_FILE" | grep -v '^$'
}
# Ask systemd to reload the coredns unit when it is active; only warn when
# the service is not running.
# Returns: 1 when `systemctl reload` fails.
reload_config() {
  if ! systemctl is-active --quiet coredns; then
    log_warn "CoreDNS service is not running"
    return
  fi

  if ! systemctl reload coredns; then
    log_error "Failed to reload CoreDNS configuration"
    return 1
  fi

  log_info "CoreDNS configuration reloaded"
}
# Show the systemd unit status, probe the HTTP health endpoint on port 8080
# and run a test query against the local resolver.
check_status() {
  log_info "CoreDNS Service Status:"
  systemctl status coredns --no-pager

  log_info "Health Check:"
  if curl -f http://localhost:8080/health >/dev/null 2>&1; then
    log_info "Health check: PASSED"
  else
    log_error "Health check: FAILED"
  fi

  log_info "DNS Query Test:"
  # dig exits 0 whenever it receives any reply, including SERVFAIL or
  # REFUSED, so inspect the response status instead of the exit code.
  # +time/+tries bound the wait when the server does not answer.
  if dig @localhost google.com +time=3 +tries=1 2>/dev/null |
    grep -q 'status: NOERROR'; then
    log_info "DNS query test: PASSED"
  else
    log_error "DNS query test: FAILED"
  fi
}
# Print the most recent journal entries of the coredns unit.
# Arguments: $1 - number of lines to show (50 when unset or empty)
view_logs() {
  local count
  if [[ -n "${1:-}" ]]; then
    count=$1
  else
    count=50
  fi

  log_info "CoreDNS Logs (last $count lines):"
  journalctl -u coredns -n "$count" --no-pager
}
# Archive /etc/coredns into a timestamped tarball under /var/backups/coredns
# and log where it was written.
backup_config() {
  local backup_dir="/var/backups/coredns"
  local timestamp archive

  timestamp=$(date +%Y%m%d_%H%M%S)
  archive="$backup_dir/coredns_config_$timestamp.tar.gz"

  mkdir -p "$backup_dir"
  # -C /etc keeps the archive paths relative ("coredns/...").
  tar -czf "$archive" -C /etc coredns/
  log_info "Configuration backed up to: $archive"
}
# Fetch the Prometheus metrics from localhost:9153 and print the request,
# response and cache-hit counters. Warns when curl is not installed.
show_stats() {
  log_info "CoreDNS Performance Statistics:"

  if ! command -v curl >/dev/null 2>&1; then
    log_warn "curl not found, cannot fetch metrics"
    return
  fi

  echo "Prometheus Metrics:"
  curl -s http://localhost:9153/metrics |
    grep -E "(coredns_dns_requests_total|coredns_dns_responses_total|coredns_cache_hits_total)"
}
# Command dispatcher for the management script: maps the first argument to
# the matching helper, or prints the usage text and exits 1 for anything
# else (including no argument).
main() {
  local command="${1:-}"

  case "$command" in
    add-host) add_host "$2" "$3" ;;
    remove-host) remove_host "$2" ;;
    list-hosts) list_hosts ;;
    reload) reload_config ;;
    status) check_status ;;
    logs) view_logs "$2" ;;
    backup) backup_config ;;
    stats) show_stats ;;
    *)
      printf '%s\n' \
        "Usage: $0 {add-host|remove-host|list-hosts|reload|status|logs|backup|stats}" \
        "" \
        "Commands:" \
        " add-host <ip> <hostname> - Add a host record" \
        " remove-host <hostname> - Remove a host record" \
        " list-hosts - List all host records" \
        " reload - Reload CoreDNS configuration" \
        " status - Check CoreDNS status" \
        " logs [lines] - View CoreDNS logs" \
        " backup - Backup configuration" \
        " stats - Show performance statistics"
      exit 1
      ;;
  esac
}
main "$@"
自动化运维脚本#
#!/bin/bash
# CoreDNS 自动化运维脚本
# Probe the CoreDNS health endpoint; on failure restart the service, up to
# max_retries times. The endpoint is probed again after the last restart,
# so a recovery achieved by that restart is recognised instead of being
# reported as a failure.
# Returns: 0 as soon as a probe succeeds; 1 (after calling send_alert) when
#          the service is still unhealthy after all restarts.
health_check_and_recovery() {
  local max_retries=3
  local attempt

  for ((attempt = 0; attempt <= max_retries; attempt++)); do
    if curl -f http://localhost:8080/health >/dev/null 2>&1; then
      log_info "Health check passed"
      return 0
    fi
    # The final iteration only verifies the last restart.
    if ((attempt == max_retries)); then
      break
    fi
    log_warn "Health check failed, attempt $((attempt + 1))/$max_retries"
    systemctl restart coredns
    sleep 10
  done

  log_error "Health check failed after $max_retries attempts"
  send_alert "CoreDNS health check failed after $max_retries attempts"
  return 1
}
# Sum the CPU and memory percentages of all CoreDNS processes, log them and
# raise an alert when either exceeds 80.
# Returns: 1 when no CoreDNS process is running.
performance_monitor() {
  local pids cpu_usage memory_usage

  # -x matches the exact process name, so this script (or any other process
  # with "coredns" in its name) is not counted. -d, joins several PIDs into
  # the comma-separated list that `ps -p` expects.
  pids=$(pgrep -d, -x coredns) || pids=""
  if [[ -z "$pids" ]]; then
    log_warn "CoreDNS process not found, skipping performance check"
    return 1
  fi

  # The trailing "=" in the column spec suppresses the header line.
  cpu_usage=$(ps -o %cpu= -p "$pids" | awk '{sum += $1} END {print sum + 0}')
  memory_usage=$(ps -o %mem= -p "$pids" | awk '{sum += $1} END {print sum + 0}')
  log_info "CoreDNS Performance: CPU: ${cpu_usage}%, Memory: ${memory_usage}%"

  # awk performs the floating-point comparison, so bc is not required.
  if awk -v v="$cpu_usage" 'BEGIN { exit !(v > 80) }'; then
    log_warn "High CPU usage detected: ${cpu_usage}%"
    send_alert "CoreDNS high CPU usage: ${cpu_usage}%"
  fi
  if awk -v v="$memory_usage" 'BEGIN { exit !(v > 80) }'; then
    log_warn "High memory usage detected: ${memory_usage}%"
    send_alert "CoreDNS high memory usage: ${memory_usage}%"
  fi
}
# Send an alert message to the configured webhook (if any) and always record
# it in the system log.
# Arguments: $1 - alert text
# Globals:   COREDNS_WEBHOOK_URL (read, optional) - webhook endpoint. When it
#            is unset the placeholder remains and no HTTP request is made.
send_alert() {
  local message="$1"
  local webhook_url="${COREDNS_WEBHOOK_URL:-YOUR_WEBHOOK_URL}"
  local escaped

  if [[ -n "$webhook_url" && "$webhook_url" != "YOUR_WEBHOOK_URL" ]]; then
    # Escape backslashes, double quotes and newlines so the message cannot
    # break out of the JSON string.
    escaped=${message//\\/\\\\}
    escaped=${escaped//\"/\\\"}
    escaped=${escaped//$'\n'/\\n}
    curl -fsS --max-time 10 -X POST -H 'Content-type: application/json' \
      --data "{\"text\":\"🚨 CoreDNS Alert: ${escaped}\"}" \
      "$webhook_url" >/dev/null ||
      logger -t coredns-monitor "failed to deliver webhook alert"
  fi

  # Record the alert in the system log.
  logger -t coredns-monitor "$message"
}
# Install the monitoring cron entries for the current user. Entries that
# already exist are left alone, so the function can be run repeatedly
# without duplicating lines.
# NOTE(review): the entries call `coredns-manage.sh health-check` and
# `performance`; confirm that the installed management script implements
# those sub-commands.
setup_cron_jobs() {
  local -a jobs=(
    "*/5 * * * * /usr/local/bin/coredns-manage.sh health-check"
    "0 2 * * * /usr/local/bin/coredns-manage.sh backup"
    "*/10 * * * * /usr/local/bin/coredns-manage.sh performance"
  )
  local current job

  # An absent crontab is not an error; start from an empty table.
  current=$(crontab -l 2>/dev/null || true)

  for job in "${jobs[@]}"; do
    # -x -F: whole-line, literal comparison ("*" must not act as a pattern).
    if ! grep -qxF -- "$job" <<< "$current"; then
      current+="${current:+$'\n'}${job}"
    fi
  done

  printf '%s\n' "$current" | crontab -
  log_info "Cron jobs configured for automated monitoring"
}
测试和验证#
功能测试#
#!/bin/bash
# CoreDNS 功能测试脚本
# 测试配置
DNS_SERVER="192.168.1.10"
TEST_DOMAINS=("google.com" "github.com" "local.lan")
INTERNAL_DOMAINS=("server1.local.lan" "api.internal.local")
# Query every name in TEST_DOMAINS against DNS_SERVER.
# A name passes when the response status is NOERROR. dig's exit code is not
# used because dig exits 0 for any reply, including NXDOMAIN and SERVFAIL.
# Every name is checked before the result is returned.
# Returns: 1 when at least one name failed, 0 otherwise.
test_dns_resolution() {
  local domain failed=0

  log_info "Testing DNS resolution..."
  for domain in "${TEST_DOMAINS[@]}"; do
    if dig @"$DNS_SERVER" "$domain" +time=3 +tries=1 2>/dev/null |
      grep -q 'status: NOERROR'; then
      log_info "✓ DNS resolution test passed for $domain"
    else
      log_error "✗ DNS resolution test failed for $domain"
      failed=1
    fi
  done

  return "$failed"
}
# Query every name in INTERNAL_DOMAINS against DNS_SERVER. Failures only
# produce a warning because these names may not exist in every setup.
# The response status is inspected rather than dig's exit code, which is 0
# for any reply.
test_internal_domains() {
  local domain

  log_info "Testing internal domain resolution..."
  for domain in "${INTERNAL_DOMAINS[@]}"; do
    if dig @"$DNS_SERVER" "$domain" +time=3 +tries=1 2>/dev/null |
      grep -q 'status: NOERROR'; then
      log_info "✓ Internal domain test passed for $domain"
    else
      log_warn "⚠ Internal domain test failed for $domain (may be expected)"
    fi
  done
}
# Run a short dnsperf load against DNS_SERVER when dnsperf is installed.
# The query list is written to a mktemp file instead of a fixed, predictable
# path in /tmp, and it is removed even when dnsperf fails.
# Returns: dnsperf's exit status; 0 when dnsperf is not available.
test_performance() {
  local query_file rc=0

  log_info "Running performance test..."
  if ! command -v dnsperf >/dev/null 2>&1; then
    log_warn "dnsperf not available, skipping performance test"
    return 0
  fi

  query_file=$(mktemp) || {
    log_error "Failed to create temporary query file"
    return 1
  }
  printf '%s\n' "google.com A" "github.com A" "stackoverflow.com A" > "$query_file"

  dnsperf -s "$DNS_SERVER" -d "$query_file" -l 10 -c 10 || rc=$?
  rm -f -- "$query_file"
  return "$rc"
}
# Issue the same query twice and compare dig's reported query times; the
# second one is expected to be served from cache.
# Returns: 1 when dig reports no numeric query time (for example when the
#          server is unreachable); 0 otherwise. An equal or slower second
#          query is reported as inconclusive, not as a failure.
test_cache() {
  local time1 time2

  log_info "Testing DNS cache..."
  # First query.
  time1=$(dig @"$DNS_SERVER" google.com 2>/dev/null | awk '/Query time/ {print $4; exit}')
  # Second query (should be answered from cache).
  time2=$(dig @"$DNS_SERVER" google.com 2>/dev/null | awk '/Query time/ {print $4; exit}')

  # Without this guard an unreachable server yields empty values, which the
  # numeric comparison below would treat as 0.
  if ! [[ "$time1" =~ ^[0-9]+$ && "$time2" =~ ^[0-9]+$ ]]; then
    log_error "✗ Cache test failed: no query time returned by $DNS_SERVER"
    return 1
  fi

  log_info "First query: ${time1}ms, Second query: ${time2}ms"
  if ((time2 < time1)); then
    log_info "✓ Cache test passed (second query faster)"
  else
    log_warn "⚠ Cache test inconclusive"
  fi
}
# Fire 50 queries at DNS_SERVER in parallel and wait for all of them.
# The loop variable is local so it does not leak into the caller, and each
# query is bounded by +time/+tries so that `wait` cannot stall for long
# when the server does not answer.
test_load() {
  local i

  log_info "Running load test..."
  for i in {1..50}; do
    dig @"$DNS_SERVER" "test$i.google.com" +time=2 +tries=1 >/dev/null 2>&1 &
  done
  wait
  log_info "✓ Load test completed (50 concurrent queries)"
}
# Run every test group in order. All groups are executed even when an
# earlier one fails; the failures are counted and reported at the end
# rather than being silently ignored.
# Returns: 0 when every group succeeded, 1 otherwise.
run_all_tests() {
  local group failed=0

  log_info "Starting CoreDNS comprehensive tests..."
  for group in test_dns_resolution test_internal_domains test_cache test_load test_performance; do
    "$group" || failed=$((failed + 1))
  done

  if ((failed > 0)); then
    log_error "$failed test group(s) failed"
    return 1
  fi
  log_info "All tests completed!"
}
# 执行测试
run_all_tests
故障排除指南#
常见问题和解决方案#
# 1. Service fails to start: check the binary, the configuration, listeners
# on port 53 and the readability of the Corefile.
# Returns: 1 when the binary is missing or the configuration check fails.
troubleshoot_startup() {
  local coredns_bin="/usr/local/bin/coredns"
  local corefile="/etc/coredns/Corefile"
  local listeners

  log_info "Troubleshooting startup issues..."

  # Report a missing binary as such rather than as a syntax error.
  if [[ ! -x "$coredns_bin" ]]; then
    log_error "✗ CoreDNS binary not found or not executable: $coredns_bin"
    return 1
  fi

  # NOTE(review): confirm that this CoreDNS build accepts `-validate`; if
  # the flag is unknown the command fails and every configuration is
  # reported as invalid.
  if "$coredns_bin" -conf="$corefile" -validate; then
    log_info "✓ Configuration syntax is valid"
  else
    log_error "✗ Configuration syntax error"
    return 1
  fi

  # Match ":53" followed by whitespace so that ports such as 5353 (mDNS) are
  # not reported. Use ss when netstat is not installed.
  if command -v netstat >/dev/null 2>&1; then
    listeners=$(netstat -tulpn 2>/dev/null | grep -E ':53[[:space:]]' || true)
  else
    listeners=$(ss -tulpn 2>/dev/null | grep -E ':53[[:space:]]' || true)
  fi
  if [[ -n "$listeners" ]]; then
    log_warn "Port 53 is already in use:"
    printf '%s\n' "$listeners"
  fi

  if [[ -r "$corefile" ]]; then
    log_info "✓ Configuration file is readable"
  else
    log_error "✗ Configuration file permission issue"
  fi
}
# 2. DNS resolution fails: query each upstream resolver and then the local
# server. A check passes only when the response status is NOERROR; dig's
# exit code alone is 0 for any reply, including SERVFAIL and REFUSED.
troubleshoot_resolution() {
  local upstream

  log_info "Troubleshooting DNS resolution..."

  # Upstream resolvers.
  for upstream in 8.8.8.8 8.8.4.4 1.1.1.1; do
    if dig @"$upstream" google.com +time=3 +tries=1 2>/dev/null |
      grep -q 'status: NOERROR'; then
      log_info "✓ Upstream DNS $upstream is reachable"
    else
      log_error "✗ Upstream DNS $upstream is unreachable"
    fi
  done

  # Local resolver.
  if dig @127.0.0.1 google.com +time=3 +tries=1 2>/dev/null |
    grep -q 'status: NOERROR'; then
    log_info "✓ Local DNS resolution working"
  else
    log_error "✗ Local DNS resolution failed"
  fi
}
# 3. Performance problems: log system-wide CPU and memory usage, followed by
# the summed usage of all CoreDNS processes.
troubleshoot_performance() {
  local cpu_usage memory_usage coredns_pids coredns_cpu coredns_mem

  log_info "Troubleshooting performance issues..."

  # System resources.
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | awk -F'%' '{print $1}')
  memory_usage=$(free | grep Mem | awk '{printf("%.1f"), $3/$2 * 100.0}')
  log_info "System CPU usage: ${cpu_usage}%"
  log_info "System memory usage: ${memory_usage}%"

  # CoreDNS processes. -x matches the exact process name; -d, builds the
  # comma-separated PID list `ps -p` expects, so several processes are
  # handled instead of producing an invalid multi-line PID argument.
  coredns_pids=$(pgrep -d, -x coredns) || coredns_pids=""
  if [[ -z "$coredns_pids" ]]; then
    log_warn "CoreDNS process not found"
    return 0
  fi
  coredns_cpu=$(ps -o %cpu= -p "$coredns_pids" | awk '{sum += $1} END {print sum + 0}')
  coredns_mem=$(ps -o %mem= -p "$coredns_pids" | awk '{sum += $1} END {print sum + 0}')
  log_info "CoreDNS CPU usage: ${coredns_cpu}%"
  log_info "CoreDNS memory usage: ${coredns_mem}%"
}
# 4. Network connectivity: check that a coredns listener on port 53 shows up
# in netstat and, when ufw is installed, that it has an ALLOW rule for 53.
troubleshoot_network() {
  log_info "Troubleshooting network connectivity..."

  # Listening socket.
  if netstat -tulpn | grep -q ":53.*coredns" 2>/dev/null; then
    log_info "✓ CoreDNS is listening on port 53"
  else
    log_error "✗ CoreDNS is not listening on port 53"
  fi

  # Firewall (only inspected when ufw is available).
  if ! command -v ufw >/dev/null 2>&1; then
    return
  fi
  if ufw status | grep -q "53.*ALLOW" 2>/dev/null; then
    log_info "✓ Firewall allows DNS traffic"
  else
    log_warn "⚠ Firewall may be blocking DNS traffic"
  fi
}
# Run all troubleshooting checks in order, then write the diagnostic report.
# A failing check does not stop the remaining ones.
comprehensive_troubleshoot() {
  local check

  log_info "Running comprehensive troubleshooting..."
  for check in troubleshoot_startup troubleshoot_resolution \
    troubleshoot_performance troubleshoot_network; do
    "$check"
  done

  # Produce the diagnostic report.
  generate_diagnostic_report
}
# Collect system, service, configuration, log, network and process
# information into a report file and log its path.
# The file is created with mktemp (unpredictable name, owner-only
# permissions) because it contains the full configuration. stderr of the
# collected commands is written into the report as well, so a failing probe
# is visible there.
# Returns: 1 when the report file cannot be created.
generate_diagnostic_report() {
  local report_file

  report_file=$(mktemp "${TMPDIR:-/tmp}/coredns_diagnostic_$(date +%Y%m%d_%H%M%S).XXXXXX") || {
    log_error "Failed to create diagnostic report file"
    return 1
  }

  {
    echo "CoreDNS Diagnostic Report"
    echo "========================"
    echo "Generated: $(date)"
    echo ""
    echo "System Information:"
    uname -a
    echo ""
    echo "CoreDNS Version:"
    /usr/local/bin/coredns -version
    echo ""
    echo "Service Status:"
    systemctl status coredns --no-pager
    echo ""
    echo "Configuration:"
    cat /etc/coredns/Corefile
    echo ""
    echo "Recent Logs:"
    journalctl -u coredns -n 50 --no-pager
    echo ""
    echo "Network Status:"
    netstat -tulpn | grep :53
    echo ""
    echo "Process Information:"
    # The bracket keeps this grep from matching its own process entry.
    ps aux | grep '[c]oredns'
    echo ""
  } > "$report_file" 2>&1

  log_info "Diagnostic report generated: $report_file"
}
性能优化#
配置优化#
# 高性能 Corefile 配置
cat > /etc/coredns/Corefile.optimized << 'EOF'
.:53 {
bind 0.0.0.0
# 优化的缓存配置
cache {
success 65536 7200 300 # 增大缓存大小和 TTL
denial 16384 1800 60 # 增大否定缓存
prefetch 2 60m 20% # 增强预取
serve_stale # 提供过期缓存
}
# 优化的转发配置
forward . 8.8.8.8 8.8.4.4 1.1.1.1 1.0.0.1 {
max_fails 2
expire 5s
health_check 3s
policy sequential
prefer_udp
max_concurrent 1000
}
# 负载均衡
loadbalance round_robin
# 减少日志级别
log {
class error
}
# 错误处理优化
errors {
consolidate 5m ".* i/o timeout"
consolidate 30s ".*"
}
# 健康检查
health :8080 {
lameduck 5s
}
# 监控
prometheus :9153
# 自动重载
reload 30s # 减少重载频率
}
EOF
系统优化#
# 系统参数优化
cat > /etc/sysctl.d/99-coredns.conf << 'EOF'
# 网络优化
net.core.rmem_default = 262144
net.core.rmem_max = 16777216
net.core.wmem_default = 262144
net.core.wmem_max = 16777216
net.core.netdev_max_backlog = 5000
net.ipv4.udp_mem = 102400 873800 16777216
net.ipv4.udp_rmem_min = 8192
net.ipv4.udp_wmem_min = 8192
# 文件描述符
fs.file-max = 1048576
# 进程限制
kernel.pid_max = 4194304
EOF
# 应用配置
sysctl -p /etc/sysctl.d/99-coredns.conf
# 用户限制优化
cat > /etc/security/limits.d/coredns.conf << 'EOF'
coredns soft nofile 1048576
coredns hard nofile 1048576
coredns soft nproc 1048576
coredns hard nproc 1048576
EOF
总结#
部署方式对比#
| 部署方式 | 优势 | 劣势 | 适用场景 |
|---|---|---|---|
| 二进制部署 | 性能最优、资源占用少、配置灵活 | 管理复杂、更新麻烦 | 生产环境、高性能要求 |
| Docker 部署 | 部署简单、环境隔离、易于管理 | 性能略低、资源开销 | 开发测试、快速部署 |
| Kubernetes 部署 | 高可用、自动扩缩容、服务发现 | 复杂度高、资源要求高 | 云原生环境、大规模集群 |
最佳实践总结#
配置管理
- 使用版本控制管理配置文件
- 实施配置验证和测试
- 建立配置变更流程
监控告警
- 部署 Prometheus + Grafana 监控
- 配置关键指标告警
- 建立故障响应流程
高可用设计
- 部署多实例负载均衡
- 实施健康检查和自动故障转移
- 定期备份配置和数据
性能优化
- 合理配置缓存策略
- 优化系统参数
- 监控性能指标
安全加固
- 实施访问控制
- 定期更新版本
- 监控安全事件
进阶学习资源#
- 官方文档: CoreDNS Documentation
- 插件开发: CoreDNS Plugin Development
- 社区资源: CoreDNS GitHub
- 最佳实践: CNCF CoreDNS Best Practices
通过本指南的学习和实践,您将能够成功部署和管理企业级的 CoreDNS 服务,为您的基础设施提供稳定、高效的 DNS 解析服务。
