DNS is the foundation everything else depends on. A misconfigured record can take down your entire infrastructure. Yet DNS is often managed through web consoles with no version control, no review process, and no automation. Let’s fix that.

Terraform for DNS

Route53 Basics

# dns.tf
# Hosted zone and the common record types for example.com, all managed
# through Terraform so every change is reviewed and version-controlled.
resource "aws_route53_zone" "main" {
  name = "example.com"
  
  tags = {
    Environment = "production"
  }
}

# A record: www resolves to a fixed IPv4 address; TTL is 5 minutes.
resource "aws_route53_record" "www" {
  zone_id = aws_route53_zone.main.zone_id
  name    = "www.example.com"
  type    = "A"
  ttl     = 300
  records = ["203.0.113.10"]
}

# CNAME record: app is an alias for the load balancer's DNS name.
resource "aws_route53_record" "app" {
  zone_id = aws_route53_zone.main.zone_id
  name    = "app.example.com"
  type    = "CNAME"
  ttl     = 300
  records = ["app-lb-123456.us-east-1.elb.amazonaws.com"]
}

# Alias to ALB (no TTL, resolved at edge). With evaluate_target_health,
# Route53 takes the target's health into account when answering.
resource "aws_route53_record" "api" {
  zone_id = aws_route53_zone.main.zone_id
  name    = "api.example.com"
  type    = "A"
  
  alias {
    name                   = aws_lb.api.dns_name
    zone_id                = aws_lb.api.zone_id
    evaluate_target_health = true
  }
}

# MX records: each value is "<priority> <mail host>"; lower priority wins.
resource "aws_route53_record" "mx" {
  zone_id = aws_route53_zone.main.zone_id
  name    = "example.com"
  type    = "MX"
  ttl     = 3600
  records = [
    "10 mail1.example.com",
    "20 mail2.example.com"
  ]
}

# TXT for SPF: authorizes Google's senders for this domain; ~all
# soft-fails everything else.
resource "aws_route53_record" "spf" {
  zone_id = aws_route53_zone.main.zone_id
  name    = "example.com"
  type    = "TXT"
  ttl     = 3600
  records = ["v=spf1 include:_spf.google.com ~all"]
}

Dynamic Records from Infrastructure

# Generate records from other resources
# Map of subdomain => DNS target; adding an entry here creates a record.
locals {
  services = {
    "api"     = aws_lb.api.dns_name
    "admin"   = aws_lb.admin.dns_name
    "docs"    = aws_cloudfront_distribution.docs.domain_name
  }
}

# One CNAME per entry in local.services (api/admin/docs.example.com).
resource "aws_route53_record" "services" {
  for_each = local.services
  
  zone_id = aws_route53_zone.main.zone_id
  name    = "${each.key}.example.com"
  type    = "CNAME"
  ttl     = 300
  records = [each.value]
}

# From Kubernetes ingresses: read the live Ingress objects so their
# load-balancer hostnames can be published in Route53.
data "kubernetes_ingress_v1" "all" {
  for_each = toset(["api", "web", "admin"])
  
  metadata {
    name      = each.key
    namespace = "production"
  }
}

resource "aws_route53_record" "k8s_services" {
  for_each = data.kubernetes_ingress_v1.all
  
  zone_id = aws_route53_zone.main.zone_id
  name    = "${each.key}.example.com"
  type    = "CNAME"
  ttl     = 300
  # NOTE(review): assumes each ingress already has a provisioned load
  # balancer — the status lists are empty until then; confirm apply order.
  records = [each.value.status[0].load_balancer[0].ingress[0].hostname]
}
}

DNS Failover

Health Check Based Routing

# Health check for primary: probe /health over HTTPS every 30 seconds;
# three consecutive failures mark the endpoint unhealthy.
resource "aws_route53_health_check" "primary" {
  fqdn              = "primary-api.example.com"
  port              = 443
  type              = "HTTPS"
  resource_path     = "/health"
  failure_threshold = 3
  request_interval  = 30
  
  tags = {
    Name = "primary-api-health"
  }
}

# Health check for secondary (same probe settings as the primary).
resource "aws_route53_health_check" "secondary" {
  fqdn              = "secondary-api.example.com"
  port              = 443
  type              = "HTTPS"
  resource_path     = "/health"
  failure_threshold = 3
  request_interval  = 30
  
  tags = {
    Name = "secondary-api-health"
  }
}

# Primary record with failover. The low TTL (60s) keeps resolver caches
# short so a failover takes effect quickly.
resource "aws_route53_record" "api_primary" {
  zone_id         = aws_route53_zone.main.zone_id
  name            = "api.example.com"
  type            = "A"
  ttl             = 60
  records         = ["203.0.113.10"]
  set_identifier  = "primary"
  health_check_id = aws_route53_health_check.primary.id
  
  failover_routing_policy {
    type = "PRIMARY"
  }
}

# Secondary record: answered only while the primary's health check fails.
resource "aws_route53_record" "api_secondary" {
  zone_id        = aws_route53_zone.main.zone_id
  name           = "api.example.com"
  type           = "A"
  ttl            = 60
  records        = ["203.0.113.20"]
  set_identifier = "secondary"
  health_check_id = aws_route53_health_check.secondary.id
  
  failover_routing_policy {
    type = "SECONDARY"
  }
}
}

Weighted Routing for Gradual Migration

# 90% to current, 10% to new. Traffic splits in proportion to
# weight / (sum of weights); shift the weights over time to migrate.
# TTL 60 keeps caches short so weight changes show up quickly.
resource "aws_route53_record" "api_current" {
  zone_id        = aws_route53_zone.main.zone_id
  name           = "api.example.com"
  type           = "A"
  ttl            = 60
  records        = ["203.0.113.10"]
  set_identifier = "current"
  
  weighted_routing_policy {
    weight = 90
  }
}

# The new target; raise this weight (and lower "current") to cut over.
resource "aws_route53_record" "api_new" {
  zone_id        = aws_route53_zone.main.zone_id
  name           = "api.example.com"
  type           = "A"
  ttl            = 60
  records        = ["203.0.113.20"]
  set_identifier = "new"
  
  weighted_routing_policy {
    weight = 10
  }
}
}

External DNS for Kubernetes

Automatically create DNS records from Kubernetes resources.

# external-dns deployment
# Watches Services and Ingresses and creates matching Route53 records.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: external-dns
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: external-dns
  template:
    metadata:
      # Fix: pod template labels must satisfy spec.selector.matchLabels,
      # otherwise the API server rejects the Deployment.
      labels:
        app: external-dns
    spec:
      serviceAccountName: external-dns
      containers:
      - name: external-dns
        image: registry.k8s.io/external-dns/external-dns:v0.14.0
        args:
        - --source=service            # watch Services
        - --source=ingress            # watch Ingresses
        - --provider=aws              # manage records in Route53
        - --aws-zone-type=public
        - --registry=txt              # track record ownership via TXT
        - --txt-owner-id=my-cluster
        - --domain-filter=example.com # only touch this zone
        env:
        - name: AWS_DEFAULT_REGION
          value: us-east-1

---
# Service with DNS annotation
# external-dns reads the hostname/ttl annotations and creates the record
# pointing at the LoadBalancer endpoint.
apiVersion: v1
kind: Service
metadata:
  name: api
  annotations:
    external-dns.alpha.kubernetes.io/hostname: api.example.com
    external-dns.alpha.kubernetes.io/ttl: "300"
spec:
  type: LoadBalancer
  ports:
  - port: 443
  selector:
    app: api

---
# Ingress with automatic DNS
# Alternative to the Service annotation above: external-dns also reads
# Ingress objects (the hostname annotation is optional when spec.rules
# already carries the host).
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: api
  annotations:
    external-dns.alpha.kubernetes.io/hostname: api.example.com
spec:
  rules:
  - host: api.example.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: api
            port:
              # Fix: the api Service above only exposes port 443; the
              # original pointed at a nonexistent port 80.
              number: 443

DNS Monitoring

Resolution Checks

# dns_monitor.py
import dns.resolver
from typing import List, Dict
import time

def check_dns_resolution(hostname: str, expected_ips: List[str] = None,
                         nameservers: List[str] = None) -> Dict:
    """Resolve `hostname`'s A records and describe the outcome.

    Optionally compares the answer against `expected_ips` and queries the
    given `nameservers` instead of the system default.  Returns a dict
    with either the resolved addresses/TTL/latency or an error label.
    """
    resolver = dns.resolver.Resolver()
    if nameservers:
        resolver.nameservers = nameservers

    started = time.time()
    try:
        answers = resolver.resolve(hostname, 'A')
    except dns.resolver.NXDOMAIN:
        return {'hostname': hostname, 'resolved': False, 'error': 'NXDOMAIN'}
    except dns.resolver.NoAnswer:
        return {'hostname': hostname, 'resolved': False, 'error': 'NoAnswer'}
    except Exception as e:
        # Timeouts and transport failures end up here as their str() form.
        return {'hostname': hostname, 'resolved': False, 'error': str(e)}

    elapsed = time.time() - started
    addresses = [str(rdata) for rdata in answers]

    report = {
        'hostname': hostname,
        'resolved': True,
        'ips': addresses,
        'ttl': answers.ttl,
        'duration_ms': round(elapsed * 1000, 2),
        'nameserver': resolver.nameservers[0] if resolver.nameservers else 'default'
    }
    if expected_ips:
        report['expected_ips'] = expected_ips
        # Order-insensitive comparison: DNS rotates answer order.
        report['matches'] = set(addresses) == set(expected_ips)
    return report


def check_propagation(hostname: str, expected_ip: str) -> Dict:
    """Query several public resolvers and report what fraction already
    return `expected_ip` for `hostname`."""
    public_dns = {
        'google_1': '8.8.8.8',
        'google_2': '8.8.4.4',
        'cloudflare_1': '1.1.1.1',
        'cloudflare_2': '1.0.0.1',
        'quad9': '9.9.9.9',
        'opendns': '208.67.222.222'
    }

    # One resolution check per public resolver, keyed by provider name.
    results = {
        provider: check_dns_resolution(hostname, [expected_ip], [server])
        for provider, server in public_dns.items()
    }
    propagated = sum(1 for outcome in results.values() if outcome.get('matches'))

    return {
        'hostname': hostname,
        'expected_ip': expected_ip,
        'propagation_percentage': (propagated / len(public_dns)) * 100,
        'results': results
    }


# Usage
# Prints the share of public resolvers already serving the new IP.
result = check_propagation('api.example.com', '203.0.113.10')
print(f"Propagation: {result['propagation_percentage']}%")

Prometheus Metrics

# dns_exporter.py
import time
from typing import List

from prometheus_client import Gauge, Histogram, start_http_server

# Per-hostname/per-nameserver success flag: 1 = resolved, 0 = failed.
dns_resolution_success = Gauge(
    'dns_resolution_success',
    'DNS resolution successful (1) or failed (0)',
    ['hostname', 'nameserver']
)

# Resolution latency in seconds; buckets span 1 ms to 1 s.
dns_resolution_duration = Histogram(
    'dns_resolution_duration_seconds',
    'DNS resolution time',
    ['hostname'],
    buckets=[.001, .005, .01, .025, .05, .1, .25, .5, 1]
)

# Last observed TTL of the record — useful for spotting unexpected changes.
dns_ttl = Gauge(
    'dns_record_ttl_seconds',
    'TTL of DNS record',
    ['hostname']
)

def monitor_dns(hostnames: List[str]):
    """Poll DNS for each hostname every 30 s and export Prometheus metrics.

    Runs forever; relies on check_dns_resolution() (from dns_monitor.py)
    being importable in this process.
    """
    while True:
        for host in hostnames:
            outcome = check_dns_resolution(host)

            if not outcome['resolved']:
                # Failure happened before we learned which server answered.
                dns_resolution_success.labels(
                    hostname=host,
                    nameserver='unknown'
                ).set(0)
                continue

            dns_resolution_success.labels(
                hostname=host,
                nameserver=outcome['nameserver']
            ).set(1)
            # duration_ms is milliseconds; the histogram buckets are seconds.
            dns_resolution_duration.labels(hostname=host).observe(
                outcome['duration_ms'] / 1000
            )
            dns_ttl.labels(hostname=host).set(outcome['ttl'])

        time.sleep(30)

# Run exporter
# Serves /metrics on :9118, then blocks forever in the monitoring loop.
start_http_server(9118)
monitor_dns(['api.example.com', 'www.example.com'])

Safe DNS Changes

Pre-Change Validation

# dns_validator.py
import dns.resolver
from typing import List

def validate_dns_change(hostname: str, new_target: str,
                        record_type: str = 'A') -> dict:
    """Run pre-flight checks for a DNS change before it is applied.

    Checks, in order: (1) for A records, that the new target answers TCP
    on port 80; (2) which existing records of this type would be replaced;
    (3) that no CNAME already exists at the name (a CNAME excludes all
    other record types).  Returns a dict with 'safe_to_apply' and the
    individual check results.

    Fixes vs. the previous version: the return annotation used `Dict`,
    which this module never imports (NameError at import time) — the
    builtin `dict` is equivalent; the probe socket is now closed instead
    of leaked; NoAnswer is treated like NXDOMAIN in check 2; and the bare
    `except:` clauses no longer swallow KeyboardInterrupt/SystemExit.
    """
    checks = []

    # 1. Check new target is reachable
    if record_type == 'A':
        # Verify IP is routable; close the probe connection when done.
        import socket
        try:
            socket.create_connection((new_target, 80), timeout=5).close()
            checks.append({'check': 'target_reachable', 'passed': True})
        except OSError:
            # create_connection failures (refused, timeout, unroutable)
            # are all OSError subclasses.
            checks.append({'check': 'target_reachable', 'passed': False,
                          'error': f'Cannot reach {new_target}:80'})

    # 2. Check for conflicting records
    try:
        existing = dns.resolver.resolve(hostname, record_type)
        checks.append({
            'check': 'existing_records',
            'current_values': [str(r) for r in existing],
            'note': 'Will be replaced'
        })
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
        # Name doesn't exist, or exists with no record of this type.
        checks.append({'check': 'existing_records', 'note': 'New record'})

    # 3. Check CNAME conflicts
    if record_type != 'CNAME':
        try:
            dns.resolver.resolve(hostname, 'CNAME')
            checks.append({
                'check': 'cname_conflict',
                'passed': False,
                'error': 'CNAME exists, cannot add other records'
            })
        except Exception:
            # Any resolution failure means no conflicting CNAME was found.
            checks.append({'check': 'cname_conflict', 'passed': True})

    all_passed = all(c.get('passed', True) for c in checks)

    return {
        'hostname': hostname,
        'new_target': new_target,
        'record_type': record_type,
        'safe_to_apply': all_passed,
        'checks': checks
    }

Automated Rollback

# dns_rollback.py
import boto3
import json
from datetime import datetime

class DNSRollback:
    """Snapshot and restore Route53 record sets for one hosted zone.

    Snapshots are stored as JSON objects in S3 so a bad DNS change can be
    reverted quickly.
    """

    def __init__(self, zone_id: str):
        self.route53 = boto3.client('route53')
        self.zone_id = zone_id
        self.snapshots_bucket = 'dns-snapshots'
        self.s3 = boto3.client('s3')

    def _list_records(self):
        """Return every record set currently in the zone (paginated)."""
        records = []
        paginator = self.route53.get_paginator('list_resource_record_sets')
        for page in paginator.paginate(HostedZoneId=self.zone_id):
            records.extend(page['ResourceRecordSets'])
        return records

    def snapshot(self) -> str:
        """Take a snapshot of current DNS records; return its id."""
        records = self._list_records()
        snapshot_id = datetime.utcnow().strftime('%Y%m%d-%H%M%S')

        # default=str because some record-set fields aren't JSON-native.
        self.s3.put_object(
            Bucket=self.snapshots_bucket,
            Key=f'{self.zone_id}/{snapshot_id}.json',
            Body=json.dumps(records, default=str)
        )
        return snapshot_id

    def rollback(self, snapshot_id: str):
        """Restore DNS records from a snapshot.

        Deletes records that exist now but not in the snapshot, then
        upserts everything the snapshot contains.
        """
        response = self.s3.get_object(
            Bucket=self.snapshots_bucket,
            Key=f'{self.zone_id}/{snapshot_id}.json'
        )
        target_records = json.loads(response['Body'].read())

        current_records = self._list_records()

        changes = []

        # Delete records not in the snapshot.  Fix: SOA and NS records
        # are skipped — Route53 rejects deleting the zone-apex SOA/NS
        # set, which would fail the entire change batch.  (This also
        # conservatively preserves subdomain-delegation NS records.)
        for record in current_records:
            if record['Type'] in ('SOA', 'NS'):
                continue
            if record not in target_records:
                changes.append({'Action': 'DELETE', 'ResourceRecordSet': record})

        # Upsert records from the snapshot (UPSERT of SOA/NS is allowed).
        for record in target_records:
            changes.append({'Action': 'UPSERT', 'ResourceRecordSet': record})

        # Fix: an empty ChangeBatch is an API error, so bail out early.
        if not changes:
            return

        # Apply changes in batches to stay under Route53 batch limits.
        batch_size = 100
        for i in range(0, len(changes), batch_size):
            self.route53.change_resource_record_sets(
                HostedZoneId=self.zone_id,
                ChangeBatch={'Changes': changes[i:i + batch_size]}
            )


# Before making changes: capture the zone's current state.
rollback = DNSRollback('Z1234567890')
snapshot_id = rollback.snapshot()
print(f"Snapshot saved: {snapshot_id}")

# If something goes wrong, restore the pre-change state:
# rollback.rollback(snapshot_id)

Best Practices

  1. Version control everything — DNS config belongs in git
  2. Use low TTLs for changes — Drop to 60s before migrations
  3. Snapshot before changes — Enable quick rollback
  4. Monitor resolution — Alert on failures or unexpected results
  5. Use health checks — Automatic failover beats manual intervention
  6. Validate before applying — Check targets are reachable
  7. Propagation takes time — Plan for TTL delays

DNS is infrastructure. Treat it with the same rigor as your servers and code.