Infrastructure code deserves the same testing rigor as application code. A typo in Terraform can delete a database. An untested Ansible role can break production. Let’s build confidence with proper testing.

The Testing Pyramid for Infrastructure

            E2E Tests
     (Full stack deployment)
        Integration Tests
     (Real cloud resources)
            Unit Tests
(Static analysis, plan validation)

Unit Testing: Static Analysis

Terraform Validation

1
2
3
4
5
6
7
8
# Built-in validation: initialise the working directory, validate the
# configuration, and check formatting without rewriting any files
terraform init
terraform validate
terraform fmt -check

# Input for custom validation rules: save the plan, then export it as
# plan.json so tests and policies can inspect the planned changes
terraform plan -out=tfplan
terraform show -json tfplan > plan.json
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# tests/test_terraform_plan.py
import json
import pytest

@pytest.fixture
def plan():
    """Load the Terraform plan exported with ``terraform show -json``.

    Returns:
        The parsed JSON document read from ``plan.json`` in the current
        working directory.

    Raises:
        FileNotFoundError: If ``plan.json`` has not been generated.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which differs between platforms.
    with open('plan.json', encoding='utf-8') as f:
        return json.load(f)

def test_no_resources_destroyed(plan):
    """Ensure no resources are being destroyed."""
    changes = plan.get('resource_changes', [])
    destroyed = [c for c in changes if 'delete' in c.get('change', {}).get('actions', [])]
    assert len(destroyed) == 0, f"Resources being destroyed: {[d['address'] for d in destroyed]}"

def test_no_public_s3_buckets(plan):
    """Ensure S3 buckets aren't public."""
    changes = plan.get('resource_changes', [])
    for change in changes:
        if change['type'] == 'aws_s3_bucket':
            after = change.get('change', {}).get('after', {})
            acl = after.get('acl', 'private')
            assert acl == 'private', f"Bucket {change['address']} has public ACL: {acl}"

def test_instances_have_tags(plan):
    """Ensure EC2 instances have required tags."""
    required_tags = {'Environment', 'Owner', 'Project'}
    changes = plan.get('resource_changes', [])
    
    for change in changes:
        if change['type'] == 'aws_instance':
            after = change.get('change', {}).get('after', {})
            tags = set(after.get('tags', {}).keys())
            missing = required_tags - tags
            assert not missing, f"Instance {change['address']} missing tags: {missing}"

Policy as Code with OPA

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# policy/terraform.rego
package terraform

# Deny ingress security group rules that expose SSH (TCP 22) to the whole
# IPv4 internet.
deny[msg] {
    resource := input.resource_changes[_]
    resource.type == "aws_security_group_rule"
    # Only inbound rules expose SSH; outbound rules to 0.0.0.0/0 are not
    # "SSH open to world".
    resource.change.after.type == "ingress"
    resource.change.after.cidr_blocks[_] == "0.0.0.0/0"
    # Match any port range that contains 22, not only rules that start at 22
    # (a 0-65535 rule opens SSH as well).
    resource.change.after.from_port <= 22
    resource.change.after.to_port >= 22
    msg := sprintf("SSH open to world in %v", [resource.address])
}

# Deny any planned RDS instance whose publicly_accessible flag is true.
deny[msg] {
    some i
    db := input.resource_changes[i]
    db.type == "aws_db_instance"
    planned := db.change.after
    planned.publicly_accessible == true
    msg := sprintf("RDS instance %v is publicly accessible", [db.address])
}
1
2
# Run OPA checks. The plan JSON arrives on stdin, which opa reads only when
# --stdin-input is given; -i/--input expects a file path.
terraform show -json tfplan | opa eval --stdin-input -d policy/ "data.terraform.deny"

Integration Testing with Terratest

Terratest deploys real infrastructure, validates it, then tears it down:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// test/vpc_test.go
package test

import (
    "testing"
    "github.com/gruntwork-io/terratest/modules/terraform"
    "github.com/gruntwork-io/terratest/modules/aws"
    "github.com/stretchr/testify/assert"
)

// TestVpcModule applies the VPC module with test inputs, checks the VPC CIDR,
// the number of public subnets and the internet gateway, and destroys the
// deployment when the test returns.
func TestVpcModule(t *testing.T) {
    t.Parallel()

    const (
        awsRegion = "us-east-1"
        cidr      = "10.0.0.0/16"
    )

    opts := &terraform.Options{
        TerraformDir: "../modules/vpc",
        Vars: map[string]interface{}{
            "vpc_cidr":    cidr,
            "environment": "test",
        },
    }

    // Deferred before the apply so the destroy also runs when a later step
    // fails.
    defer terraform.Destroy(t, opts)
    terraform.InitAndApply(t, opts)

    // Read the module outputs the assertions below depend on.
    vpcId := terraform.Output(t, opts, "vpc_id")
    publicSubnets := terraform.OutputList(t, opts, "public_subnet_ids")

    // The VPC must exist with the CIDR that was passed in.
    vpc := aws.GetVpcById(t, vpcId, awsRegion)
    assert.Equal(t, cidr, vpc.CidrBlock)

    // Exactly three public subnets are expected.
    assert.Equal(t, 3, len(publicSubnets))

    // An internet gateway must be attached to the VPC.
    igw := aws.GetInternetGateway(t, vpcId, awsRegion)
    assert.NotNil(t, igw)
}

Testing EC2 Instances

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// TestWebServer applies the webserver module, waits for the instance to serve
// "Hello, World" over HTTP, then runs a command over SSH to check the OS. The
// deployment is destroyed when the test returns.
func TestWebServer(t *testing.T) {
    t.Parallel()

    const (
        maxRetries   = 30
        retryBackoff = 5 * time.Second
    )

    opts := &terraform.Options{
        TerraformDir: "../modules/webserver",
        Vars: map[string]interface{}{
            "instance_type": "t3.micro",
        },
    }

    defer terraform.Destroy(t, opts)
    terraform.InitAndApply(t, opts)

    publicIp := terraform.Output(t, opts, "public_ip")

    // HTTP check: retry until the endpoint answers 200 with the expected body.
    http_helper.HttpGetWithRetry(
        t,
        fmt.Sprintf("http://%s", publicIp),
        nil,
        200,
        "Hello, World",
        maxRetries,
        retryBackoff,
    )

    // SSH check: loadKeyPair is not defined in this snippet; presumably it
    // returns the key pair the instance was launched with (confirm).
    sshHost := ssh.Host{
        Hostname:    publicIp,
        SshUserName: "ec2-user",
        SshKeyPair:  loadKeyPair(t),
    }

    osRelease := ssh.CheckSshCommand(t, sshHost, "cat /etc/os-release")
    assert.Contains(t, osRelease, "Amazon Linux")
}

Python Testing with pytest

For Ansible and general infrastructure validation:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# tests/test_infrastructure.py
import pytest
import boto3
import requests
from fabric import Connection

class TestProductionInfrastructure:
    """Smoke tests that query the live production environment.

    The tests call AWS through boto3 and hit ``api.example.com`` over HTTPS,
    so they need network access and credentials that boto3 can discover.
    """

    @pytest.fixture(scope="class")
    def aws_clients(self):
        """Return boto3 clients for EC2, RDS and S3, created once per class."""
        return {
            'ec2': boto3.client('ec2'),
            'rds': boto3.client('rds'),
            's3': boto3.client('s3'),
        }

    def test_web_servers_healthy(self, aws_clients):
        """Verify at least two web servers are running and pass status checks."""
        ec2 = aws_clients['ec2']
        response = ec2.describe_instances(
            Filters=[
                {'Name': 'tag:Role', 'Values': ['web']},
                {'Name': 'instance-state-name', 'Values': ['running']},
            ]
        )

        instances = [
            instance
            for reservation in response['Reservations']
            for instance in reservation['Instances']
        ]

        assert len(instances) >= 2, "Expected at least 2 web servers"

        for instance in instances:
            instance_id = instance['InstanceId']
            status = ec2.describe_instance_status(InstanceIds=[instance_id])

            # The status call only reports running instances by default, so an
            # instance that stopped since the query above yields an empty
            # list; fail with a clear message rather than an IndexError.
            statuses = status['InstanceStatuses']
            assert statuses, f"No status reported for instance {instance_id}"

            current = statuses[0]
            assert current['InstanceState']['Name'] == 'running'
            # "Healthy" means both EC2 status checks pass, not merely that the
            # instance is in the running state.
            assert current['InstanceStatus']['Status'] == 'ok', (
                f"Instance status check failing for {instance_id}"
            )
            assert current['SystemStatus']['Status'] == 'ok', (
                f"System status check failing for {instance_id}"
            )

    def test_database_accessible(self, aws_clients):
        """Verify the production RDS instance is available and not public."""
        response = aws_clients['rds'].describe_db_instances(
            DBInstanceIdentifier='production-db'
        )

        db = response['DBInstances'][0]
        assert db['DBInstanceStatus'] == 'available'
        assert not db['PubliclyAccessible']

    def test_load_balancer_healthy(self):
        """Verify the public health endpoint returns 200 and reports healthy."""
        response = requests.get(
            'https://api.example.com/health',
            timeout=10
        )
        assert response.status_code == 200
        assert response.json()['status'] == 'healthy'

    def test_ssl_certificate_valid(self):
        """Verify the TLS certificate validates and has more than 30 days left."""
        import socket
        import ssl
        import time

        hostname = 'api.example.com'
        context = ssl.create_default_context()

        # Bound the connect and handshake so an unreachable host fails the
        # test instead of hanging the whole run.
        with socket.create_connection((hostname, 443), timeout=10) as sock:
            with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                cert = ssock.getpeercert()

        # cert_time_to_seconds parses the certificate's GMT timestamp
        # independently of the process locale and returns epoch seconds.
        expiry_seconds = ssl.cert_time_to_seconds(cert['notAfter'])
        days_until_expiry = int((expiry_seconds - time.time()) // 86400)

        assert days_until_expiry > 30, f"SSL cert expires in {days_until_expiry} days"

Ansible Testing with Molecule

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# molecule/default/molecule.yml
# Molecule scenario: where role dependencies come from, what the test
# instance runs on, how it is provisioned, and which tool verifies it.

# Role dependencies are resolved through Ansible Galaxy.
dependency:
  name: galaxy
# Test instances are Docker containers.
driver:
  name: docker
# A single Amazon Linux 2 container named "instance".
platforms:
  - name: instance
    image: amazonlinux:2
    # NOTE(review): presumably privileged so services can be started inside
    # the container - confirm it is actually required.
    privileged: true
# The role is applied with Ansible.
provisioner:
  name: ansible
# The converged instance is checked with testinfra tests.
verifier:
  name: testinfra
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# molecule/default/tests/test_default.py
def test_nginx_installed(host):
    """The nginx package must be installed on the host under test."""
    assert host.package("nginx").is_installed

def test_nginx_running(host):
    """The nginx service must be both running and enabled."""
    service = host.service("nginx")
    # Two separate assertions so a failure shows which property is wrong.
    assert service.is_running
    assert service.is_enabled

def test_nginx_config_valid(host):
    """Running ``nginx -t`` on the host must exit with status 0."""
    result = host.run("nginx -t")
    exit_code = result.rc
    assert exit_code == 0

def test_port_80_listening(host):
    """Something must be listening on TCP port 80 on all IPv4 addresses."""
    assert host.socket("tcp://0.0.0.0:80").is_listening

def test_ssl_cert_exists(host):
    """The server certificate must exist, be owned by root, and have mode 0644."""
    crt_file = host.file("/etc/nginx/ssl/server.crt")
    assert crt_file.exists
    assert crt_file.user == "root"
    assert crt_file.mode == 0o644

CI Pipeline Integration

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# .github/workflows/infrastructure.yml
# Runs on pull requests that touch Terraform or Ansible code:
#   validate          - formatting, terraform validate, OPA policies on the plan
#   unit-tests        - pytest checks against the exported plan JSON
#   integration-tests - Terratest against real AWS (skipped for draft PRs)
name: Infrastructure Tests

on:
  pull_request:
    paths:
      - 'terraform/**'
      - 'ansible/**'

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Terraform Format
        run: terraform fmt -check -recursive

      - name: Terraform Validate
        run: |
          terraform init -backend=false
          terraform validate

      # NOTE(review): `terraform plan` presumably needs provider credentials
      # (and a usable backend if one is configured) - confirm for this repo.
      - name: Run OPA Policies
        run: |
          terraform plan -out=tfplan
          terraform show -json tfplan > plan.json
          # --fail-defined makes opa exit non-zero when the query yields a
          # result. deny[msg] is undefined only when no rule fires, so any
          # policy violation now fails the job instead of just being printed.
          opa eval --fail-defined -i plan.json -d policy/ "data.terraform.deny[msg]"

      # Each job runs on a fresh runner, so hand plan.json to the unit-tests
      # job as an artifact.
      - name: Upload Plan JSON
        uses: actions/upload-artifact@v4
        with:
          name: terraform-plan
          path: plan.json

  unit-tests:
    runs-on: ubuntu-latest
    # The plan tests read the plan.json produced by the validate job.
    needs: validate
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Download Plan JSON
        uses: actions/download-artifact@v4
        with:
          name: terraform-plan

      - name: Run Plan Tests
        run: |
          pip install pytest
          pytest tests/test_terraform_plan.py

  integration-tests:
    runs-on: ubuntu-latest
    # Skip the slow, real-infrastructure tests while the PR is still a draft.
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-go@v4

      - name: Run Terratest
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          cd test
          go test -v -timeout 30m

Best Practices

  1. Test in isolation — Each test should create and destroy its own resources
  2. Use unique names — Append random suffixes to avoid conflicts
  3. Set timeouts — Infrastructure operations are slow; plan accordingly
  4. Clean up on failure — Use defer or try/finally to destroy resources
  5. Test the important paths — Focus on security, connectivity, and data integrity
  6. Run expensive tests on merge — Not every commit needs full integration tests

Infrastructure testing isn’t optional—it’s how you sleep at night knowing your next deploy won’t page you at 3 AM.