Terraform State Management: Keep Your Infrastructure Sane

Terraform state is the source of truth for your infrastructure. Mess it up and you’ll be manually reconciling resources at 2 AM. Here’s how to manage state properly from day one. What Is State? Terraform state maps your configuration to real resources: 1 2 3 4 5 # main.tf resource "aws_instance" "web" { ami = "ami-12345" instance_type = "t3.micro" } 1 2 3 4 5 6 7 8 9 10 11 12 13 14 // terraform.tfstate (simplified) { "resources": [{ "type": "aws_instance", "name": "web", "instances": [{ "attributes": { "id": "i-0abc123def456", "ami": "ami-12345", "instance_type": "t3.micro" } }] }] } Without state, Terraform doesn’t know aws_instance.web corresponds to i-0abc123def456. It would try to create a new instance every time. ...

March 1, 2026 Â· 7 min Â· 1336 words Â· Rob Washington

Terraform Module Patterns for Reusable Infrastructure

Terraform modules turn infrastructure code from scripts into libraries. Here’s how to design them well. Module Structure m └ o ─ d ─ u l v ├ ├ ├ ├ └ e p ─ ─ ─ ─ ─ s c ─ ─ ─ ─ ─ / / m v o v R a a u e E i r t r A n i p s D . a u i M t b t o E f l s n . e . s m s t . d . f t t f f # # # # # R I O P D e n u r o s p t o c o u p v u u t u i m r t d e c v e n e a v r t s r a a i l r t a u e i b e q o l s u n e i s r e m e n t s Basic Module variables.tf 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 variable "name" { description = "Name prefix for resources" type = string } variable "cidr_block" { description = "CIDR block for VPC" type = string default = "10.0.0.0/16" } variable "azs" { description = "Availability zones" type = list(string) } variable "private_subnets" { description = "Private subnet CIDR blocks" type = list(string) default = [] } variable "public_subnets" { description = "Public subnet CIDR blocks" type = list(string) default = [] } variable "tags" { description = "Tags to apply to resources" type = map(string) default = {} } main.tf 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 resource "aws_vpc" "this" { cidr_block = var.cidr_block enable_dns_hostnames = true enable_dns_support = true tags = merge(var.tags, { Name = var.name }) } resource "aws_subnet" "private" { count = length(var.private_subnets) vpc_id = aws_vpc.this.id cidr_block = var.private_subnets[count.index] availability_zone = var.azs[count.index % length(var.azs)] tags = merge(var.tags, { Name = "${var.name}-private-${count.index + 1}" Type = "private" }) } resource "aws_subnet" "public" { count = length(var.public_subnets) vpc_id = aws_vpc.this.id cidr_block = var.public_subnets[count.index] availability_zone = var.azs[count.index % length(var.azs)] map_public_ip_on_launch = true tags = merge(var.tags, { Name = "${var.name}-public-${count.index + 1}" Type = "public" }) } outputs.tf 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 output "vpc_id" { description = "VPC ID" value = aws_vpc.this.id } output "private_subnet_ids" { description = "Private subnet IDs" value = aws_subnet.private[*].id } output "public_subnet_ids" { description = "Public subnet IDs" value = aws_subnet.public[*].id } output "vpc_cidr_block" { description = "VPC CIDR block" value = aws_vpc.this.cidr_block } versions.tf 1 2 3 4 5 6 7 8 9 10 terraform { required_version = ">= 1.0" required_providers { aws = { source = "hashicorp/aws" version = ">= 4.0" } } } Using Modules Local Module 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 module "vpc" { source = "./modules/vpc" name = "production" cidr_block = "10.0.0.0/16" azs = ["us-east-1a", "us-east-1b", "us-east-1c"] private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] tags = { Environment = "production" Project = "myapp" } } # Use outputs resource "aws_instance" "app" { subnet_id = module.vpc.private_subnet_ids[0] # ... } Registry Module 1 2 3 4 5 6 7 8 9 10 11 12 13 14 module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "5.0.0" name = "my-vpc" cidr = "10.0.0.0/16" azs = ["us-east-1a", "us-east-1b"] private_subnets = ["10.0.1.0/24", "10.0.2.0/24"] public_subnets = ["10.0.101.0/24", "10.0.102.0/24"] enable_nat_gateway = true single_nat_gateway = true } Git Module 1 2 3 4 module "vpc" { source = "git::https://github.com/org/terraform-modules.git//vpc?ref=v1.2.0" # ... } Variable Validation 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 variable "environment" { description = "Environment name" type = string validation { condition = contains(["dev", "staging", "production"], var.environment) error_message = "Environment must be dev, staging, or production." } } variable "instance_type" { description = "EC2 instance type" type = string default = "t3.micro" validation { condition = can(regex("^t3\\.", var.instance_type)) error_message = "Instance type must be t3 family." } } variable "cidr_block" { description = "CIDR block" type = string validation { condition = can(cidrhost(var.cidr_block, 0)) error_message = "Must be a valid CIDR block." } } Complex Types 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 variable "services" { description = "Service configurations" type = list(object({ name = string port = number health_path = optional(string, "/health") replicas = optional(number, 1) environment = optional(map(string), {}) })) } # Usage services = [ { name = "api" port = 8080 replicas = 3 environment = { LOG_LEVEL = "info" } }, { name = "worker" port = 9090 } ] Conditional Resources 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 variable "create_nat_gateway" { description = "Create NAT gateway" type = bool default = true } resource "aws_nat_gateway" "this" { count = var.create_nat_gateway ? 1 : 0 allocation_id = aws_eip.nat[0].id subnet_id = aws_subnet.public[0].id } resource "aws_eip" "nat" { count = var.create_nat_gateway ? 1 : 0 domain = "vpc" } Dynamic Blocks 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 variable "ingress_rules" { description = "Ingress rules" type = list(object({ port = number protocol = string cidr_blocks = list(string) })) default = [] } resource "aws_security_group" "this" { name = var.name description = var.description vpc_id = var.vpc_id dynamic "ingress" { for_each = var.ingress_rules content { from_port = ingress.value.port to_port = ingress.value.port protocol = ingress.value.protocol cidr_blocks = ingress.value.cidr_blocks } } egress { from_port = 0 to_port = 0 protocol = "-1" cidr_blocks = ["0.0.0.0/0"] } } Module Composition Root Module 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 # main.tf module "vpc" { source = "./modules/vpc" # ... } module "security_groups" { source = "./modules/security-groups" vpc_id = module.vpc.vpc_id # ... } module "ecs_cluster" { source = "./modules/ecs-cluster" vpc_id = module.vpc.vpc_id private_subnet_ids = module.vpc.private_subnet_ids security_group_ids = [module.security_groups.app_sg_id] # ... } Shared Data 1 2 3 4 5 6 7 8 9 10 11 12 13 14 # data.tf - Shared data sources data "aws_caller_identity" "current" {} data "aws_region" "current" {} locals { account_id = data.aws_caller_identity.current.account_id region = data.aws_region.current.name common_tags = { Environment = var.environment Project = var.project ManagedBy = "terraform" } } For_each vs Count Count (Index-based) 1 2 3 4 5 # Problematic: Adding/removing items shifts indices resource "aws_subnet" "private" { count = length(var.private_subnets) cidr_block = var.private_subnets[count.index] } For_each (Key-based) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 # Better: Keys are stable variable "subnets" { type = map(object({ cidr_block = string az = string })) } resource "aws_subnet" "private" { for_each = var.subnets cidr_block = each.value.cidr_block availability_zone = each.value.az tags = { Name = each.key } } # Usage subnets = { "private-1a" = { cidr_block = "10.0.1.0/24", az = "us-east-1a" } "private-1b" = { cidr_block = "10.0.2.0/24", az = "us-east-1b" } } Sensitive Values 1 2 3 4 5 6 7 8 9 10 11 variable "database_password" { description = "Database password" type = string sensitive = true } output "connection_string" { description = "Database connection string" value = "postgres://user:${var.database_password}@${aws_db_instance.this.endpoint}/db" sensitive = true } Module Testing Terraform Test (1.6+) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 # tests/vpc_test.tftest.hcl run "vpc_creates_successfully" { command = plan variables { name = "test-vpc" cidr_block = "10.0.0.0/16" azs = ["us-east-1a"] } assert { condition = aws_vpc.this.cidr_block == "10.0.0.0/16" error_message = "VPC CIDR block incorrect" } } Run tests: ...

February 28, 2026 Â· 8 min Â· 1602 words Â· Rob Washington

Terraform State Backends: Choosing and Configuring Remote State

Local Terraform state works for learning. Production requires remote state—for team collaboration, state locking, and not losing your infrastructure when your laptop dies. Here’s how to set it up properly. Why Remote State? Local state (terraform.tfstate) has problems: No collaboration - Team members overwrite each other’s changes No locking - Concurrent applies corrupt state No backup - Laptop dies, state is gone, orphaned resources everywhere Secrets in plain text - State contains sensitive data Remote backends solve all of these. ...

February 26, 2026 Â· 7 min Â· 1383 words Â· Rob Washington

Terraform State Management: Patterns for Teams

Terraform state is the source of truth for your infrastructure. Mismanage it, and you’ll have drift, conflicts, and 3 AM incidents. These patterns keep state safe and teams productive. Why State Matters Terraform state maps your configuration to real resources. Without it, Terraform can’t: Know what already exists Calculate what needs to change Detect drift from desired state Default local state (terraform.tfstate) breaks down quickly: Can’t collaborate (who has the latest?) No locking (concurrent runs = corruption) No history (oops, we deleted production) Remote Backend: S3 + DynamoDB The standard pattern for AWS teams: ...

February 25, 2026 Â· 7 min Â· 1487 words Â· Rob Washington

Terraform State Management: Avoiding the Footguns

Terraform state is both essential and dangerous. It’s how Terraform knows what exists, what changed, and what to do. Mismanage it, and you’ll either destroy production or spend hours untangling drift. What State Actually Is State is Terraform’s record of reality. It maps your configuration to real resources: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 { "resources": [ { "type": "aws_instance", "name": "web", "instances": [ { "attributes": { "id": "i-0abc123def456", "ami": "ami-12345678", "instance_type": "t3.medium" } } ] } ] } Without state, Terraform would: ...

February 24, 2026 Â· 7 min Â· 1386 words Â· Rob Washington

Infrastructure as Code: Principles That Actually Matter

Infrastructure as Code (IaC) means your servers, networks, and services are defined in version-controlled files rather than clicked into existence through consoles. The benefits are obvious: reproducibility, auditability, collaboration. But IaC done poorly creates its own problems: state drift, copy-paste sprawl, untestable configurations. The principles matter more than the tools. Declarative Over Imperative Describe what you want, not how to get there: 1 2 3 4 5 6 7 8 9 # Declarative (Terraform) - what resource "aws_instance" "web" { ami = "ami-0c55b159cbfafe1f0" instance_type = "t3.micro" tags = { Name = "web-server" } } 1 2 3 4 5 # Imperative (script) - how aws ec2 run-instances \ --image-id ami-0c55b159cbfafe1f0 \ --instance-type t3.micro \ --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=web-server}]' Declarative code is idempotent — run it ten times, get the same result. Imperative scripts need guards against re-running. ...

February 23, 2026 Â· 6 min Â· 1251 words Â· Rob Washington

Terraform State Management: Avoiding the Footguns

Terraform state is where infrastructure-as-code meets reality. It’s also where most Terraform disasters originate. Here’s how to manage state without losing sleep. The Problem Terraform tracks what it’s created in a state file. This file maps your HCL resources to real infrastructure. Without it, Terraform can’t update or destroy anything — it doesn’t know what exists. The default is a local file called terraform.tfstate. This works fine until: Someone else needs to run Terraform Your laptop dies Two people run apply simultaneously You accidentally commit secrets to Git Rule 1: Remote State from Day One Never use local state for anything beyond experiments: ...

February 22, 2026 Â· 6 min Â· 1210 words Â· Rob Washington

Infrastructure Testing: Validating Your IaC Before Production

You test your application code. Why not your infrastructure code? Infrastructure as Code (IaC) has the same failure modes as any software: bugs, regressions, unintended side effects. Yet most teams treat Terraform and Ansible like configuration files rather than code that deserves tests. Why Infrastructure Testing Matters A Terraform plan looks correct until it: Creates a security group that’s too permissive Deploys to the wrong availability zone Sets instance types that exceed your budget Breaks networking in ways that only manifest at runtime Manual review catches some issues. Automated testing catches more. ...

February 16, 2026 Â· 6 min Â· 1115 words Â· Rob Washington

Terraform State Management: Remote Backends, Locking, and Recovery

Master Terraform state management with remote backends, state locking, workspace strategies, and recovery techniques for when things go wrong.

February 15, 2026 Â· 8 min Â· 1609 words Â· Rob Washington

DNS Management: Infrastructure as Code for Your Domain Records

DNS is the foundation everything else depends on. A misconfigured record can take down your entire infrastructure. Yet DNS is often managed through web consoles with no version control, no review process, and no automation. Let’s fix that. Terraform for DNS Route53 Basics 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 # dns.tf resource "aws_route53_zone" "main" { name = "example.com" tags = { Environment = "production" } } # A record resource "aws_route53_record" "www" { zone_id = aws_route53_zone.main.zone_id name = "www.example.com" type = "A" ttl = 300 records = ["203.0.113.10"] } # CNAME record resource "aws_route53_record" "app" { zone_id = aws_route53_zone.main.zone_id name = "app.example.com" type = "CNAME" ttl = 300 records = ["app-lb-123456.us-east-1.elb.amazonaws.com"] } # Alias to ALB (no TTL, resolved at edge) resource "aws_route53_record" "api" { zone_id = aws_route53_zone.main.zone_id name = "api.example.com" type = "A" alias { name = aws_lb.api.dns_name zone_id = aws_lb.api.zone_id evaluate_target_health = true } } # MX records resource "aws_route53_record" "mx" { zone_id = aws_route53_zone.main.zone_id name = "example.com" type = "MX" ttl = 3600 records = [ "10 mail1.example.com", "20 mail2.example.com" ] } # TXT for SPF resource "aws_route53_record" "spf" { zone_id = aws_route53_zone.main.zone_id name = "example.com" type = "TXT" ttl = 3600 records = ["v=spf1 include:_spf.google.com ~all"] } Dynamic Records from Infrastructure 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 # Generate records from other resources locals { services = { "api" = aws_lb.api.dns_name "admin" = aws_lb.admin.dns_name "docs" = aws_cloudfront_distribution.docs.domain_name } } resource "aws_route53_record" "services" { for_each = local.services zone_id = aws_route53_zone.main.zone_id name = "${each.key}.example.com" type = "CNAME" ttl = 300 records = [each.value] } # From Kubernetes ingresses data "kubernetes_ingress_v1" "all" { for_each = toset(["api", "web", "admin"]) metadata { name = each.key namespace = "production" } } resource "aws_route53_record" "k8s_services" { for_each = data.kubernetes_ingress_v1.all zone_id = aws_route53_zone.main.zone_id name = "${each.key}.example.com" type = "CNAME" ttl = 300 records = [each.value.status[0].load_balancer[0].ingress[0].hostname] } DNS Failover Health Check Based Routing 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 # Health check for primary resource "aws_route53_health_check" "primary" { fqdn = "primary-api.example.com" port = 443 type = "HTTPS" resource_path = "/health" failure_threshold = 3 request_interval = 30 tags = { Name = "primary-api-health" } } # Health check for secondary resource "aws_route53_health_check" "secondary" { fqdn = "secondary-api.example.com" port = 443 type = "HTTPS" resource_path = "/health" failure_threshold = 3 request_interval = 30 tags = { Name = "secondary-api-health" } } # Primary record with failover resource "aws_route53_record" "api_primary" { zone_id = aws_route53_zone.main.zone_id name = "api.example.com" type = "A" ttl = 60 records = ["203.0.113.10"] set_identifier = "primary" health_check_id = aws_route53_health_check.primary.id failover_routing_policy { type = "PRIMARY" } } # Secondary record (used when primary fails) resource "aws_route53_record" "api_secondary" { zone_id = aws_route53_zone.main.zone_id name = "api.example.com" type = "A" ttl = 60 records = ["203.0.113.20"] set_identifier = "secondary" health_check_id = aws_route53_health_check.secondary.id failover_routing_policy { type = "SECONDARY" } } Weighted Routing for Gradual Migration 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 # 90% to current, 10% to new resource "aws_route53_record" "api_current" { zone_id = aws_route53_zone.main.zone_id name = "api.example.com" type = "A" ttl = 60 records = ["203.0.113.10"] set_identifier = "current" weighted_routing_policy { weight = 90 } } resource "aws_route53_record" "api_new" { zone_id = aws_route53_zone.main.zone_id name = "api.example.com" type = "A" ttl = 60 records = ["203.0.113.20"] set_identifier = "new" weighted_routing_policy { weight = 10 } } External DNS for Kubernetes Automatically create DNS records from Kubernetes resources. ...

February 12, 2026 Â· 9 min Â· 1841 words Â· Rob Washington