Cloud

The Twelve-Factor App: What Actually Matters

The Twelve-Factor methodology is from 2011 but remains relevant. Here’s what matters in practice, and what’s become outdated. The Factors, Ranked by Impact Critical (Ignore at Your Peril) III. Config in Environment 1 2 3 4 5 # Bad DATABASE_URL = "postgres://localhost/myapp" # hardcoded # Good DATABASE_URL = os.environ["DATABASE_URL"] Config includes credentials, per-environment values, and feature flags. Environment variables work everywhere: containers, serverless, bare metal. VI. Stateless Processes 1 2 3 4 5 6 7 8 9 10 11 # Bad: storing session in memory sessions = {} @app.post("/login") def login(user): sessions[user.id] = {"logged_in": True} # Dies with process # Good: external session store @app.post("/login") def login(user): redis.set(f"session:{user.id}", {"logged_in": True}) If your process dies, can another pick up the work? Statelessness enables horizontal scaling, rolling deploys, and crash recovery. ...

Terraform Module Patterns for Reusable Infrastructure

Terraform modules turn infrastructure code from scripts into libraries. Here’s how to design them well. Module Structure m └ o ─ d ─ u l v ├ ├ ├ ├ └ e p ─ ─ ─ ─ ─ s c ─ ─ ─ ─ ─ / / m v o v R a a u e E i r t r A n i p s D . a u i M t b t o E f l s n . e . s m s t . d . f t t f f # # # # # R I O P D e n u r o s p t o c o u p v u u t u i m r t d e c v e n e a v r t s r a a i l r t a u e i b e q o l s u n e i s r e m e n t s Basic Module variables.tf 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 variable "name" { description = "Name prefix for resources" type = string } variable "cidr_block" { description = "CIDR block for VPC" type = string default = "10.0.0.0/16" } variable "azs" { description = "Availability zones" type = list(string) } variable "private_subnets" { description = "Private subnet CIDR blocks" type = list(string) default = [] } variable "public_subnets" { description = "Public subnet CIDR blocks" type = list(string) default = [] } variable "tags" { description = "Tags to apply to resources" type = map(string) default = {} } main.tf 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 resource "aws_vpc" "this" { cidr_block = var.cidr_block enable_dns_hostnames = true enable_dns_support = true tags = merge(var.tags, { Name = var.name }) } resource "aws_subnet" "private" { count = length(var.private_subnets) vpc_id = aws_vpc.this.id cidr_block = var.private_subnets[count.index] availability_zone = var.azs[count.index % length(var.azs)] tags = merge(var.tags, { Name = "${var.name}-private-${count.index + 1}" Type = "private" }) } resource "aws_subnet" "public" { count = length(var.public_subnets) vpc_id = aws_vpc.this.id cidr_block = var.public_subnets[count.index] availability_zone = var.azs[count.index % length(var.azs)] map_public_ip_on_launch = true tags = merge(var.tags, { Name = "${var.name}-public-${count.index + 1}" Type = "public" }) } outputs.tf 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 output "vpc_id" { description = "VPC ID" value = aws_vpc.this.id } output "private_subnet_ids" { description = "Private subnet IDs" value = aws_subnet.private[*].id } output "public_subnet_ids" { description = "Public subnet IDs" value = aws_subnet.public[*].id } output "vpc_cidr_block" { description = "VPC CIDR block" value = aws_vpc.this.cidr_block } versions.tf 1 2 3 4 5 6 7 8 9 10 terraform { required_version = ">= 1.0" required_providers { aws = { source = "hashicorp/aws" version = ">= 4.0" } } } Using Modules Local Module 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 module "vpc" { source = "./modules/vpc" name = "production" cidr_block = "10.0.0.0/16" azs = ["us-east-1a", "us-east-1b", "us-east-1c"] private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] tags = { Environment = "production" Project = "myapp" } } # Use outputs resource "aws_instance" "app" { subnet_id = module.vpc.private_subnet_ids[0] # ... } Registry Module 1 2 3 4 5 6 7 8 9 10 11 12 13 14 module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "5.0.0" name = "my-vpc" cidr = "10.0.0.0/16" azs = ["us-east-1a", "us-east-1b"] private_subnets = ["10.0.1.0/24", "10.0.2.0/24"] public_subnets = ["10.0.101.0/24", "10.0.102.0/24"] enable_nat_gateway = true single_nat_gateway = true } Git Module 1 2 3 4 module "vpc" { source = "git::https://github.com/org/terraform-modules.git//vpc?ref=v1.2.0" # ... } Variable Validation 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 variable "environment" { description = "Environment name" type = string validation { condition = contains(["dev", "staging", "production"], var.environment) error_message = "Environment must be dev, staging, or production." } } variable "instance_type" { description = "EC2 instance type" type = string default = "t3.micro" validation { condition = can(regex("^t3\\.", var.instance_type)) error_message = "Instance type must be t3 family." } } variable "cidr_block" { description = "CIDR block" type = string validation { condition = can(cidrhost(var.cidr_block, 0)) error_message = "Must be a valid CIDR block." } } Complex Types 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 variable "services" { description = "Service configurations" type = list(object({ name = string port = number health_path = optional(string, "/health") replicas = optional(number, 1) environment = optional(map(string), {}) })) } # Usage services = [ { name = "api" port = 8080 replicas = 3 environment = { LOG_LEVEL = "info" } }, { name = "worker" port = 9090 } ] Conditional Resources 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 variable "create_nat_gateway" { description = "Create NAT gateway" type = bool default = true } resource "aws_nat_gateway" "this" { count = var.create_nat_gateway ? 1 : 0 allocation_id = aws_eip.nat[0].id subnet_id = aws_subnet.public[0].id } resource "aws_eip" "nat" { count = var.create_nat_gateway ? 1 : 0 domain = "vpc" } Dynamic Blocks 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 variable "ingress_rules" { description = "Ingress rules" type = list(object({ port = number protocol = string cidr_blocks = list(string) })) default = [] } resource "aws_security_group" "this" { name = var.name description = var.description vpc_id = var.vpc_id dynamic "ingress" { for_each = var.ingress_rules content { from_port = ingress.value.port to_port = ingress.value.port protocol = ingress.value.protocol cidr_blocks = ingress.value.cidr_blocks } } egress { from_port = 0 to_port = 0 protocol = "-1" cidr_blocks = ["0.0.0.0/0"] } } Module Composition Root Module 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 # main.tf module "vpc" { source = "./modules/vpc" # ... } module "security_groups" { source = "./modules/security-groups" vpc_id = module.vpc.vpc_id # ... } module "ecs_cluster" { source = "./modules/ecs-cluster" vpc_id = module.vpc.vpc_id private_subnet_ids = module.vpc.private_subnet_ids security_group_ids = [module.security_groups.app_sg_id] # ... } Shared Data 1 2 3 4 5 6 7 8 9 10 11 12 13 14 # data.tf - Shared data sources data "aws_caller_identity" "current" {} data "aws_region" "current" {} locals { account_id = data.aws_caller_identity.current.account_id region = data.aws_region.current.name common_tags = { Environment = var.environment Project = var.project ManagedBy = "terraform" } } For_each vs Count Count (Index-based) 1 2 3 4 5 # Problematic: Adding/removing items shifts indices resource "aws_subnet" "private" { count = length(var.private_subnets) cidr_block = var.private_subnets[count.index] } For_each (Key-based) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 # Better: Keys are stable variable "subnets" { type = map(object({ cidr_block = string az = string })) } resource "aws_subnet" "private" { for_each = var.subnets cidr_block = each.value.cidr_block availability_zone = each.value.az tags = { Name = each.key } } # Usage subnets = { "private-1a" = { cidr_block = "10.0.1.0/24", az = "us-east-1a" } "private-1b" = { cidr_block = "10.0.2.0/24", az = "us-east-1b" } } Sensitive Values 1 2 3 4 5 6 7 8 9 10 11 variable "database_password" { description = "Database password" type = string sensitive = true } output "connection_string" { description = "Database connection string" value = "postgres://user:${var.database_password}@${aws_db_instance.this.endpoint}/db" sensitive = true } Module Testing Terraform Test (1.6+) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 # tests/vpc_test.tftest.hcl run "vpc_creates_successfully" { command = plan variables { name = "test-vpc" cidr_block = "10.0.0.0/16" azs = ["us-east-1a"] } assert { condition = aws_vpc.this.cidr_block == "10.0.0.0/16" error_message = "VPC CIDR block incorrect" } } Run tests: ...

AWS CLI Power User: Queries, Filters, and Automation

The AWS Console is fine for exploration. For real work—auditing, automation, bulk operations—the CLI is essential. Here’s how to use it effectively. Output Formats 1 2 3 4 5 6 7 8 9 10 11 # JSON (default, best for scripting) aws ec2 describe-instances --output json # Table (human readable) aws ec2 describe-instances --output table # Text (tab-separated, grep-friendly) aws ec2 describe-instances --output text # YAML aws ec2 describe-instances --output yaml Set default in ~/.aws/config: ...

AWS CLI Essentials: Patterns for Daily Operations

The AWS CLI is the fastest path from question to answer. These patterns cover the operations you’ll use daily. Setup and Configuration 1 2 3 4 5 6 7 8 9 10 11 12 13 14 # Configure default profile aws configure # Configure named profile aws configure --profile production # Use specific profile aws --profile production ec2 describe-instances # Or set environment variable export AWS_PROFILE=production # Verify identity aws sts get-caller-identity Multiple Accounts 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 # ~/.aws/credentials [default] aws_access_key_id = AKIA... aws_secret_access_key = ... [production] aws_access_key_id = AKIA... aws_secret_access_key = ... # ~/.aws/config [default] region = us-east-1 output = json [profile production] region = us-west-2 output = json EC2 Operations List Instances 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 # All instances aws ec2 describe-instances # Just the essentials aws ec2 describe-instances \ --query 'Reservations[].Instances[].[InstanceId,State.Name,InstanceType,PrivateIpAddress,Tags[?Key==`Name`].Value|[0]]' \ --output table # Running instances only aws ec2 describe-instances \ --filters "Name=instance-state-name,Values=running" \ --query 'Reservations[].Instances[].[InstanceId,PrivateIpAddress]' \ --output text # By tag aws ec2 describe-instances \ --filters "Name=tag:Environment,Values=production" # By instance ID aws ec2 describe-instances --instance-ids i-1234567890abcdef0 Start/Stop/Reboot 1 2 3 4 5 6 7 8 9 10 11 # Stop aws ec2 stop-instances --instance-ids i-1234567890abcdef0 # Start aws ec2 start-instances --instance-ids i-1234567890abcdef0 # Reboot aws ec2 reboot-instances --instance-ids i-1234567890abcdef0 # Terminate (careful!) aws ec2 terminate-instances --instance-ids i-1234567890abcdef0 Get Console Output 1 aws ec2 get-console-output --instance-id i-1234567890abcdef0 --output text SSH Key Pairs 1 2 3 4 5 6 7 8 9 # List aws ec2 describe-key-pairs # Create aws ec2 create-key-pair --key-name mykey --query 'KeyMaterial' --output text > mykey.pem chmod 400 mykey.pem # Delete aws ec2 delete-key-pair --key-name mykey S3 Operations List and Navigate 1 2 3 4 5 6 7 8 9 10 11 # List buckets aws s3 ls # List bucket contents aws s3 ls s3://mybucket/ # Recursive listing aws s3 ls s3://mybucket/ --recursive # With human-readable sizes aws s3 ls s3://mybucket/ --recursive --human-readable --summarize Copy and Sync 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 # Upload file aws s3 cp myfile.txt s3://mybucket/ # Download file aws s3 cp s3://mybucket/myfile.txt ./ # Upload directory aws s3 cp ./mydir s3://mybucket/mydir --recursive # Sync (only changed files) aws s3 sync ./local s3://mybucket/remote # Sync with delete (mirror) aws s3 sync ./local s3://mybucket/remote --delete # Exclude patterns aws s3 sync ./local s3://mybucket/remote --exclude "*.log" --exclude ".git/*" Delete 1 2 3 4 5 6 7 8 # Single file aws s3 rm s3://mybucket/myfile.txt # Directory aws s3 rm s3://mybucket/mydir/ --recursive # Empty bucket aws s3 rm s3://mybucket/ --recursive Presigned URLs 1 2 3 4 5 # Generate download URL (expires in 1 hour) aws s3 presign s3://mybucket/myfile.txt --expires-in 3600 # Upload URL aws s3 presign s3://mybucket/upload.txt --expires-in 3600 Bucket Operations 1 2 3 4 5 6 7 8 # Create bucket aws s3 mb s3://mynewbucket # Delete bucket (must be empty) aws s3 rb s3://mybucket # Force delete (removes contents first) aws s3 rb s3://mybucket --force IAM Operations Users 1 2 3 4 5 6 7 8 9 10 11 # List users aws iam list-users # Create user aws iam create-user --user-name newuser # Delete user aws iam delete-user --user-name olduser # List user's access keys aws iam list-access-keys --user-name myuser Roles 1 2 3 4 5 6 7 8 # List roles aws iam list-roles # Get role details aws iam get-role --role-name MyRole # List attached policies aws iam list-attached-role-policies --role-name MyRole Policies 1 2 3 4 5 6 7 # List policies aws iam list-policies --scope Local # Get policy document aws iam get-policy-version \ --policy-arn arn:aws:iam::123456789012:policy/MyPolicy \ --version-id v1 CloudWatch Logs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 # List log groups aws logs describe-log-groups # List log streams aws logs describe-log-streams --log-group-name /aws/lambda/myfunction # Get recent logs aws logs get-log-events \ --log-group-name /aws/lambda/myfunction \ --log-stream-name '2024/01/01/[$LATEST]abc123' \ --limit 100 # Tail logs (requires aws-cli v2) aws logs tail /aws/lambda/myfunction --follow # Filter logs aws logs filter-log-events \ --log-group-name /aws/lambda/myfunction \ --filter-pattern "ERROR" \ --start-time $(date -d '1 hour ago' +%s)000 Lambda 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 # List functions aws lambda list-functions # Invoke function aws lambda invoke \ --function-name myfunction \ --payload '{"key": "value"}' \ output.json # Get function config aws lambda get-function-configuration --function-name myfunction # Update function code aws lambda update-function-code \ --function-name myfunction \ --zip-file fileb://function.zip # View recent invocations aws lambda get-function --function-name myfunction RDS 1 2 3 4 5 6 7 8 9 10 11 12 13 14 # List instances aws rds describe-db-instances # Instance details aws rds describe-db-instances --db-instance-identifier mydb \ --query 'DBInstances[0].[DBInstanceIdentifier,DBInstanceStatus,Endpoint.Address]' # Create snapshot aws rds create-db-snapshot \ --db-instance-identifier mydb \ --db-snapshot-identifier mydb-snapshot-$(date +%Y%m%d) # List snapshots aws rds describe-db-snapshots --db-instance-identifier mydb Secrets Manager 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 # List secrets aws secretsmanager list-secrets # Get secret value aws secretsmanager get-secret-value --secret-id mysecret \ --query 'SecretString' --output text # Create secret aws secretsmanager create-secret \ --name mysecret \ --secret-string '{"username":"admin","password":"secret"}' # Update secret aws secretsmanager put-secret-value \ --secret-id mysecret \ --secret-string '{"username":"admin","password":"newsecret"}' SSM Parameter Store 1 2 3 4 5 6 7 8 9 10 11 12 # Get parameter aws ssm get-parameter --name /myapp/database/password --with-decryption # Put parameter aws ssm put-parameter \ --name /myapp/database/password \ --value "mysecret" \ --type SecureString \ --overwrite # List parameters by path aws ssm get-parameters-by-path --path /myapp/ --recursive --with-decryption Query and Filter Patterns JMESPath Queries 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 # Select specific fields aws ec2 describe-instances \ --query 'Reservations[].Instances[].[InstanceId,State.Name]' # Filter in query aws ec2 describe-instances \ --query 'Reservations[].Instances[?State.Name==`running`].[InstanceId]' # First result only aws ec2 describe-instances \ --query 'Reservations[0].Instances[0].InstanceId' # Flatten nested arrays aws ec2 describe-instances \ --query 'Reservations[].Instances[].Tags[?Key==`Name`].Value[]' Output Formats 1 2 3 4 5 6 7 8 9 10 11 # JSON (default) aws ec2 describe-instances --output json # Table (human readable) aws ec2 describe-instances --output table # Text (tab-separated, good for scripts) aws ec2 describe-instances --output text # YAML aws ec2 describe-instances --output yaml Scripting Patterns Loop Through Resources 1 2 3 4 5 6 7 8 # Stop all instances with specific tag for id in $(aws ec2 describe-instances \ --filters "Name=tag:Environment,Values=dev" \ --query 'Reservations[].Instances[].InstanceId' \ --output text); do echo "Stopping $id" aws ec2 stop-instances --instance-ids "$id" done Wait for State 1 2 3 4 5 6 7 8 # Wait for instance to be running aws ec2 wait instance-running --instance-ids i-1234567890abcdef0 # Wait for instance to stop aws ec2 wait instance-stopped --instance-ids i-1234567890abcdef0 # Wait for snapshot completion aws ec2 wait snapshot-completed --snapshot-ids snap-1234567890abcdef0 Pagination 1 2 3 4 5 6 7 # Auto-pagination (default in CLI v2) aws s3api list-objects-v2 --bucket mybucket # Manual pagination aws s3api list-objects-v2 --bucket mybucket --max-items 100 # Use NextToken from output for next page aws s3api list-objects-v2 --bucket mybucket --starting-token "token..." Useful Aliases 1 2 3 4 5 6 7 8 9 10 # ~/.bashrc or ~/.zshrc # Quick instance list alias ec2ls='aws ec2 describe-instances --query "Reservations[].Instances[].[InstanceId,State.Name,InstanceType,PrivateIpAddress,Tags[?Key==\`Name\`].Value|[0]]" --output table' # Who am I? alias awswho='aws sts get-caller-identity' # S3 bucket sizes alias s3sizes='aws s3 ls | while read _ _ bucket; do aws s3 ls s3://$bucket --recursive --summarize 2>/dev/null | tail -1; echo $bucket; done' Troubleshooting 1 2 3 4 5 6 7 8 9 10 11 # Debug mode aws ec2 describe-instances --debug # Dry run (check permissions without executing) aws ec2 run-instances --dry-run --image-id ami-12345 --instance-type t2.micro # Check CLI version aws --version # Clear credential cache rm -rf ~/.aws/cli/cache/* The AWS CLI rewards muscle memory. Start with the operations you do daily, build aliases for common patterns, and gradually expand. ...

The Twelve-Factor App: Principles for Cloud-Native Applications

The twelve-factor app methodology emerged from Heroku’s experience deploying thousands of applications. These principles create applications that work well in modern cloud environments — containerized, horizontally scalable, and continuously deployed. They’re not arbitrary rules. Each factor solves a real problem. I. Codebase One codebase tracked in version control, many deploys. ✅ m ├ ├ └ ❌ m m m y ─ ─ ─ y y y G a ─ ─ ─ B a a a o p a p p p o p . s d d p p p d g r e : - - - : i c p p s d t l r t e o o a v y d g e u i l t c n o o t g p : i m e s n n t / t a / g i n g , p r o d u c t i o n , f e a t u r e - b r a n c h e s Different environments come from the same codebase. Configuration, not code, varies between deploys. ...

Terraform State Management: Avoiding the Footguns

Terraform state is where infrastructure-as-code meets reality. It’s also where most Terraform disasters originate. Here’s how to manage state without losing sleep. The Problem Terraform tracks what it’s created in a state file. This file maps your HCL resources to real infrastructure. Without it, Terraform can’t update or destroy anything — it doesn’t know what exists. The default is a local file called terraform.tfstate. This works fine until: Someone else needs to run Terraform Your laptop dies Two people run apply simultaneously You accidentally commit secrets to Git Rule 1: Remote State from Day One Never use local state for anything beyond experiments: ...

Serverless Cold Start Mitigation: Practical Patterns That Actually Work

Cold starts are serverless’s original sin. Your function spins up, downloads dependencies, initializes connections, and finally runs your code — all while your user waits. The P99 latency spikes. The SLA teeters. Here’s what actually works, ranked by effectiveness and cost. Understanding the Cold Start A cold start happens when there’s no warm instance available to handle a request. The platform must: Provision a container — 50-500ms depending on runtime size Initialize the runtime — 10-100ms (Python) to 500ms+ (JVM without optimization) Run your initialization code — depends on what you do at module level Execute the handler — your actual function 1 2 3 4 5 6 7 8 9 10 11 12 # Everything at module level runs during cold start import boto3 # ~100ms import pandas # ~500ms import torch # ~2000ms # Connection initialization during cold start dynamodb = boto3.resource('dynamodb') table = dynamodb.Table('users') def handler(event, context): # Only this runs on warm invocations return table.get_item(Key={'id': event['user_id']}) Measured cold start times for AWS Lambda (1024MB, us-east-1): ...

Cloud Cost Optimization: Cutting AWS Bills Without Cutting Corners

Cloud bills grow faster than you’d expect. A few forgotten instances here, oversized databases there, and suddenly you’re spending more on infrastructure than engineering salaries. Let’s fix that. Quick Wins: Find the Waste Identify Unused Resources 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 # find_waste.py import boto3 from datetime import datetime, timedelta ec2 = boto3.client('ec2') cloudwatch = boto3.client('cloudwatch') def find_idle_instances(): """Find EC2 instances with <5% CPU over 7 days.""" instances = ec2.describe_instances( Filters=[{'Name': 'instance-state-name', 'Values': ['running']}] ) idle = [] for reservation in instances['Reservations']: for instance in reservation['Instances']: instance_id = instance['InstanceId'] # Get average CPU over 7 days response = cloudwatch.get_metric_statistics( Namespace='AWS/EC2', MetricName='CPUUtilization', Dimensions=[{'Name': 'InstanceId', 'Value': instance_id}], StartTime=datetime.utcnow() - timedelta(days=7), EndTime=datetime.utcnow(), Period=86400, # 1 day Statistics=['Average'] ) if response['Datapoints']: avg_cpu = sum(d['Average'] for d in response['Datapoints']) / len(response['Datapoints']) if avg_cpu < 5: idle.append({ 'instance_id': instance_id, 'type': instance['InstanceType'], 'avg_cpu': round(avg_cpu, 2), 'name': next((t['Value'] for t in instance.get('Tags', []) if t['Key'] == 'Name'), 'unnamed') }) return idle def find_unattached_volumes(): """Find EBS volumes not attached to any instance.""" volumes = ec2.describe_volumes( Filters=[{'Name': 'status', 'Values': ['available']}] ) return [{ 'volume_id': v['VolumeId'], 'size_gb': v['Size'], 'monthly_cost': v['Size'] * 0.10 # gp2 pricing estimate } for v in volumes['Volumes']] def find_old_snapshots(): """Find snapshots older than 90 days.""" snapshots = ec2.describe_snapshots(OwnerIds=['self']) cutoff = datetime.utcnow() - timedelta(days=90) old = [] for snap in snapshots['Snapshots']: if snap['StartTime'].replace(tzinfo=None) < cutoff: old.append({ 'snapshot_id': snap['SnapshotId'], 'size_gb': snap['VolumeSize'], 'age_days': (datetime.utcnow() - snap['StartTime'].replace(tzinfo=None)).days }) return old # Run analysis print("=== Idle Instances ===") for i in find_idle_instances(): print(f" {i['instance_id']} ({i['type']}): {i['avg_cpu']}% avg CPU - {i['name']}") print("\n=== Unattached Volumes ===") for v in find_unattached_volumes(): print(f" {v['volume_id']}: {v['size_gb']}GB = ${v['monthly_cost']:.2f}/mo") print("\n=== Old Snapshots ===") for s in find_old_snapshots(): print(f" {s['snapshot_id']}: {s['size_gb']}GB, {s['age_days']} days old") Automated Cleanup 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 # cleanup_automation.py import boto3 from datetime import datetime, timedelta def cleanup_old_amis(days_old: int = 180, dry_run: bool = True): """Deregister AMIs older than threshold and delete their snapshots.""" ec2 = boto3.client('ec2') images = ec2.describe_images(Owners=['self']) cutoff = datetime.utcnow() - timedelta(days=days_old) for image in images['Images']: creation_date = datetime.strptime(image['CreationDate'][:10], '%Y-%m-%d') if creation_date < cutoff: ami_id = image['ImageId'] snapshot_ids = [ bdm['Ebs']['SnapshotId'] for bdm in image.get('BlockDeviceMappings', []) if 'Ebs' in bdm ] if dry_run: print(f"Would delete AMI {ami_id} and snapshots {snapshot_ids}") else: ec2.deregister_image(ImageId=ami_id) for snap_id in snapshot_ids: ec2.delete_snapshot(SnapshotId=snap_id) print(f"Deleted AMI {ami_id} and {len(snapshot_ids)} snapshots") def stop_dev_instances_at_night(): """Stop non-production instances outside business hours.""" ec2 = boto3.resource('ec2') # Find instances tagged Environment=dev|staging instances = ec2.instances.filter( Filters=[ {'Name': 'tag:Environment', 'Values': ['dev', 'staging']}, {'Name': 'instance-state-name', 'Values': ['running']} ] ) instance_ids = [i.id for i in instances] if instance_ids: ec2.instances.filter(InstanceIds=instance_ids).stop() print(f"Stopped {len(instance_ids)} dev/staging instances") 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 # Lambda + EventBridge for scheduled cleanup AWSTemplateFormatVersion: '2010-09-09' Resources: StopDevInstancesFunction: Type: AWS::Lambda::Function Properties: FunctionName: stop-dev-instances Runtime: python3.11 Handler: index.handler Timeout: 60 Role: !GetAtt LambdaRole.Arn Code: ZipFile: | import boto3 def handler(event, context): ec2 = boto3.resource('ec2') instances = ec2.instances.filter( Filters=[ {'Name': 'tag:AutoStop', 'Values': ['true']}, {'Name': 'instance-state-name', 'Values': ['running']} ] ) ids = [i.id for i in instances] if ids: ec2.instances.filter(InstanceIds=ids).stop() return {'stopped': ids} StopSchedule: Type: AWS::Events::Rule Properties: ScheduleExpression: 'cron(0 22 ? * MON-FRI *)' # 10 PM weekdays Targets: - Id: StopDevInstances Arn: !GetAtt StopDevInstancesFunction.Arn Right-Sizing Analyze and Recommend 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 # rightsizing.py import boto3 def get_rightsizing_recommendations(): """Get AWS Compute Optimizer recommendations.""" optimizer = boto3.client('compute-optimizer') response = optimizer.get_ec2_instance_recommendations() savings = [] for rec in response.get('instanceRecommendations', []): current = rec['currentInstanceType'] for option in rec.get('recommendationOptions', []): if option.get('rank') == 1: # Top recommendation recommended = option['instanceType'] monthly_savings = ( rec.get('utilizationMetrics', [{}])[0].get('value', 0) * option.get('projectedUtilizationMetrics', [{}])[0].get('value', 1) ) savings.append({ 'instance_id': rec['instanceArn'].split('/')[-1], 'current': current, 'recommended': recommended, 'finding': rec['finding'], 'estimated_savings': option.get('savingsOpportunity', {}) }) return savings # Generate report for rec in get_rightsizing_recommendations(): print(f"{rec['instance_id']}: {rec['current']} → {rec['recommended']}") print(f" Finding: {rec['finding']}") if rec['estimated_savings']: print(f" Potential savings: ${rec['estimated_savings'].get('estimatedMonthlySavings', {}).get('value', 0):.2f}/mo") Reserved Instances & Savings Plans Analyze Coverage 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 # reservation_analysis.py import boto3 from collections import defaultdict def analyze_reservation_coverage(): """Analyze current RI/SP coverage and recommend purchases.""" ce = boto3.client('ce') # Get coverage report response = ce.get_reservation_coverage( TimePeriod={ 'Start': '2026-01-01', 'End': '2026-02-01' }, Granularity='MONTHLY', GroupBy=[ {'Type': 'DIMENSION', 'Key': 'INSTANCE_TYPE'} ] ) recommendations = [] for group in response.get('CoveragesByTime', [{}])[0].get('Groups', []): instance_type = group['Attributes']['instanceType'] coverage = float(group['Coverage']['CoverageHours']['CoverageHoursPercentage']) on_demand_hours = float(group['Coverage']['CoverageHours']['OnDemandHours']) if coverage < 70 and on_demand_hours > 500: recommendations.append({ 'instance_type': instance_type, 'current_coverage': coverage, 'on_demand_hours': on_demand_hours, 'recommendation': 'Consider Reserved Instances' }) return recommendations def get_savings_plan_recommendations(): """Get Savings Plan purchase recommendations.""" ce = boto3.client('ce') response = ce.get_savings_plans_purchase_recommendation( SavingsPlansType='COMPUTE_SP', TermInYears='ONE_YEAR', PaymentOption='NO_UPFRONT', LookbackPeriodInDays='THIRTY_DAYS' ) rec = response.get('SavingsPlansPurchaseRecommendation', {}) return { 'recommended_hourly_commitment': rec.get('SavingsPlansPurchaseRecommendationSummary', {}).get('RecommendedHourlyCommitment'), 'estimated_monthly_savings': rec.get('SavingsPlansPurchaseRecommendationSummary', {}).get('EstimatedMonthlySavingsAmount'), 'estimated_savings_percentage': rec.get('SavingsPlansPurchaseRecommendationSummary', {}).get('EstimatedSavingsPercentage') } Spot Instances Spot for Stateless Workloads 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 # Kubernetes with Spot instances via Karpenter apiVersion: karpenter.sh/v1alpha5 kind: Provisioner metadata: name: spot-provisioner spec: requirements: - key: karpenter.sh/capacity-type operator: In values: ["spot"] - key: kubernetes.io/arch operator: In values: ["amd64"] - key: node.kubernetes.io/instance-type operator: In values: ["m5.large", "m5.xlarge", "m5a.large", "m5a.xlarge", "m6i.large"] # Spread across instance types for availability limits: resources: cpu: 1000 # Handle interruptions gracefully ttlSecondsAfterEmpty: 30 ttlSecondsUntilExpired: 2592000 # 30 days --- # Deployment using spot nodes apiVersion: apps/v1 kind: Deployment metadata: name: worker spec: replicas: 10 template: spec: nodeSelector: karpenter.sh/capacity-type: spot # Handle spot interruptions terminationGracePeriodSeconds: 120 containers: - name: worker lifecycle: preStop: exec: command: ["/bin/sh", "-c", "sleep 90"] # Drain gracefully Spot Interruption Handling 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 # spot_interruption_handler.py import requests import time import signal import sys METADATA_URL = "http://169.254.169.254/latest/meta-data" def check_spot_interruption(): """Check if this spot instance is being interrupted.""" try: response = requests.get( f"{METADATA_URL}/spot/instance-action", timeout=1 ) if response.status_code == 200: return response.json() except: pass return None def graceful_shutdown(): """Handle graceful shutdown on interruption.""" print("Spot interruption detected, starting graceful shutdown...") # Stop accepting new work # Finish current tasks # Checkpoint state # Clean up sys.exit(0) def monitor_interruption(): """Monitor for spot interruption (2 minute warning).""" while True: interruption = check_spot_interruption() if interruption: print(f"Interruption notice: {interruption}") graceful_shutdown() time.sleep(5) # Run in background thread import threading threading.Thread(target=monitor_interruption, daemon=True).start() Storage Optimization S3 Lifecycle Policies 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 # s3_lifecycle.py import boto3 def configure_lifecycle_rules(bucket_name: str): """Configure intelligent tiering and expiration.""" s3 = boto3.client('s3') lifecycle_config = { 'Rules': [ { 'ID': 'intelligent-tiering', 'Status': 'Enabled', 'Filter': {'Prefix': ''}, 'Transitions': [ { 'Days': 0, 'StorageClass': 'INTELLIGENT_TIERING' } ] }, { 'ID': 'archive-old-logs', 'Status': 'Enabled', 'Filter': {'Prefix': 'logs/'}, 'Transitions': [ {'Days': 30, 'StorageClass': 'STANDARD_IA'}, {'Days': 90, 'StorageClass': 'GLACIER'}, ], 'Expiration': {'Days': 365} }, { 'ID': 'cleanup-incomplete-uploads', 'Status': 'Enabled', 'Filter': {'Prefix': ''}, 'AbortIncompleteMultipartUpload': { 'DaysAfterInitiation': 7 } } ] } s3.put_bucket_lifecycle_configuration( Bucket=bucket_name, LifecycleConfiguration=lifecycle_config ) Cost Monitoring Budget Alerts 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 # CloudFormation budget AWSTemplateFormatVersion: '2010-09-09' Resources: MonthlyBudget: Type: AWS::Budgets::Budget Properties: Budget: BudgetName: monthly-infrastructure BudgetLimit: Amount: 10000 Unit: USD TimeUnit: MONTHLY BudgetType: COST NotificationsWithSubscribers: - Notification: NotificationType: ACTUAL ComparisonOperator: GREATER_THAN Threshold: 80 Subscribers: - SubscriptionType: EMAIL Address: ops@example.com - Notification: NotificationType: FORECASTED ComparisonOperator: GREATER_THAN Threshold: 100 Subscribers: - SubscriptionType: SNS Address: !Ref AlertTopic Daily Cost Report 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 # daily_cost_report.py import boto3 from datetime import datetime, timedelta def get_daily_costs(): """Get costs broken down by service.""" ce = boto3.client('ce') end = datetime.utcnow().date() start = end - timedelta(days=7) response = ce.get_cost_and_usage( TimePeriod={ 'Start': start.isoformat(), 'End': end.isoformat() }, Granularity='DAILY', Metrics=['UnblendedCost'], GroupBy=[ {'Type': 'DIMENSION', 'Key': 'SERVICE'} ] ) report = [] for day in response['ResultsByTime']: date = day['TimePeriod']['Start'] for group in day['Groups']: service = group['Keys'][0] cost = float(group['Metrics']['UnblendedCost']['Amount']) if cost > 1: # Only show significant costs report.append({ 'date': date, 'service': service, 'cost': cost }) return report # Generate and send report for item in sorted(get_daily_costs(), key=lambda x: -x['cost'])[:10]: print(f"{item['date']} | {item['service']}: ${item['cost']:.2f}") Checklist Enable Cost Explorer and set up budgets Tag all resources for cost allocation Review and act on right-sizing recommendations monthly Implement lifecycle policies for S3 and EBS snapshots Use Spot for fault-tolerant workloads Purchase Savings Plans for baseline compute Stop dev/staging resources outside business hours Clean up unused resources weekly Review Reserved Instance coverage quarterly Cloud cost optimization isn’t a one-time project—it’s an ongoing practice. Automate the easy wins, review monthly, and treat your cloud bill like any other metric that matters. ...

Serverless Architecture: When to Use It (And When Not To)

A practical guide to serverless architecture — the real benefits, hidden costs, and patterns that actually work in production.

Kubernetes: Container Orchestration for the Rest of Us

A practical introduction to Kubernetes — what it does, why you might need it, and how to get started without drowning in complexity.