Python Asyncio Patterns: Concurrency Without the Headaches

Asyncio enables concurrent I/O without threads. These patterns help you use it effectively without falling into common traps. Basic Structure 1 2 3 4 5 6 7 8 9 import asyncio async def main(): print("Hello") await asyncio.sleep(1) print("World") # Python 3.7+ asyncio.run(main()) HTTP Requests with aiohttp 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 import aiohttp import asyncio async def fetch(session, url): async with session.get(url) as response: return await response.text() async def fetch_all(urls): async with aiohttp.ClientSession() as session: tasks = [fetch(session, url) for url in urls] return await asyncio.gather(*tasks) # Usage urls = [ "https://api.example.com/users", "https://api.example.com/posts", "https://api.example.com/comments", ] results = asyncio.run(fetch_all(urls)) Task Management Running Tasks Concurrently 1 2 3 4 5 6 7 8 9 10 11 12 13 14 async def task_a(): await asyncio.sleep(2) return "A done" async def task_b(): await asyncio.sleep(1) return "B done" async def main(): # Run concurrently, wait for all results = await asyncio.gather(task_a(), task_b()) print(results) # ['A done', 'B done'] - takes ~2s total, not 3s asyncio.run(main()) Handle Exceptions in gather 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 async def might_fail(n): if n == 2: raise ValueError("Task 2 failed") await asyncio.sleep(n) return f"Task {n} done" async def main(): # return_exceptions=True prevents one failure from canceling others results = await asyncio.gather( might_fail(1), might_fail(2), might_fail(3), return_exceptions=True ) for result in results: if isinstance(result, Exception): print(f"Error: {result}") else: print(result) asyncio.run(main()) First Completed 1 2 3 4 5 6 7 8 9 10 11 12 13 14 async def main(): tasks = [ asyncio.create_task(fetch(session, url1)), asyncio.create_task(fetch(session, url2)), ] # Return when first completes done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) # Cancel remaining for task in 
pending: task.cancel() return done.pop().result() Timeout 1 2 3 4 5 6 7 8 9 10 11 async def slow_operation(): await asyncio.sleep(10) return "done" async def main(): try: result = await asyncio.wait_for(slow_operation(), timeout=5.0) except asyncio.TimeoutError: print("Operation timed out") asyncio.run(main()) Semaphores (Limiting Concurrency) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 async def fetch_with_limit(session, url, semaphore): async with semaphore: async with session.get(url) as response: return await response.text() async def main(): semaphore = asyncio.Semaphore(10) # Max 10 concurrent requests async with aiohttp.ClientSession() as session: tasks = [ fetch_with_limit(session, url, semaphore) for url in urls ] results = await asyncio.gather(*tasks) Queues for Producer/Consumer 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 async def producer(queue, items): for item in items: await queue.put(item) print(f"Produced: {item}") # Signal completion await queue.put(None) async def consumer(queue, name): while True: item = await queue.get() if item is None: queue.task_done() break print(f"{name} processing: {item}") await asyncio.sleep(1) # Simulate work queue.task_done() async def main(): queue = asyncio.Queue(maxsize=10) # Start producer and multiple consumers await asyncio.gather( producer(queue, range(20)), consumer(queue, "Worker-1"), consumer(queue, "Worker-2"), ) asyncio.run(main()) Error Handling Patterns Task Exception Handling 1 2 3 4 5 6 7 8 9 10 11 12 13 async def risky_task(): await asyncio.sleep(1) raise ValueError("Something went wrong") async def main(): task = asyncio.create_task(risky_task()) try: await task except ValueError as e: print(f"Caught: {e}") asyncio.run(main()) Background Task Exceptions 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 def handle_exception(loop, context): msg = context.get("exception", context["message"]) print(f"Caught exception: {msg}") async def background_task(): await asyncio.sleep(1) raise 
RuntimeError("Background failure") async def main(): loop = asyncio.get_event_loop() loop.set_exception_handler(handle_exception) # Fire and forget - exception won't crash main asyncio.create_task(background_task()) await asyncio.sleep(5) asyncio.run(main()) Context Managers 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 import asyncio from contextlib import asynccontextmanager @asynccontextmanager async def managed_resource(): print("Acquiring resource") resource = await create_resource() try: yield resource finally: print("Releasing resource") await resource.close() async def main(): async with managed_resource() as resource: await resource.do_something() Running Blocking Code 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 import asyncio from concurrent.futures import ThreadPoolExecutor def blocking_io(): # Simulates blocking I/O import time time.sleep(2) return "Done" async def main(): loop = asyncio.get_event_loop() # Run in thread pool result = await loop.run_in_executor(None, blocking_io) print(result) # With custom executor with ThreadPoolExecutor(max_workers=4) as executor: result = await loop.run_in_executor(executor, blocking_io) asyncio.run(main()) Periodic Tasks 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 async def periodic_task(interval, func): while True: await func() await asyncio.sleep(interval) async def heartbeat(): print("Heartbeat") async def main(): # Start periodic task in background task = asyncio.create_task(periodic_task(5, heartbeat)) # Do other work await asyncio.sleep(20) # Cancel when done task.cancel() try: await task except asyncio.CancelledError: print("Periodic task cancelled") asyncio.run(main()) Graceful Shutdown 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 import signal async def shutdown(signal, loop): print(f"Received {signal.name}") tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()] for task in tasks: task.cancel() await asyncio.gather(*tasks, 
return_exceptions=True) loop.stop() async def main(): loop = asyncio.get_event_loop() for sig in (signal.SIGTERM, signal.SIGINT): loop.add_signal_handler( sig, lambda s=sig: asyncio.create_task(shutdown(s, loop)) ) # Your long-running tasks here await asyncio.sleep(3600) asyncio.run(main()) Common Pitfalls Don’t Block the Event Loop 1 2 3 4 5 6 7 8 9 # BAD - blocks entire event loop async def bad(): time.sleep(5) # Blocking! return "done" # GOOD - use async sleep or run_in_executor async def good(): await asyncio.sleep(5) return "done" Don’t Forget to Await 1 2 3 4 5 6 7 # BAD - coroutine never runs async def main(): fetch_data() # Missing await! # GOOD async def main(): await fetch_data() Create Tasks Properly 1 2 3 4 5 6 7 8 9 # BAD - task may be garbage collected async def main(): asyncio.create_task(background_work()) # Task might not complete # GOOD - keep reference async def main(): task = asyncio.create_task(background_work()) await task # or store in set Don’t Mix Sync and Async 1 2 3 4 5 6 7 # BAD - calling async from sync incorrectly def sync_function(): result = async_function() # Returns coroutine, not result # GOOD - use asyncio.run or run_in_executor def sync_function(): result = asyncio.run(async_function()) Testing Async Code 1 2 3 4 5 6 7 8 9 10 11 12 13 import pytest import asyncio @pytest.mark.asyncio async def test_async_function(): result = await my_async_function() assert result == expected # Or with unittest class TestAsync(unittest.IsolatedAsyncioTestCase): async def test_something(self): result = await my_async_function() self.assertEqual(result, expected) Quick Reference 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 # Run async function asyncio.run(main()) # Concurrent execution await asyncio.gather(task1(), task2()) # Create background task task = asyncio.create_task(coro()) # Timeout await asyncio.wait_for(coro(), timeout=5.0) # Limit concurrency semaphore = asyncio.Semaphore(10) async with semaphore: ... 
# Run blocking code await loop.run_in_executor(None, blocking_func) # Sleep await asyncio.sleep(1) Asyncio shines for I/O-bound workloads—HTTP requests, database queries, file operations. It won't help with CPU-bound work (use multiprocessing for that). ...

February 25, 2026 · 6 min · 1194 words · Rob Washington

Kubernetes Debugging: From Pod Failures to Cluster Issues

Kubernetes abstracts away infrastructure until something breaks. Then you need to peel back the layers. These debugging patterns will help you find problems fast. First Steps: Get the Lay of the Land 1 2 3 4 5 6 7 8 9 10 # Cluster health kubectl cluster-info kubectl get nodes kubectl top nodes # Namespace overview kubectl get all -n myapp # Events (recent issues surface here) kubectl get events -n myapp --sort-by='.lastTimestamp' Pod Debugging Check Pod Status 1 2 3 4 5 6 7 8 9 10 11 12 13 # List pods with status kubectl get pods -n myapp # Detailed pod info kubectl describe pod mypod -n myapp # Common status meanings: # Pending - Waiting for scheduling or image pull # Running - At least one container running # Succeeded - All containers completed successfully # Failed - All containers terminated, at least one failed # CrashLoopBackOff - Container crashing repeatedly # ImagePullBackOff - Can't pull container image View Logs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 # Current logs kubectl logs mypod -n myapp # Previous container (after crash) kubectl logs mypod -n myapp --previous # Follow logs kubectl logs -f mypod -n myapp # Specific container (multi-container pod) kubectl logs mypod -n myapp -c mycontainer # Last N lines kubectl logs mypod -n myapp --tail=100 # Since timestamp kubectl logs mypod -n myapp --since=1h Execute Commands in Pod 1 2 3 4 5 6 7 8 # Shell into running container kubectl exec -it mypod -n myapp -- /bin/sh # Run specific command kubectl exec mypod -n myapp -- cat /etc/config/app.yaml # Specific container kubectl exec -it mypod -n myapp -c mycontainer -- /bin/sh Debug Crashed Containers 1 2 3 4 5 6 7 8 # Check why it crashed kubectl describe pod mypod -n myapp | grep -A 10 "Last State" # View previous logs kubectl logs mypod -n myapp --previous # Run debug container (K8s 1.25+) kubectl debug mypod -n myapp -it --image=busybox --target=mycontainer Common Pod Issues ImagePullBackOff 1 2 3 4 5 6 7 8 9 10 11 12 13 # Check events for details 
kubectl describe pod mypod -n myapp | grep -A 5 Events # Common causes: # - Wrong image name/tag # - Private registry without imagePullSecrets # - Registry rate limiting (Docker Hub) # Verify image exists docker pull myimage:tag # Check imagePullSecrets kubectl get pod mypod -n myapp -o jsonpath='{.spec.imagePullSecrets}' CrashLoopBackOff 1 2 3 4 5 6 7 8 9 10 11 12 # Get exit code kubectl describe pod mypod -n myapp | grep "Exit Code" # Exit codes: # 0 - Success (shouldn't be crashing) # 1 - Application error # 137 - OOMKilled (out of memory) # 139 - Segmentation fault # 143 - SIGTERM (graceful shutdown) # Check resource limits kubectl describe pod mypod -n myapp | grep -A 5 Limits Pending Pods 1 2 3 4 5 6 7 8 9 10 11 # Check why not scheduled kubectl describe pod mypod -n myapp | grep -A 10 Events # Common causes: # - Insufficient resources # - Node selector/affinity not matched # - Taints without tolerations # - PVC not bound # Check node resources kubectl describe nodes | grep -A 5 "Allocated resources" Resource Issues Memory Problems 1 2 3 4 5 6 7 8 # Check pod resource usage kubectl top pod mypod -n myapp # Check for OOMKilled kubectl describe pod mypod -n myapp | grep OOMKilled # View memory limits kubectl get pod mypod -n myapp -o jsonpath='{.spec.containers[*].resources}' CPU Throttling 1 2 3 4 5 # Check CPU usage vs limits kubectl top pod mypod -n myapp # In container, check throttling kubectl exec mypod -n myapp -- cat /sys/fs/cgroup/cpu/cpu.stat Networking Debugging Service Connectivity 1 2 3 4 5 6 7 8 9 10 11 12 # Check service exists kubectl get svc -n myapp # Check endpoints (are pods backing the service?) 
kubectl get endpoints myservice -n myapp # Test from within cluster kubectl run debug --rm -it --image=busybox -- /bin/sh # Then: wget -qO- http://myservice.myapp.svc.cluster.local # DNS resolution kubectl run debug --rm -it --image=busybox -- nslookup myservice.myapp.svc.cluster.local Pod-to-Pod Communication 1 2 3 4 5 6 7 8 # Get pod IPs kubectl get pods -n myapp -o wide # Test connectivity from one pod to another kubectl exec mypod1 -n myapp -- wget -qO- http://10.0.0.5:8080 # Check network policies kubectl get networkpolicies -n myapp Ingress Issues 1 2 3 4 5 6 7 8 # Check ingress configuration kubectl describe ingress myingress -n myapp # Check ingress controller logs kubectl logs -n ingress-nginx -l app.kubernetes.io/name=ingress-nginx # Verify backend service kubectl get svc myservice -n myapp ConfigMaps and Secrets 1 2 3 4 5 6 7 8 9 10 11 12 # Verify ConfigMap exists and has expected data kubectl get configmap myconfig -n myapp -o yaml # Check if mounted correctly kubectl exec mypod -n myapp -- ls -la /etc/config/ kubectl exec mypod -n myapp -- cat /etc/config/app.yaml # Verify Secret kubectl get secret mysecret -n myapp -o jsonpath='{.data.password}' | base64 -d # Check environment variables kubectl exec mypod -n myapp -- env | grep MY_VAR Persistent Volumes 1 2 3 4 5 6 7 8 9 10 11 12 # Check PVC status kubectl get pvc -n myapp # Describe for binding issues kubectl describe pvc mypvc -n myapp # Check PV kubectl get pv # Verify mount in pod kubectl exec mypod -n myapp -- df -h kubectl exec mypod -n myapp -- ls -la /data Node Issues 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 # Node status kubectl get nodes kubectl describe node mynode # Check conditions kubectl get nodes -o custom-columns=NAME:.metadata.name,CONDITIONS:.status.conditions[*].type # Node resource pressure kubectl describe node mynode | grep -A 5 Conditions # Pods on specific node kubectl get pods --all-namespaces -o wide --field-selector spec.nodeName=mynode # Drain node for maintenance kubectl drain 
mynode --ignore-daemonsets --delete-emptydir-data Control Plane Debugging 1 2 3 4 5 6 7 8 9 10 11 12 13 14 # API server health kubectl get --raw='/healthz' # Component status (deprecated but useful) kubectl get componentstatuses # Check system pods kubectl get pods -n kube-system # API server logs (if accessible) kubectl logs -n kube-system kube-apiserver-master # etcd health kubectl exec -n kube-system etcd-master -- etcdctl endpoint health Useful Debug Containers 1 2 3 4 5 6 7 8 # Network debugging kubectl run netdebug --rm -it --image=nicolaka/netshoot -- /bin/bash # DNS debugging kubectl run dnsdebug --rm -it --image=tutum/dnsutils -- /bin/bash # General debugging kubectl run debug --rm -it --image=busybox -- /bin/sh Systematic Debugging Checklist Events first: kubectl get events --sort-by='.lastTimestamp' Describe the resource: kubectl describe <resource> <name> Check logs: kubectl logs <pod> (and --previous) Verify dependencies: ConfigMaps, Secrets, Services, PVCs Check resources: CPU, memory limits and usage Test connectivity: DNS, service endpoints, network policies Compare with working: Diff against known good configuration Quick Reference 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 # Pod not starting kubectl describe pod <name> kubectl get events # Pod crashing kubectl logs <pod> --previous kubectl describe pod <name> | grep "Exit Code" # Can't connect to service kubectl get endpoints <service> kubectl run debug --rm -it --image=busybox -- wget -qO- http://<service> # Resource issues kubectl top pods kubectl describe node | grep -A 5 "Allocated" # Config issues kubectl exec <pod> -- env kubectl exec <pod> -- cat /path/to/config Kubernetes debugging is methodical. Start with events, drill into describe output, check logs, and verify each dependency. Most issues are configuration mismatchesβ€”wrong image tags, missing secrets, insufficient resources. ...

February 25, 2026 · 7 min · 1300 words · Rob Washington

LLM API Integration Patterns: Building Reliable AI-Powered Applications

Integrating LLM APIs into production applications requires more than just making API calls. These patterns address the real challenges: rate limits, token costs, latency, and reliability. Basic Client Setup 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 import os from anthropic import Anthropic client = Anthropic( api_key=os.environ.get("ANTHROPIC_API_KEY"), timeout=60.0, max_retries=3, ) def chat(message: str, system: str = None) -> str: """Simple completion with sensible defaults.""" messages = [{"role": "user", "content": message}] response = client.messages.create( model="claude-sonnet-4-20250514", max_tokens=1024, system=system or "You are a helpful assistant.", messages=messages, ) return response.content[0].text Retry with Exponential Backoff Built-in retries help, but custom logic handles edge cases: ...

February 25, 2026 · 7 min · 1291 words · Rob Washington

AWS CLI Essentials: Patterns for Daily Operations

The AWS CLI is the fastest path from question to answer. These patterns cover the operations you’ll use daily. Setup and Configuration 1 2 3 4 5 6 7 8 9 10 11 12 13 14 # Configure default profile aws configure # Configure named profile aws configure --profile production # Use specific profile aws --profile production ec2 describe-instances # Or set environment variable export AWS_PROFILE=production # Verify identity aws sts get-caller-identity Multiple Accounts 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 # ~/.aws/credentials [default] aws_access_key_id = AKIA... aws_secret_access_key = ... [production] aws_access_key_id = AKIA... aws_secret_access_key = ... # ~/.aws/config [default] region = us-east-1 output = json [profile production] region = us-west-2 output = json EC2 Operations List Instances 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 # All instances aws ec2 describe-instances # Just the essentials aws ec2 describe-instances \ --query 'Reservations[].Instances[].[InstanceId,State.Name,InstanceType,PrivateIpAddress,Tags[?Key==`Name`].Value|[0]]' \ --output table # Running instances only aws ec2 describe-instances \ --filters "Name=instance-state-name,Values=running" \ --query 'Reservations[].Instances[].[InstanceId,PrivateIpAddress]' \ --output text # By tag aws ec2 describe-instances \ --filters "Name=tag:Environment,Values=production" # By instance ID aws ec2 describe-instances --instance-ids i-1234567890abcdef0 Start/Stop/Reboot 1 2 3 4 5 6 7 8 9 10 11 # Stop aws ec2 stop-instances --instance-ids i-1234567890abcdef0 # Start aws ec2 start-instances --instance-ids i-1234567890abcdef0 # Reboot aws ec2 reboot-instances --instance-ids i-1234567890abcdef0 # Terminate (careful!) 
aws ec2 terminate-instances --instance-ids i-1234567890abcdef0 Get Console Output 1 aws ec2 get-console-output --instance-id i-1234567890abcdef0 --output text SSH Key Pairs 1 2 3 4 5 6 7 8 9 # List aws ec2 describe-key-pairs # Create aws ec2 create-key-pair --key-name mykey --query 'KeyMaterial' --output text > mykey.pem chmod 400 mykey.pem # Delete aws ec2 delete-key-pair --key-name mykey S3 Operations List and Navigate 1 2 3 4 5 6 7 8 9 10 11 # List buckets aws s3 ls # List bucket contents aws s3 ls s3://mybucket/ # Recursive listing aws s3 ls s3://mybucket/ --recursive # With human-readable sizes aws s3 ls s3://mybucket/ --recursive --human-readable --summarize Copy and Sync 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 # Upload file aws s3 cp myfile.txt s3://mybucket/ # Download file aws s3 cp s3://mybucket/myfile.txt ./ # Upload directory aws s3 cp ./mydir s3://mybucket/mydir --recursive # Sync (only changed files) aws s3 sync ./local s3://mybucket/remote # Sync with delete (mirror) aws s3 sync ./local s3://mybucket/remote --delete # Exclude patterns aws s3 sync ./local s3://mybucket/remote --exclude "*.log" --exclude ".git/*" Delete 1 2 3 4 5 6 7 8 # Single file aws s3 rm s3://mybucket/myfile.txt # Directory aws s3 rm s3://mybucket/mydir/ --recursive # Empty bucket aws s3 rm s3://mybucket/ --recursive Presigned URLs 1 2 3 4 5 # Generate download URL (expires in 1 hour) aws s3 presign s3://mybucket/myfile.txt --expires-in 3600 # Upload URL aws s3 presign s3://mybucket/upload.txt --expires-in 3600 Bucket Operations 1 2 3 4 5 6 7 8 # Create bucket aws s3 mb s3://mynewbucket # Delete bucket (must be empty) aws s3 rb s3://mybucket # Force delete (removes contents first) aws s3 rb s3://mybucket --force IAM Operations Users 1 2 3 4 5 6 7 8 9 10 11 # List users aws iam list-users # Create user aws iam create-user --user-name newuser # Delete user aws iam delete-user --user-name olduser # List user's access keys aws iam list-access-keys --user-name myuser Roles 1 2 3 4 5 
6 7 8 # List roles aws iam list-roles # Get role details aws iam get-role --role-name MyRole # List attached policies aws iam list-attached-role-policies --role-name MyRole Policies 1 2 3 4 5 6 7 # List policies aws iam list-policies --scope Local # Get policy document aws iam get-policy-version \ --policy-arn arn:aws:iam::123456789012:policy/MyPolicy \ --version-id v1 CloudWatch Logs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 # List log groups aws logs describe-log-groups # List log streams aws logs describe-log-streams --log-group-name /aws/lambda/myfunction # Get recent logs aws logs get-log-events \ --log-group-name /aws/lambda/myfunction \ --log-stream-name '2024/01/01/[$LATEST]abc123' \ --limit 100 # Tail logs (requires aws-cli v2) aws logs tail /aws/lambda/myfunction --follow # Filter logs aws logs filter-log-events \ --log-group-name /aws/lambda/myfunction \ --filter-pattern "ERROR" \ --start-time $(date -d '1 hour ago' +%s)000 Lambda 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 # List functions aws lambda list-functions # Invoke function aws lambda invoke \ --function-name myfunction \ --payload '{"key": "value"}' \ output.json # Get function config aws lambda get-function-configuration --function-name myfunction # Update function code aws lambda update-function-code \ --function-name myfunction \ --zip-file fileb://function.zip # View recent invocations aws lambda get-function --function-name myfunction RDS 1 2 3 4 5 6 7 8 9 10 11 12 13 14 # List instances aws rds describe-db-instances # Instance details aws rds describe-db-instances --db-instance-identifier mydb \ --query 'DBInstances[0].[DBInstanceIdentifier,DBInstanceStatus,Endpoint.Address]' # Create snapshot aws rds create-db-snapshot \ --db-instance-identifier mydb \ --db-snapshot-identifier mydb-snapshot-$(date +%Y%m%d) # List snapshots aws rds describe-db-snapshots --db-instance-identifier mydb Secrets Manager 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 # List secrets aws secretsmanager 
list-secrets # Get secret value aws secretsmanager get-secret-value --secret-id mysecret \ --query 'SecretString' --output text # Create secret aws secretsmanager create-secret \ --name mysecret \ --secret-string '{"username":"admin","password":"secret"}' # Update secret aws secretsmanager put-secret-value \ --secret-id mysecret \ --secret-string '{"username":"admin","password":"newsecret"}' SSM Parameter Store 1 2 3 4 5 6 7 8 9 10 11 12 # Get parameter aws ssm get-parameter --name /myapp/database/password --with-decryption # Put parameter aws ssm put-parameter \ --name /myapp/database/password \ --value "mysecret" \ --type SecureString \ --overwrite # List parameters by path aws ssm get-parameters-by-path --path /myapp/ --recursive --with-decryption Query and Filter Patterns JMESPath Queries 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 # Select specific fields aws ec2 describe-instances \ --query 'Reservations[].Instances[].[InstanceId,State.Name]' # Filter in query aws ec2 describe-instances \ --query 'Reservations[].Instances[?State.Name==`running`].[InstanceId]' # First result only aws ec2 describe-instances \ --query 'Reservations[0].Instances[0].InstanceId' # Flatten nested arrays aws ec2 describe-instances \ --query 'Reservations[].Instances[].Tags[?Key==`Name`].Value[]' Output Formats 1 2 3 4 5 6 7 8 9 10 11 # JSON (default) aws ec2 describe-instances --output json # Table (human readable) aws ec2 describe-instances --output table # Text (tab-separated, good for scripts) aws ec2 describe-instances --output text # YAML aws ec2 describe-instances --output yaml Scripting Patterns Loop Through Resources 1 2 3 4 5 6 7 8 # Stop all instances with specific tag for id in $(aws ec2 describe-instances \ --filters "Name=tag:Environment,Values=dev" \ --query 'Reservations[].Instances[].InstanceId' \ --output text); do echo "Stopping $id" aws ec2 stop-instances --instance-ids "$id" done Wait for State 1 2 3 4 5 6 7 8 # Wait for instance to be running aws ec2 wait 
instance-running --instance-ids i-1234567890abcdef0 # Wait for instance to stop aws ec2 wait instance-stopped --instance-ids i-1234567890abcdef0 # Wait for snapshot completion aws ec2 wait snapshot-completed --snapshot-ids snap-1234567890abcdef0 Pagination 1 2 3 4 5 6 7 # Auto-pagination (default in CLI v2) aws s3api list-objects-v2 --bucket mybucket # Manual pagination aws s3api list-objects-v2 --bucket mybucket --max-items 100 # Use NextToken from output for next page aws s3api list-objects-v2 --bucket mybucket --starting-token "token..." Useful Aliases 1 2 3 4 5 6 7 8 9 10 # ~/.bashrc or ~/.zshrc # Quick instance list alias ec2ls='aws ec2 describe-instances --query "Reservations[].Instances[].[InstanceId,State.Name,InstanceType,PrivateIpAddress,Tags[?Key==\`Name\`].Value|[0]]" --output table' # Who am I? alias awswho='aws sts get-caller-identity' # S3 bucket sizes alias s3sizes='aws s3 ls | while read _ _ bucket; do aws s3 ls s3://$bucket --recursive --summarize 2>/dev/null | tail -1; echo $bucket; done' Troubleshooting 1 2 3 4 5 6 7 8 9 10 11 # Debug mode aws ec2 describe-instances --debug # Dry run (check permissions without executing) aws ec2 run-instances --dry-run --image-id ami-12345 --instance-type t2.micro # Check CLI version aws --version # Clear credential cache rm -rf ~/.aws/cli/cache/* The AWS CLI rewards muscle memory. Start with the operations you do daily, build aliases for common patterns, and gradually expand. ...

February 25, 2026 · 7 min · 1350 words · Rob Washington

Linux Process Management: From ps to Process Trees

Understanding processes is fundamental to Linux troubleshooting. These tools and techniques will help you find what's running, what's stuck, and what needs to die. Viewing Processes ps - Process Snapshot 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 # All processes (BSD style) ps aux # All processes (Unix style) ps -ef # Process tree ps auxf # Specific columns ps -eo pid,ppid,user,%cpu,%mem,stat,cmd # Find specific process ps aux | grep nginx # By exact name (no grep needed) ps -C nginx # By user ps -u www-data Understanding ps Output USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND www-data 12314 0.2 0.1 146596 9738 ? Ssl 10:24 0:05 nginx: worker PID: Process ID %CPU: CPU usage %MEM: Memory usage VSZ: Virtual memory size RSS: Resident set size (actual RAM) STAT: Process state TIME: CPU time consumed Process States (STAT) R - Running or runnable S - Sleeping (interruptible, waiting for an event) D - Uninterruptible sleep (usually I/O) Z - Zombie (terminated, not reaped by parent) T - Stopped (job control or traced) N - Low priority (nice) s - Session leader l - Multi-threaded top - Real-time View 1 2 3 4 5 6 7 8 9 10 11 # Basic top # Sort by memory top -o %MEM # Specific user top -u www-data # Batch mode (for scripts) top -b -n 1 Inside top: ...

February 25, 2026 · 9 min · 1733 words · Rob Washington

Terraform State Management: Patterns for Teams

Terraform state is the source of truth for your infrastructure. Mismanage it, and you'll have drift, conflicts, and 3 AM incidents. These patterns keep state safe and teams productive. Why State Matters Terraform state maps your configuration to real resources. Without it, Terraform can't: Know what already exists Calculate what needs to change Detect drift from desired state Default local state (terraform.tfstate) breaks down quickly: Can't collaborate (who has the latest?) No locking (concurrent runs = corruption) No history (oops, we deleted production) Remote Backend: S3 + DynamoDB The standard pattern for AWS teams: ...

February 25, 2026 · 7 min · 1487 words · Rob Washington

Nginx Configuration: From Basics to Production Hardening

Nginx powers a significant portion of the internet. These configurations will help you move from default installs to production-ready setups. Basic Structure 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 # /etc/nginx/nginx.conf user www-data; worker_processes auto; pid /run/nginx.pid; events { worker_connections 1024; multi_accept on; } http { include /etc/nginx/mime.types; default_type application/octet-stream; # Logging access_log /var/log/nginx/access.log; error_log /var/log/nginx/error.log; # Performance sendfile on; tcp_nopush on; tcp_nodelay on; keepalive_timeout 65; # Gzip gzip on; gzip_types text/plain text/css application/json application/javascript; # Virtual hosts include /etc/nginx/conf.d/*.conf; include /etc/nginx/sites-enabled/*; } Basic Server Block 1 2 3 4 5 6 7 8 9 10 11 12 13 # /etc/nginx/sites-available/example.com server { listen 80; listen [::]:80; server_name example.com www.example.com; root /var/www/example.com; index index.html; location / { try_files $uri $uri/ =404; } } SSL/TLS Configuration 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 server { listen 443 ssl http2; listen [::]:443 ssl http2; server_name example.com; # Certificates ssl_certificate /etc/letsencrypt/live/example.com/fullchain.pem; ssl_certificate_key /etc/letsencrypt/live/example.com/privkey.pem; # Modern SSL configuration ssl_protocols TLSv1.2 TLSv1.3; ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256; ssl_prefer_server_ciphers off; # SSL session caching ssl_session_cache shared:SSL:10m; ssl_session_timeout 1d; ssl_session_tickets off; # OCSP stapling ssl_stapling on; ssl_stapling_verify on; resolver 8.8.8.8 8.8.4.4 valid=300s; root /var/www/example.com; } # Redirect HTTP to HTTPS server { listen 80; listen [::]:80; server_name example.com www.example.com; return 301 https://$server_name$request_uri; } Reverse Proxy Basic Proxy 1 2 3 4 5 6 7 8 9 10 11 12 13 server { 
listen 80; server_name api.example.com; location / { proxy_pass http://localhost:3000; proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; } } WebSocket Support 1 2 3 4 5 6 7 8 location /ws { proxy_pass http://localhost:3000; proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; proxy_set_header Host $host; proxy_read_timeout 86400; } Proxy with Buffering Control 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 location /api { proxy_pass http://backend; # Disable buffering for streaming proxy_buffering off; # Or tune buffer sizes proxy_buffer_size 4k; proxy_buffers 8 4k; proxy_busy_buffers_size 8k; # Timeouts proxy_connect_timeout 60s; proxy_send_timeout 60s; proxy_read_timeout 60s; } Load Balancing 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 upstream backend { # Round robin (default) server 10.0.0.1:3000; server 10.0.0.2:3000; server 10.0.0.3:3000; # Weighted server 10.0.0.1:3000 weight=3; server 10.0.0.2:3000 weight=1; # Backup server server 10.0.0.4:3000 backup; # Health checks server 10.0.0.1:3000 max_fails=3 fail_timeout=30s; } # Alternative algorithms upstream backend_least_conn { least_conn; server 10.0.0.1:3000; server 10.0.0.2:3000; } upstream backend_ip_hash { ip_hash; # Session persistence server 10.0.0.1:3000; server 10.0.0.2:3000; } server { location / { proxy_pass http://backend; } } Caching Proxy Cache 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 http { # Define cache zone proxy_cache_path /var/cache/nginx levels=1:2 keys_zone=my_cache:10m max_size=1g inactive=60m use_temp_path=off; server { location / { proxy_pass http://backend; proxy_cache my_cache; proxy_cache_valid 200 60m; proxy_cache_valid 404 1m; proxy_cache_use_stale error timeout updating; # Cache key proxy_cache_key 
"$scheme$request_method$host$request_uri"; # Add header to show cache status add_header X-Cache-Status $upstream_cache_status; } # Bypass cache for specific requests location /api { proxy_pass http://backend; proxy_cache my_cache; proxy_cache_bypass $http_authorization; proxy_no_cache $http_authorization; } } } Static File Caching 1 2 3 4 5 location ~* \.(jpg|jpeg|png|gif|ico|css|js|woff2)$ { expires 30d; add_header Cache-Control "public, immutable"; access_log off; } Rate Limiting 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 http { # Define rate limit zones limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s; limit_req_zone $binary_remote_addr zone=login_limit:10m rate=1r/s; server { # Apply to API endpoints location /api/ { limit_req zone=api_limit burst=20 nodelay; proxy_pass http://backend; } # Stricter limit for login location /login { limit_req zone=login_limit burst=5; proxy_pass http://backend; } } } Connection Limiting 1 2 3 4 5 6 7 http { limit_conn_zone $binary_remote_addr zone=conn_limit:10m; server { limit_conn conn_limit 10; # Max 10 connections per IP } } Security Headers 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 server { # Security headers add_header X-Frame-Options "SAMEORIGIN" always; add_header X-Content-Type-Options "nosniff" always; add_header X-XSS-Protection "1; mode=block" always; add_header Referrer-Policy "strict-origin-when-cross-origin" always; add_header Content-Security-Policy "default-src 'self';" always; # HSTS (only enable after confirming SSL works) add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; # Hide nginx version server_tokens off; # Prevent access to hidden files location ~ /\. 
{ deny all; } } Access Control IP-based 1 2 3 4 5 6 7 location /admin { allow 10.0.0.0/8; allow 192.168.1.0/24; deny all; proxy_pass http://backend; } Basic Authentication 1 2 3 4 5 6 location /admin { auth_basic "Admin Area"; auth_basic_user_file /etc/nginx/.htpasswd; proxy_pass http://backend; } Create password file: ...

February 25, 2026 Β· 6 min Β· 1232 words Β· Rob Washington

Redis Patterns: Beyond Simple Key-Value Caching

Redis is often introduced as β€œjust a cache,” but it’s a versatile data structure server. These patterns unlock its full potential. Connection Basics 1 2 3 4 5 6 7 8 9 10 11 # Connect redis-cli -h localhost -p 6379 # With password redis-cli -h localhost -p 6379 -a yourpassword # Select database (0-15) SELECT 1 # Check connectivity PING Caching Patterns Basic Cache with TTL 1 2 3 4 5 6 7 8 9 10 11 # Set with expiration (seconds) SET user:123:profile '{"name":"Alice"}' EX 3600 # Set with expiration (milliseconds) SET session:abc123 '{"user_id":123}' PX 86400000 # Set only if not exists SETNX cache:key "value" # Set only if exists (update) SET cache:key "newvalue" XX Cache-Aside Pattern 1 2 3 4 5 6 7 8 9 10 11 12 def get_user(user_id): # Check cache first cached = redis.get(f"user:{user_id}") if cached: return json.loads(cached) # Cache miss - fetch from database user = db.query("SELECT * FROM users WHERE id = %s", user_id) # Store in cache redis.setex(f"user:{user_id}", 3600, json.dumps(user)) return user Write-Through Pattern 1 2 3 4 5 6 def update_user(user_id, data): # Update database db.execute("UPDATE users SET ... 
WHERE id = %s", user_id) # Update cache immediately redis.setex(f"user:{user_id}", 3600, json.dumps(data)) Cache Stampede Prevention 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 def get_with_lock(key, fetch_func, ttl=3600, lock_ttl=10): value = redis.get(key) if value: return json.loads(value) lock_key = f"lock:{key}" # Try to acquire lock if redis.set(lock_key, "1", nx=True, ex=lock_ttl): try: value = fetch_func() redis.setex(key, ttl, json.dumps(value)) return value finally: redis.delete(lock_key) else: # Another process is fetching, wait and retry time.sleep(0.1) return get_with_lock(key, fetch_func, ttl, lock_ttl) Session Storage 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 import secrets def create_session(user_id, ttl=86400): session_id = secrets.token_urlsafe(32) session_data = { "user_id": user_id, "created_at": time.time() } redis.setex(f"session:{session_id}", ttl, json.dumps(session_data)) return session_id def get_session(session_id): data = redis.get(f"session:{session_id}") return json.loads(data) if data else None def extend_session(session_id, ttl=86400): redis.expire(f"session:{session_id}", ttl) def destroy_session(session_id): redis.delete(f"session:{session_id}") Rate Limiting Fixed Window 1 2 3 4 5 6 7 8 def is_rate_limited(user_id, limit=100, window=60): key = f"ratelimit:{user_id}:{int(time.time() // window)}" current = redis.incr(key) if current == 1: redis.expire(key, window) return current > limit Sliding Window with Sorted Sets 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 def is_rate_limited_sliding(user_id, limit=100, window=60): key = f"ratelimit:{user_id}" now = time.time() window_start = now - window pipe = redis.pipeline() # Remove old entries pipe.zremrangebyscore(key, 0, window_start) # Add current request pipe.zadd(key, {str(now): now}) # Count requests in window pipe.zcard(key) # Set expiration pipe.expire(key, window) results = pipe.execute() request_count = results[2] return request_count > limit Token Bucket 1 2 3 
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 def check_token_bucket(user_id, capacity=10, refill_rate=1): key = f"bucket:{user_id}" now = time.time() # Get current state data = redis.hgetall(key) if data: tokens = float(data[b'tokens']) last_update = float(data[b'last_update']) # Refill tokens based on elapsed time elapsed = now - last_update tokens = min(capacity, tokens + elapsed * refill_rate) else: tokens = capacity if tokens >= 1: # Consume a token redis.hset(key, mapping={ 'tokens': tokens - 1, 'last_update': now }) redis.expire(key, int(capacity / refill_rate) + 1) return True return False Queues and Pub/Sub Simple Queue with Lists 1 2 3 4 5 6 7 8 9 10 # Producer def enqueue(queue_name, message): redis.lpush(queue_name, json.dumps(message)) # Consumer (blocking) def dequeue(queue_name, timeout=0): result = redis.brpop(queue_name, timeout) if result: return json.loads(result[1]) return None Reliable Queue with RPOPLPUSH 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 def reliable_dequeue(queue_name, processing_queue): # Move item to processing queue atomically item = redis.rpoplpush(queue_name, processing_queue) return json.loads(item) if item else None def ack(processing_queue, item): # Remove from processing queue when done redis.lrem(processing_queue, 1, json.dumps(item)) def requeue_failed(processing_queue, queue_name): # Move failed items back to main queue while True: item = redis.rpoplpush(processing_queue, queue_name) if not item: break Pub/Sub 1 2 3 4 5 6 7 8 9 10 11 12 # Publisher def publish_event(channel, event): redis.publish(channel, json.dumps(event)) # Subscriber def subscribe(channel, callback): pubsub = redis.pubsub() pubsub.subscribe(channel) for message in pubsub.listen(): if message['type'] == 'message': callback(json.loads(message['data'])) Leaderboards with Sorted Sets 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 def add_score(leaderboard, user_id, score): redis.zadd(leaderboard, {user_id: score}) def 
increment_score(leaderboard, user_id, amount): redis.zincrby(leaderboard, amount, user_id) def get_rank(leaderboard, user_id): # 0-indexed, reverse order (highest first) rank = redis.zrevrank(leaderboard, user_id) return rank + 1 if rank is not None else None def get_top(leaderboard, count=10): return redis.zrevrange(leaderboard, 0, count - 1, withscores=True) def get_around_user(leaderboard, user_id, count=5): rank = redis.zrevrank(leaderboard, user_id) if rank is None: return [] start = max(0, rank - count) end = rank + count return redis.zrevrange(leaderboard, start, end, withscores=True) Distributed Locks 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 import uuid class RedisLock: def __init__(self, redis_client, key, ttl=10): self.redis = redis_client self.key = f"lock:{key}" self.ttl = ttl self.token = str(uuid.uuid4()) def acquire(self, blocking=True, timeout=None): start = time.time() while True: if self.redis.set(self.key, self.token, nx=True, ex=self.ttl): return True if not blocking: return False if timeout and (time.time() - start) > timeout: return False time.sleep(0.1) def release(self): # Only release if we own the lock script = """ if redis.call("get", KEYS[1]) == ARGV[1] then return redis.call("del", KEYS[1]) else return 0 end """ self.redis.eval(script, 1, self.key, self.token) def __enter__(self): self.acquire() return self def __exit__(self, *args): self.release() # Usage with RedisLock(redis, "my-resource"): # Critical section do_work() Counting and Analytics HyperLogLog for Unique Counts 1 2 3 4 5 6 7 8 9 10 11 # Count unique visitors (memory efficient) def track_visitor(page, visitor_id): redis.pfadd(f"visitors:{page}:{date.today()}", visitor_id) def get_unique_visitors(page, date): return redis.pfcount(f"visitors:{page}:{date}") # Merge multiple days def get_weekly_uniques(page): keys = [f"visitors:{page}:{date}" for date in last_7_days()] return redis.pfcount(*keys) 
Bitmaps for Daily Active Users 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 def mark_active(user_id, date=None): date = date or date.today().isoformat() redis.setbit(f"active:{date}", user_id, 1) def was_active(user_id, date): return redis.getbit(f"active:{date}", user_id) == 1 def count_active(date): return redis.bitcount(f"active:{date}") # Users active on multiple days def active_all_days(dates): keys = [f"active:{d}" for d in dates] result_key = "temp:active_intersection" redis.bitop("AND", result_key, *keys) count = redis.bitcount(result_key) redis.delete(result_key) return count Expiration Strategies 1 2 3 4 5 6 7 8 9 10 11 12 # Set TTL EXPIRE key 3600 EXPIREAT key 1735689600 # Unix timestamp # Check TTL TTL key # Returns -1 if no expiry, -2 if doesn't exist # Remove expiration PERSIST key # Set value and TTL atomically SETEX key 3600 "value" Lazy Expiration Pattern 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 def get_with_soft_expire(key, ttl=3600, soft_ttl=300): """ Returns cached value but triggers background refresh if within soft_ttl of expiration. 
""" pipe = redis.pipeline() pipe.get(key) pipe.ttl(key) value, remaining_ttl = pipe.execute() if value and remaining_ttl < soft_ttl: # Trigger async refresh refresh_cache_async.delay(key) return value Transactions and Lua Scripts Pipeline (Batching) 1 2 3 4 pipe = redis.pipeline() for i in range(1000): pipe.set(f"key:{i}", f"value:{i}") pipe.execute() # Single round trip Transaction with WATCH 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 def transfer(from_account, to_account, amount): with redis.pipeline() as pipe: while True: try: # Watch for changes pipe.watch(from_account, to_account) from_balance = int(pipe.get(from_account) or 0) if from_balance < amount: pipe.unwatch() return False # Start transaction pipe.multi() pipe.decrby(from_account, amount) pipe.incrby(to_account, amount) pipe.execute() return True except redis.WatchError: # Retry if watched keys changed continue Lua Script (Atomic Operations) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 # Rate limiter as Lua script RATE_LIMIT_SCRIPT = """ local key = KEYS[1] local limit = tonumber(ARGV[1]) local window = tonumber(ARGV[2]) local current = redis.call('INCR', key) if current == 1 then redis.call('EXPIRE', key, window) end if current > limit then return 0 else return 1 end """ rate_limit = redis.register_script(RATE_LIMIT_SCRIPT) def check_rate_limit(user_id, limit=100, window=60): key = f"ratelimit:{user_id}:{int(time.time() // window)}" return rate_limit(keys=[key], args=[limit, window]) == 1 Monitoring 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 # Real-time commands MONITOR # Stats INFO INFO memory INFO stats # Slow queries SLOWLOG GET 10 # Connected clients CLIENT LIST # Memory usage for a key MEMORY USAGE mykey Redis excels when you match the right data structure to your problem. Lists for queues, sorted sets for leaderboards, HyperLogLog for counting uniquesβ€”each has its sweet spot. ...

February 25, 2026 Β· 7 min Β· 1467 words Β· Rob Washington

PostgreSQL Performance Tuning: From Slow Queries to Snappy Responses

PostgreSQL is fast out of the box. But β€œfast enough for development” and β€œfast enough for production” are different conversations. These techniques will help you find and fix performance bottlenecks. Finding Slow Queries Enable Query Logging 1 2 3 4 5 6 -- Log queries slower than 500ms ALTER SYSTEM SET log_min_duration_statement = '500ms'; SELECT pg_reload_conf(); -- Check current setting SHOW log_min_duration_statement; pg_stat_statements Extension The most valuable performance tool: ...

February 25, 2026 Β· 9 min Β· 1770 words Β· Rob Washington

Ansible Playbook Patterns: Idempotent Infrastructure Done Right

Ansible’s simplicity is deceptive. Anyone can write a playbook that works once. Writing playbooks that work reliably, repeatedly, and maintainably requires discipline and patterns. Project Structure

ansible/
β”œβ”€β”€ inventory/
β”‚   β”œβ”€β”€ production/
β”‚   β”‚   β”œβ”€β”€ hosts.yml
β”‚   β”‚   β”œβ”€β”€ group_vars/
β”‚   β”‚   └── host_vars/
β”‚   └── staging/
β”‚       └── hosts.yml
β”œβ”€β”€ playbooks/
β”‚   β”œβ”€β”€ site.yml
β”‚   β”œβ”€β”€ webservers.yml
β”‚   └── databases.yml
β”œβ”€β”€ roles/
β”‚   β”œβ”€β”€ common/
β”‚   β”œβ”€β”€ nginx/
β”‚   └── postgresql/
└── ansible.cfg

Inventory Best Practices 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 # inventory/production/hosts.yml all: children: webservers: hosts: web1.example.com: web2.example.com: vars: http_port: 80 databases: hosts: db1.example.com: postgres_role: primary db2.example.com: postgres_role: replica loadbalancers: hosts: lb1.example.com: vars: ansible_user: deploy ansible_python_interpreter: /usr/bin/python3 Role Structure

roles/nginx/
β”œβ”€β”€ defaults/
β”‚   └── main.yml        # Defaults (lowest priority)
β”œβ”€β”€ vars/
β”‚   └── main.yml        # Role variables (higher priority)
β”œβ”€β”€ tasks/
β”‚   └── main.yml        # Entry point
β”œβ”€β”€ handlers/
β”‚   └── main.yml        # Service restarts
β”œβ”€β”€ templates/
β”‚   └── nginx.conf.j2
β”œβ”€β”€ files/
β”œβ”€β”€ meta/
β”‚   └── main.yml        # Dependencies
└── molecule/           # Tests

Task Patterns Always Name Tasks 1 2 3 4 5 6 7 8 9 10 # Bad - apt: name: nginx state: present # Good - name: Install nginx apt: name: nginx state: present Use Block for Related Tasks 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 - name: Configure SSL block: - name: Copy SSL certificate copy: src: "{{ ssl_cert }}" dest: /etc/ssl/certs/ mode: '0644' - name: Copy SSL private key copy: src: "{{ ssl_key }}" dest: /etc/ssl/private/ mode: '0600' - name: Enable SSL site file: src: /etc/nginx/sites-available/ssl.conf dest: /etc/nginx/sites-enabled/ssl.conf state: link notify: Reload nginx when: ssl_enabled | bool Error Handling 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 - name: Deploy application block: - name: Pull latest code git: repo: "{{ app_repo }}" dest: "{{ app_path }}" version: "{{ app_version }}" register: git_result - name: Run migrations command: ./manage.py migrate args: chdir: "{{ app_path }}" when: git_result.changed rescue: - name: Rollback to previous version git: repo: "{{ app_repo }}" dest: "{{ app_path }}" version: "{{ previous_version }}" - name: Notify failure slack: token: "{{ slack_token }}" msg: "Deploy failed on {{ inventory_hostname }}" always: - name: Clean up temp files file: path: /tmp/deploy state: absent Variables Variable Precedence (use intentionally) 1 2 3 4 5 6 7 8 9 10 11 12 13 # defaults/main.yml - Easily overridden defaults nginx_worker_processes: auto nginx_worker_connections: 1024 # vars/main.yml - Role-specific constants nginx_user: www-data nginx_conf_path: /etc/nginx/nginx.conf # group_vars/webservers.yml - Group-specific nginx_worker_connections: 4096 # host_vars/web1.yml - Host-specific nginx_worker_processes: 4 Variable Validation 1 2 3 4
5 6 7 8 - name: Validate required variables assert: that: - app_version is defined - app_version | length > 0 - db_password is defined fail_msg: "Required variables are not set" success_msg: "All required variables present" Default Values 1 2 3 4 5 6 7 8 - name: Set configuration template: src: config.j2 dest: /etc/app/config.yml vars: max_connections: "{{ app_max_connections | default(100) }}" timeout: "{{ app_timeout | default(30) }}" debug: "{{ app_debug | default(false) | bool }}" Handlers 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 # handlers/main.yml - name: Reload nginx service: name: nginx state: reloaded listen: "reload web server" - name: Restart nginx service: name: nginx state: restarted listen: "restart web server" # In tasks - name: Update nginx config template: src: nginx.conf.j2 dest: /etc/nginx/nginx.conf notify: "reload web server" Flush Handlers When Needed 1 2 3 4 5 6 7 8 9 10 11 12 13 14 - name: Install nginx apt: name: nginx state: present notify: Start nginx - name: Flush handlers meta: flush_handlers - name: Configure nginx template: src: nginx.conf.j2 dest: /etc/nginx/nginx.conf # nginx is now guaranteed to be running Conditionals Clean Conditionals 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 # Use bool filter for string booleans - name: Enable debug mode template: src: debug.conf.j2 dest: /etc/app/debug.conf when: debug_mode | bool # Check if variable is defined and not empty - name: Set custom config copy: content: "{{ custom_config }}" dest: /etc/app/custom.conf when: - custom_config is defined - custom_config | length > 0 # Multiple conditions - name: Deploy to production include_tasks: deploy.yml when: - env == 'production' - deploy_enabled | bool - inventory_hostname in groups['webservers'] Loops Modern Loop Syntax 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 # Simple loop - name: Install packages apt: name: "{{ item }}" state: present loop: - nginx - python3 
- htop # Better: Install all at once - name: Install packages apt: name: - nginx - python3 - htop state: present # Loop with index - name: Create users user: name: "{{ item.name }}" uid: "{{ 1000 + index }}" groups: "{{ item.groups }}" loop: "{{ users }}" loop_control: index_var: index label: "{{ item.name }}" # Cleaner output # Dict loop - name: Configure services template: src: "{{ item.key }}.conf.j2" dest: "/etc/{{ item.key }}/config.conf" loop: "{{ services | dict2items }}" when: item.value.enabled | bool Templates Jinja2 Best Practices 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 {# templates/nginx.conf.j2 #} # Managed by Ansible - DO NOT EDIT # Last updated: {{ ansible_date_time.iso8601 }} # Host: {{ inventory_hostname }} worker_processes {{ nginx_worker_processes }}; events { worker_connections {{ nginx_worker_connections }}; } http { {% for server in nginx_servers %} server { listen {{ server.port | default(80) }}; server_name {{ server.name }}; {% if server.ssl | default(false) %} ssl_certificate {{ server.ssl_cert }}; ssl_certificate_key {{ server.ssl_key }}; {% endif %} {% for location in server.locations | default([]) %} location {{ location.path }} { {{ location.directive }}; } {% endfor %} } {% endfor %} } Template Validation 1 2 3 4 5 6 - name: Generate nginx config template: src: nginx.conf.j2 dest: /etc/nginx/nginx.conf validate: nginx -t -c %s notify: Reload nginx Idempotency Patterns Check Mode Support 1 2 3 4 5 6 7 8 9 10 11 - name: Get current version command: cat /opt/app/VERSION register: current_version changed_when: false check_mode: false # Always run, even in check mode - name: Deploy new version unarchive: src: "app-{{ target_version }}.tar.gz" dest: /opt/app/ when: current_version.stdout != target_version Custom Changed Conditions 1 2 3 4 5 - name: Run database migration command: ./manage.py migrate --check register: migration_check changed_when: "'No migrations to apply' not in 
migration_check.stdout" failed_when: migration_check.rc not in [0, 1] Avoid Command When Possible 1 2 3 4 5 6 7 8 9 10 # Bad - not idempotent - name: Create directory command: mkdir -p /opt/app # Good - idempotent - name: Create directory file: path: /opt/app state: directory mode: '0755' Secrets Management Ansible Vault 1 2 3 4 5 6 7 8 9 10 11 # Create encrypted file ansible-vault create secrets.yml # Edit encrypted file ansible-vault edit secrets.yml # Use in playbook ansible-playbook site.yml --ask-vault-pass # Or with password file ansible-playbook site.yml --vault-password-file ~/.vault_pass 1 2 3 4 5 6 7 8 # Encrypted variables file # group_vars/all/vault.yml vault_db_password: !vault | $ANSIBLE_VAULT;1.1;AES256 ... # Reference in playbook db_password: "{{ vault_db_password }}" No Secrets in Logs 1 2 3 4 5 - name: Set database password mysql_user: name: app password: "{{ db_password }}" no_log: true Performance Gather Facts Selectively 1 2 3 4 5 6 7 8 9 10 11 - hosts: webservers gather_facts: false tasks: - name: Quick task without facts ping: # Or gather specific facts - hosts: webservers gather_subset: - network - hardware Async for Long Tasks 1 2 3 4 5 6 7 8 9 10 11 12 13 - name: Run long backup command: /opt/scripts/backup.sh async: 3600 # 1 hour timeout poll: 0 # Don't wait register: backup_job - name: Check backup status async_status: jid: "{{ backup_job.ansible_job_id }}" register: job_result until: job_result.finished retries: 60 delay: 60 Limit Concurrent Execution 1 2 3 4 - hosts: webservers serial: 2 # Two hosts at a time # Or percentage: serial: "25%" # Or batches: serial: [1, 5, 10] Testing with Molecule 1 2 3 4 5 6 7 8 9 10 11 12 13 # molecule/default/molecule.yml dependency: name: galaxy driver: name: docker platforms: - name: instance image: ubuntu:22.04 pre_build_image: true provisioner: name: ansible verifier: name: ansible 1 2 3 4 5 6 7 8 9 10 11 # molecule/default/verify.yml - name: Verify hosts: all tasks: - name: Check nginx is running 
service: name: nginx state: started check_mode: true register: result failed_when: result.changed 1 2 # Run tests molecule test Good Ansible is boring Ansible. No surprises, no side effects, same result every time. When your playbooks are truly idempotent, running them becomes a confidence-builder rather than a risk. ...

February 25, 2026 Β· 10 min Β· 2060 words Β· Rob Washington