It’s 3 AM. Production is down. You’re grepping through logs, but everything is INFO: Processing request... followed by ERROR: Something went wrong.
Good luck.
Here’s how to log like you’ll actually need to debug this later.
The Basics#
Log Levels#
Use them correctly:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
| import logging
logger = logging.getLogger(__name__)
# DEBUG: Detailed diagnostic info (dev only)
logger.debug(f"Processing user {user_id}, cart items: {cart}")
# INFO: Normal operation events
logger.info(f"Order {order_id} created for user {user_id}")
# WARNING: Unexpected but handled
logger.warning(f"Payment retry {attempt}/3 for order {order_id}")
# ERROR: Something failed
logger.error(f"Payment failed for order {order_id}: {error}")
# CRITICAL: System is unusable
logger.critical("Database connection pool exhausted")
|
The rule: Could someone be paged for this at 3 AM?
- Yes → ERROR or CRITICAL
- No, but it's unexpected and worth investigating tomorrow → WARNING
- No → INFO or below
Structured Logging#
Stop logging strings. Log data:
1
2
3
4
5
6
7
8
9
10
| # Bad
logger.info(f"User {user_id} placed order {order_id} for ${total}")
# Good
logger.info("order_created", extra={
"user_id": user_id,
"order_id": order_id,
"total": total,
"items": len(cart)
})
|
With structlog:
1
2
3
4
5
6
7
8
9
10
11
| import structlog
logger = structlog.get_logger()
logger.info(
"order_created",
user_id=user_id,
order_id=order_id,
total=total,
items=len(cart)
)
|
Output (JSON):
1
2
3
4
5
6
7
8
9
| {
"event": "order_created",
"user_id": 123,
"order_id": "abc-456",
"total": 99.99,
"items": 3,
"timestamp": "2026-02-10T22:30:00Z",
"level": "info"
}
|
Now you can query your logs like a database: “Show me all orders over $100” — and by combining fields with the log level, “Show me all payments over $100 that failed.”
Correlation IDs#
Track requests across services:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
| import uuid
from contextvars import ContextVar
request_id: ContextVar[str] = ContextVar('request_id', default='')
# Middleware
@app.before_request
def set_request_id():
req_id = request.headers.get('X-Request-ID', str(uuid.uuid4()))
request_id.set(req_id)
# Add to all logs automatically
structlog.configure(
processors=[
structlog.contextvars.merge_contextvars,
structlog.processors.add_log_level,
structlog.processors.TimeStamper(fmt="iso"),
structlog.processors.JSONRenderer()
],
context_class=dict,
)
# Now every log includes request_id
structlog.contextvars.bind_contextvars(request_id=request_id.get())
|
Pass it to downstream services:
1
2
3
4
5
6
7
| def call_payment_service(order):
response = requests.post(
"http://payments/charge",
json=order,
headers={"X-Request-ID": request_id.get()}
)
return response
|
Now you can trace a request across your entire system.
What to Log#
Do Log#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
| # Request boundaries
logger.info("request_started", method="POST", path="/orders")
logger.info("request_completed", status=201, duration_ms=145)
# Business events
logger.info("user_registered", user_id=123, plan="premium")
logger.info("payment_processed", order_id="abc", amount=99.99)
# Decisions
logger.info("cache_miss", key="user:123:profile")
logger.info("rate_limit_applied", user_id=123, limit=100)
# Failures with context
logger.error("payment_failed",
order_id="abc",
error_code="card_declined",
retry_count=2
)
|
Don’t Log#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
| # Secrets
logger.info(f"Connecting with password {password}") # NO
# PII without consent
logger.info(f"User email: {email}") # Careful
# High-volume repetitive noise
for item in million_items:
logger.debug(f"Processing {item}") # NO
# Stack traces for expected errors
try:
user = get_user(id)
except UserNotFound:
logger.exception("User not found") # Overkill
|
Log Aggregation#
Ship logs to a central system:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
| # Using Python logging with JSON
import logging
import json
class JSONFormatter(logging.Formatter):
def format(self, record):
log_record = {
"timestamp": self.formatTime(record),
"level": record.levelname,
"message": record.getMessage(),
"logger": record.name,
"module": record.module,
}
if hasattr(record, 'extra'):
log_record.update(record.extra)
if record.exc_info:
log_record["exception"] = self.formatException(record.exc_info)
return json.dumps(log_record)
# Configure
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logging.root.addHandler(handler)
|
Docker logging:
1
2
3
4
5
6
7
8
| # docker-compose.yml
services:
app:
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
|
Ship to Loki/Elasticsearch:
1
2
3
4
5
6
7
8
9
| # promtail config for Loki
scrape_configs:
- job_name: containers
static_configs:
- targets:
- localhost
labels:
job: docker
__path__: /var/lib/docker/containers/*/*log
|
Debugging Patterns#
The Breadcrumb Trail#
Log entry and exit:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
| def process_order(order_id):
logger.info("process_order_started", order_id=order_id)
try:
order = fetch_order(order_id)
logger.debug("order_fetched", status=order.status)
if order.needs_payment:
logger.info("payment_required", amount=order.total)
charge_result = process_payment(order)
logger.info("payment_completed",
transaction_id=charge_result.id)
fulfill_order(order)
logger.info("process_order_completed", order_id=order_id)
except PaymentError as e:
logger.error("process_order_failed",
order_id=order_id,
stage="payment",
error=str(e))
raise
|
Timing#
Track where time goes:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
| import time
from contextlib import contextmanager
@contextmanager
def log_timing(operation, **context):
start = time.perf_counter()
try:
yield
finally:
duration_ms = (time.perf_counter() - start) * 1000
logger.info(f"{operation}_completed",
duration_ms=round(duration_ms, 2),
**context)
# Usage
with log_timing("database_query", table="orders"):
orders = db.query(Order).filter(Order.status == "pending").all()
with log_timing("external_api_call", service="stripe"):
result = stripe.charges.create(amount=1000)
|
Sampling#
For high-volume paths:
1
2
3
4
5
6
7
8
9
10
| import random
def should_log_sample(rate=0.01):
return random.random() < rate
# Log 1% of requests in detail
if should_log_sample(0.01):
logger.debug("request_details",
headers=dict(request.headers),
body=request.get_json())
|
Production Debugging Checklist#
When something breaks:
- Find the request ID from user report or error tracker
- Search logs for that ID across all services
- Look for the last successful log before the error
- Check timing — did something take too long?
- Find related requests — is it affecting multiple users?
- Check for patterns — specific user? specific endpoint?
1
2
3
4
# Grep examples (patterns match the JSON log format shown above)
grep '"request_id": "abc-123"' /var/log/app/*.log
grep '"level": "error"' /var/log/app/*.log | grep "2026-02-10T22"
jq 'select(.level=="error")' /var/log/app/app.log
|
The Log Review#
Periodically review your logs:
1
2
3
4
5
6
| # Questions to ask:
# 1. If this errors, do I have enough context to debug?
# 2. Am I logging PII I shouldn't be?
# 3. Is anything too noisy? (>1000 logs/min for a single event)
# 4. Can I trace a request end-to-end?
# 5. Do my error logs include the "why"?
|
Quick Setup#
Python with structlog:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
| import structlog
import logging
import sys
structlog.configure(
processors=[
structlog.stdlib.filter_by_level,
structlog.stdlib.add_logger_name,
structlog.stdlib.add_log_level,
structlog.stdlib.PositionalArgumentsFormatter(),
structlog.processors.TimeStamper(fmt="iso"),
structlog.processors.StackInfoRenderer(),
structlog.processors.format_exc_info,
structlog.processors.UnicodeDecoder(),
structlog.processors.JSONRenderer()
],
context_class=dict,
logger_factory=structlog.stdlib.LoggerFactory(),
wrapper_class=structlog.stdlib.BoundLogger,
cache_logger_on_first_use=True,
)
logging.basicConfig(
format="%(message)s",
stream=sys.stdout,
level=logging.INFO,
)
|
The Bottom Line#
Log for the person debugging at 3 AM. That person is future you.
Every log should answer: What happened? To whom? When? Why does it matter?
Got a logging horror story? Share it on Twitter.