awk is a programming language disguised as a command-line tool. It processes text line by line, splitting each into fields. Most tasks need just one line.

The Basics

# Print entire line
awk '{print}' file.txt

# Print specific field (space-delimited)
awk '{print $1}' file.txt      # First field
awk '{print $2}' file.txt      # Second field
awk '{print $NF}' file.txt     # Last field
awk '{print $(NF-1)}' file.txt # Second to last

# Print multiple fields
awk '{print $1, $3}' file.txt

# Custom output format
awk '{print $1 " -> " $2}' file.txt

Field Separators

# Colon-separated (like /etc/passwd)
awk -F: '{print $1}' /etc/passwd

# Tab-separated
awk -F'\t' '{print $2}' data.tsv

# Multiple separators
awk -F'[,;]' '{print $1}' file.txt

# Set output separator
awk -F: 'BEGIN{OFS=","} {print $1,$3}' /etc/passwd

Filtering Lines

# Lines matching pattern
awk '/error/' logfile.txt

# Lines NOT matching pattern
awk '!/debug/' logfile.txt

# Field matches value
awk '$3 == "ERROR"' logfile.txt

# Numeric comparison
awk '$2 > 100' data.txt

# Multiple conditions
awk '$2 > 100 && $3 == "active"' data.txt

# Line number range
awk 'NR >= 10 && NR <= 20' file.txt

Built-in Variables

NR    # Current line number (total)
NF    # Number of fields in current line
FNR   # Line number in current file
FS    # Field separator (input)
OFS   # Output field separator
RS    # Record separator (default: newline)
ORS   # Output record separator
# Print line numbers
awk '{print NR, $0}' file.txt

# Print lines with more than 3 fields
awk 'NF > 3' file.txt

# Print total lines at end
awk 'END{print NR}' file.txt

Arithmetic

# Sum a column
awk '{sum += $2} END{print sum}' data.txt

# Average
awk '{sum += $2; count++} END{print sum/count}' data.txt

# Min/Max
awk 'NR==1{min=max=$2} $2>max{max=$2} $2<min{min=$2} END{print min, max}' data.txt

# Calculate percentage
awk '{print $1, $2, ($2/$3)*100 "%"}' data.txt

String Operations

# Length of field
awk '{print length($1)}' file.txt

# Substring
awk '{print substr($1, 1, 3)}' file.txt  # First 3 chars

# Convert case
awk '{print toupper($1)}' file.txt
awk '{print tolower($1)}' file.txt

# String concatenation
awk '{print $1 $2}' file.txt      # No space
awk '{print $1 " " $2}' file.txt  # With space

# Split string
awk '{split($1, arr, "-"); print arr[1]}' file.txt

Conditional Logic

# If-else
awk '{if ($2 > 100) print "high"; else print "low"}' data.txt

# Ternary
awk '{print ($2 > 100 ? "high" : "low")}' data.txt

# Multiple conditions
awk '{
    if ($2 > 100) status = "high"
    else if ($2 > 50) status = "medium"
    else status = "low"
    print $1, status
}' data.txt

BEGIN and END

# Header and footer
awk 'BEGIN{print "Name\tScore"} {print $1"\t"$2} END{print "---\nTotal: " NR}' data.txt

# Initialize variables
awk 'BEGIN{count=0} /error/{count++} END{print count " errors"}' logfile.txt

Practical One-Liners

Log Analysis

# Count occurrences of each status code
awk '{print $9}' access.log | sort | uniq -c | sort -rn

# Or all in awk
awk '{count[$9]++} END{for (code in count) print count[code], code}' access.log

# Requests per IP
awk '{count[$1]++} END{for (ip in count) print count[ip], ip}' access.log | sort -rn | head

# Slow requests (response time > 1s)
awk '$NF > 1.0 {print $7, $NF}' access.log

CSV Processing

# Print specific columns
awk -F, '{print $1","$3}' data.csv

# Skip header
awk -F, 'NR > 1 {print $2}' data.csv

# Sum a column
awk -F, 'NR > 1 {sum += $3} END{print sum}' data.csv

# Filter by value
awk -F, '$4 == "active"' data.csv

System Administration

# Disk usage over 80%
df -h | awk '$5+0 > 80 {print $6, $5}'

# Memory by process
ps aux | awk '{mem[$11] += $6} END{for (proc in mem) print mem[proc], proc}' | sort -rn | head

# Users with bash shell
awk -F: '$7 ~ /bash/ {print $1}' /etc/passwd

# Show listening ports
netstat -tlnp | awk '$6 == "LISTEN" {print $4}'

Data Transformation

# Transpose rows to columns
awk '{for (i=1; i<=NF; i++) a[i,NR]=$i} END{for (i=1; i<=NF; i++) {for (j=1; j<=NR; j++) printf a[i,j] " "; print ""}}' file.txt

# Remove duplicate lines (preserving order)
awk '!seen[$0]++' file.txt

# Print unique values from column
awk '{print $2}' file.txt | awk '!seen[$0]++'

# Join lines with comma
awk '{printf "%s%s", sep, $0; sep=","} END{print ""}' file.txt

Text Manipulation

# Remove blank lines
awk 'NF' file.txt

# Remove leading/trailing whitespace
awk '{$1=$1}1' file.txt

# Replace field value
awk '{$2 = "REDACTED"; print}' file.txt

# Add line numbers
awk '{print NR": "$0}' file.txt

# Print every Nth line
awk 'NR % 5 == 0' file.txt

Combining with Other Tools

# Filter then process
grep "ERROR" logfile.txt | awk '{print $5}'

# Process then sort
awk -F: '{print $3, $1}' /etc/passwd | sort -n

# Use in pipeline
cat data.txt | awk '{print $2}' | sort | uniq -c

Multi-line Scripts

For complex logic, use a script file:

#!/usr/bin/awk -f
# script.awk

BEGIN {
    FS = ","
    print "Processing..."
}

NR == 1 {
    # Save header
    for (i = 1; i <= NF; i++) header[i] = $i
    next
}

$4 > 100 {
    # Process qualifying rows
    total += $4
    count++
    print $1, $2, $4
}

END {
    print "---"
    print "Count:", count
    print "Total:", total
    print "Average:", total/count
}
awk -f script.awk data.csv

Quick Reference

| Task             | Command                      |
| ---------------- | ---------------------------- |
| Print column     | awk '{print $N}'             |
| Filter rows      | awk '/pattern/'              |
| Sum column       | awk '{s+=$N}END{print s}'    |
| Count lines      | awk 'END{print NR}'          |
| Custom delimiter | awk -F','                    |
| Field equals     | awk '$N == "value"'          |
| Skip header      | awk 'NR > 1'                 |
| Unique lines     | awk '!seen[$0]++'            |
| Last field       | awk '{print $NF}'            |

awk’s learning curve is shallow for simple tasks and deep for complex ones. Start with '{print $N}' and filtering, then add arithmetic and conditionals as needed. Most text processing tasks that seem complex become simple one-liners once you know the patterns.