Why Linux for DevOps?
Beginner
Linux powers 96.3% of the world's top servers, all major cloud providers, and is the foundation of Docker containers. As a DevOps engineer, Linux proficiency is non-negotiable.
- Runs on virtually all production servers
- Docker containers are built on Linux
- Most CI/CD pipelines execute on Linux
- Open source with massive community support
Filesystem Hierarchy
Beginner
| Directory | Purpose |
|---|---|
| / | Root directory — top of the hierarchy |
| /home | User home directories |
| /etc | System configuration files |
| /var | Variable data (logs, databases, mail) |
| /tmp | Temporary files |
| /opt | Optional/third-party software |
| /usr | User programs and utilities |
| /bin, /sbin | Essential system binaries |
| /proc | Virtual filesystem for process info |
| /dev | Device files |
Essential Commands
Beginner# Navigation & File Operations
pwd # Print working directory
ls -la # List with details and hidden files
cd /var/log # Change directory
mkdir -p /opt/myapp/config # Create nested directories
cp -r source/ dest/ # Copy recursively
mv oldname newname # Move/rename
rm -rf /tmp/old-builds # Remove recursively (use with caution!)
find / -name "*.log" -mtime +7 # Find files modified 7+ days ago
locate nginx.conf # Quick file search (needs updatedb)
# Viewing & Editing Files
cat /etc/hostname # Display file contents
less /var/log/syslog # Paginated file viewer
head -n 20 app.log # First 20 lines
tail -f /var/log/nginx/access.log # Follow log in real-time
grep -rn "error" /var/log/ # Search recursively with line numbers
grep -i "warn\|error" app.log # Case-insensitive, multiple patterns
sed -i 's/old/new/g' file.txt # Find & replace in-place
awk '{print $1, $9}' access.log # Extract columns
# Text Processing Pipelines
cat access.log | awk '{print $1}' | sort | uniq -c | sort -rn | head -10
# ^ Top 10 IP addresses by request count
# Disk & Storage
df -h # Disk usage (human readable)
du -sh /var/* # Directory sizes
lsblk # Block devices
mount /dev/sdb1 /mnt/data # Mount filesystem
File Permissions
Beginner# Permission format: rwxrwxrwx (owner-group-others)
# r=4, w=2, x=1
chmod 755 script.sh # rwxr-xr-x (owner can do all, others read/execute)
chmod 644 config.yml # rw-r--r-- (owner read/write, others read-only)
chmod 600 private_key.pem # rw------- (only owner can read/write)
chmod +x deploy.sh # Add execute permission
# Change ownership
chown ubuntu:www-data /var/www/html
chown -R appuser:appgroup /opt/myapp/
# Special permissions
chmod u+s /usr/bin/passwd # SUID — runs as file owner
chmod g+s /shared/team/ # SGID — new files inherit group
chmod +t /tmp # Sticky bit — only owner can delete
Package Management
Beginner# Debian/Ubuntu (APT)
sudo apt update # Update package index
sudo apt upgrade -y # Upgrade all packages
sudo apt install -y nginx # Install package
sudo apt remove nginx # Remove package
sudo apt autoremove # Remove unused dependencies
dpkg -l | grep nginx # List installed matching packages
# RHEL/CentOS/Amazon Linux (YUM/DNF)
sudo yum update -y
sudo yum install -y httpd
sudo dnf install -y docker
rpm -qa | grep docker # List installed RPMs
Process Management
Intermediate# View processes
ps aux # All processes with details
ps aux | grep nginx # Filter by name
top # Interactive process monitor
htop # Better interactive monitor
pstree # Process tree
# Process control
kill PID # Graceful terminate (SIGTERM)
kill -9 PID # Force kill (SIGKILL)
killall nginx # Kill all by name
pkill -f "python app.py" # Kill by pattern
# Background processes
./long-task.sh & # Run in background
nohup ./server.sh & # Survives terminal close
jobs # List background jobs
fg %1 # Bring job to foreground
# Resource usage
free -h # Memory usage
vmstat 1 5 # Virtual memory stats
iostat -x 1 # I/O statistics
lsof -i :80 # What's using port 80
ss -tulpn # Socket statistics (replaces netstat)
Networking Commands
Intermediate# Network configuration
ip addr show # Show IP addresses
ip route show # Show routing table
hostname -I # Quick IP display
# DNS
dig example.com # DNS lookup
nslookup example.com # Name server lookup
cat /etc/resolv.conf # DNS resolver config
# Connectivity
ping -c 4 google.com # Test connectivity
traceroute google.com # Trace network path
curl -I https://example.com # HTTP headers
wget https://example.com/file # Download file
# Firewall (iptables / ufw)
# Add the allow rules BEFORE enabling ufw: its default incoming policy is
# "deny", so enabling first can drop the SSH session you are working over.
sudo ufw allow 22/tcp
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
sudo ufw enable
sudo ufw status verbose
# Port scanning & monitoring
ss -tulpn # All listening ports
netstat -tlnp # Listening TCP ports
nmap -sV localhost # Service version scan
Shell Scripting
Intermediate#!/bin/bash
# deploy.sh — Automated deployment script
# Flow: back up the current release, pull and build the new one, restart the
# service, then health-check it. Failures are handled by the ERR trap set
# further down in this script.
set -euo pipefail # Exit on error, undefined vars, pipe failures
# Variables
# APP_NAME is also the name passed to `systemctl restart`.
APP_NAME="myapp"
# Directory the application is deployed in (the git checkout that is pulled).
DEPLOY_DIR="/opt/${APP_NAME}"
# Holds one timestamped copy of DEPLOY_DIR per run, plus a `latest` symlink.
BACKUP_DIR="/opt/backups/${APP_NAME}"
# Every log message is appended here in addition to being printed.
LOG_FILE="/var/log/${APP_NAME}/deploy.log"
# Captured once per run; used as the name of this run's backup directory.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
# Functions
#######################################
# Print a timestamped message and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
# Returns:   non-zero if the log directory or file cannot be written
#######################################
log() {
  local log_dir
  log_dir=$(dirname -- "${LOG_FILE}")
  # Create the log directory on demand: without it `tee -a` fails, and under
  # `set -o pipefail` that failure would abort the script on its first message.
  mkdir -p -- "${log_dir}"
  # printf instead of echo so the message is never interpreted as options.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "${LOG_FILE}"
}
#######################################
# Restore the most recent backup over the deploy directory, restart the
# service, and terminate the script with a failure status.
# Globals:   BACKUP_DIR, DEPLOY_DIR, APP_NAME (read)
# Arguments: none
# Outputs:   progress messages via log
# Returns:   never returns; always exits with status 1
#######################################
rollback() {
  local restore_src

  # Disarm the ERR trap so a failure inside the rollback cannot re-enter it.
  trap - ERR
  log "ERROR: Deployment failed. Rolling back..."
  if [[ -d "${BACKUP_DIR}/latest" ]]; then
    # `latest` is a symlink. A recursive cp of the link itself would copy the
    # link, not the tree, leaving DEPLOY_DIR as a symlink into the backup
    # store. Resolve it to the real directory first.
    restore_src=$(cd -P -- "${BACKUP_DIR}/latest" && pwd -P)
    # :? aborts instead of expanding to an empty path if DEPLOY_DIR is unset.
    rm -rf -- "${DEPLOY_DIR:?}"
    # -a keeps ownership, modes and inner symlinks of the backed-up tree.
    cp -a -- "${restore_src}" "${DEPLOY_DIR}"
    systemctl restart "${APP_NAME}"
    log "Rollback completed successfully"
  else
    log "ERROR: No backup found at ${BACKUP_DIR}/latest; nothing was restored"
  fi
  exit 1
}
# Main deployment
log "Starting deployment of ${APP_NAME}"

# 1. Create backup
# The ERR trap is armed only once this backup exists. If the backup step itself
# fails, nothing has been modified yet, so `set -e` just aborts instead of
# restoring an older snapshot over an untouched deployment.
log "Creating backup..."
mkdir -p "${BACKUP_DIR}"
# -a keeps ownership, modes and symlinks so a restore is a faithful copy.
cp -a "${DEPLOY_DIR}" "${BACKUP_DIR}/${TIMESTAMP}"
ln -sfn "${BACKUP_DIR}/${TIMESTAMP}" "${BACKUP_DIR}/latest"

# Trap errors for automatic rollback
trap rollback ERR

# 2. Deploy new version
log "Deploying new version..."
cd "${DEPLOY_DIR}"
git pull origin main
npm ci --production
npm run build

# 3. Restart service
log "Restarting service..."
systemctl restart "${APP_NAME}"

# 4. Health check
log "Running health check..."
sleep 5
if curl -sf http://localhost:3000/health > /dev/null; then
  log "✅ Deployment successful!"
else
  log "❌ Health check failed!"
  rollback
fi

# The new release is live and healthy: disarm the trap so a housekeeping
# failure below cannot roll back a working deployment.
trap - ERR

# 5. Cleanup old backups (keep last 5)
# Backup names are fixed-width timestamps, so glob (lexicographic) order is
# chronological; the `latest` symlink does not match [0-9]*. Using a glob
# avoids parsing `ls` and running `rm` with an empty argument list.
shopt -s nullglob
backups=("${BACKUP_DIR}"/[0-9]*)
excess=$(( ${#backups[@]} - 5 ))
if (( excess > 0 )); then
  rm -rf -- "${backups[@]:0:excess}"
fi
log "Cleanup complete. Deployment finished."
Systemd & Services
Intermediate# Service management
sudo systemctl start nginx # Start service
sudo systemctl stop nginx # Stop service
sudo systemctl restart nginx # Restart
sudo systemctl reload nginx # Reload config (no downtime)
sudo systemctl enable nginx # Start on boot
sudo systemctl disable nginx # Don't start on boot
sudo systemctl status nginx # Check status
# Create a custom systemd service
# /etc/systemd/system/myapp.service
[Unit]
Description=My Application
# Wants= only pulls network-online.target into the transaction; After= is what
# delays startup until it is reached. Both must name the same target for the
# service to actually wait for a configured network.
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=appuser
Group=appgroup
WorkingDirectory=/opt/myapp
ExecStart=/usr/bin/node /opt/myapp/server.js
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
Environment=NODE_ENV=production
Environment=PORT=3000
[Install]
WantedBy=multi-user.target
# Enable and start the service
sudo systemctl daemon-reload
sudo systemctl enable myapp
sudo systemctl start myapp
# View logs
journalctl -u myapp -f # Follow service logs
journalctl -u myapp --since today # Today's logs only
journalctl -u myapp -p err # Only error-level logs
Performance Tuning
Advanced# System resource monitoring
top -bn1 | head -20 # Snapshot of top processes
vmstat 1 10 # Memory/CPU stats every second
sar -u 1 10 # CPU utilization
sar -r 1 10 # Memory utilization
sar -d 1 10 # Disk I/O
# Kernel parameter tuning (/etc/sysctl.conf)
# Network performance
net.core.somaxconn = 65535
net.ipv4.tcp_max_syn_backlog = 65535
net.core.netdev_max_backlog = 65535
net.ipv4.tcp_fin_timeout = 15
# File handles
fs.file-max = 2097152
fs.nr_open = 2097152
# Apply changes
sudo sysctl -p
# File descriptor limits (/etc/security/limits.conf)
* soft nofile 65535
* hard nofile 65535
Security Hardening
Advanced
- SSH Hardening — Disable root login, use key-based auth
- Firewall — Only open necessary ports
- Updates — Enable automatic security updates
- fail2ban — Block repeated failed login attempts
- Audit — Enable auditd for system call logging
- SELinux/AppArmor — Mandatory access controls
# SSH hardening (/etc/ssh/sshd_config)
PermitRootLogin no
PasswordAuthentication no
PubkeyAuthentication yes
MaxAuthTries 3
ClientAliveInterval 300
ClientAliveCountMax 2
AllowUsers deploy admin
# Apply SSH changes
sudo systemctl restart sshd
# Install and configure fail2ban
sudo apt install -y fail2ban
sudo systemctl enable fail2ban
sudo systemctl start fail2ban
# Automatic security updates
sudo apt install -y unattended-upgrades
sudo dpkg-reconfigure -plow unattended-upgrades
Troubleshooting
Advanced Troubleshooting Methodology
Follow this order: Check logs → Check resources (CPU/memory/disk) → Check network → Check configuration → Check permissions.
# Check system logs
journalctl -xe # Recent errors
dmesg | tail -20 # Kernel messages
cat /var/log/syslog | tail -50 # System log
# Disk issues
df -h # Is any filesystem full?
du -sh /var/* | sort -rh | head # Largest directories
find / -size +100M -type f # Large files
# Memory issues
free -h # Memory overview
cat /proc/meminfo # Detailed memory info
slabtop # Kernel slab cache
# Network issues
ss -tulpn # Listening ports
iptables -L -n # Firewall rules
tcpdump -i eth0 port 80 # Capture traffic
curl -v http://localhost:3000 # Verbose HTTP test
# High CPU
top -o %CPU # Sort by CPU usage
strace -p PID # Trace system calls
perf top # Performance counters