Linux System Monitoring and Troubleshooting

Overview

System monitoring involves tracking performance metrics, analyzing logs, and setting up alerts to maintain system health and quickly identify issues.

Performance Monitoring Tools

htop - Interactive Process Viewer

# Install and use htop (apt is the Debian/Ubuntu package manager)
sudo apt install htop
htop

# Key htop features:
# - Real-time process monitoring
# - Per-core CPU meters and overall memory usage
# - Process tree view (F5)
# - Kill processes (F9)
# - Search processes (F3)
# - Filter by user (u)

# Configure htop colors and layout from inside the running htop session:
# Setup -> Display options -> Detailed CPU time

System Resource Monitoring

# CPU monitoring
# A trailing "1" is the sampling interval in seconds; "sar -u 1 10" stops
# after 10 samples, the others run until interrupted.
top                           # Classic process monitor
iostat 1                      # CPU and I/O statistics
vmstat 1                      # Virtual memory stats
sar -u 1 10                   # CPU utilization over time
mpstat 1                      # Multi-processor statistics

# Memory monitoring
free -h                       # Memory usage (human-readable units)
watch -n 1 'free -h'         # Continuous memory monitoring (re-runs every second)
cat /proc/meminfo             # Detailed memory information
pmap -x PID                   # Process memory mapping (replace PID with a process ID)

# Disk I/O monitoring
iotop                         # I/O usage by process
iotop -o                      # Only show active I/O
iostat -x 1                   # Extended I/O statistics
atop                          # Advanced system monitor

Network Monitoring

# Network traffic monitoring
iftop                         # Interface bandwidth usage
nethogs                       # Network usage by process
vnstat                        # Network traffic statistics
ss -tuln                      # Socket statistics
netstat -i                    # Interface statistics

# Real-time network monitoring
# NOTE: "grep :80" is a substring match, so ports such as :8080 are shown too.
watch -n 1 'ss -tuln | grep :80'
tcpdump -i eth0               # Packet capture (eth0 is an example interface name)
wireshark                     # GUI packet analyzer

# Network performance testing
# Run the server command on one host and the client command on another.
iperf3 -s                     # Server mode
iperf3 -c server_ip           # Client test (server_ip is a placeholder)
ping -c 10 8.8.8.8           # Latency test (10 echo requests)
traceroute google.com         # Route tracing

Log Management and Analysis

System Logs Location

# Main log files and directories (Debian/Ubuntu default paths)
/var/log/                     # Primary log directory
/var/log/syslog              # General system messages
/var/log/auth.log            # Authentication logs
/var/log/kern.log            # Kernel messages
/var/log/mail.log            # Mail server logs
/var/log/apache2/            # Apache web server logs
/var/log/nginx/              # Nginx web server logs

# Application-specific logs
/var/log/mysql/              # MySQL logs
/var/log/postgresql/         # PostgreSQL logs
/var/log/fail2ban.log        # Fail2ban logs

Log Analysis with journalctl

# Basic journalctl usage
journalctl                    # All journal entries
journalctl -f                 # Follow logs (tail -f style)
journalctl -n 50              # Last 50 entries
journalctl --since "1 hour ago"  # Last hour
journalctl --since "2024-01-01"  # Since specific date

# Filter by service (the -u argument is a systemd unit name)
journalctl -u ssh             # SSH service logs
journalctl -u apache2         # Apache service logs
journalctl -u nginx           # Nginx service logs

# Filter by priority (each level also includes the more severe ones)
journalctl -p err             # Error level and above
journalctl -p warning         # Warning level and above
journalctl -p crit            # Critical level and above

# Advanced filtering
journalctl _COMM=sshd         # SSH daemon messages
journalctl _UID=1000          # Messages from specific user (1000 is an example UID)
journalctl -k                 # Kernel messages only
journalctl --list-boots       # Available boot sessions

Log Analysis Scripts

#!/bin/bash
# Log analysis script for security events
#
# Summarises failed SSH logins, successful SSH logins and sudo activity found
# in an auth log, writes the summary to a dated report file and prints it.
#
# Environment overrides (defaults are the paths this script always used):
#   LOG_FILE     auth log to analyse
#   REPORT_FILE  report file to write
#
# Exit status: 0 on success, 1 if the log is unreadable or the report cannot
# be written.

LOG_FILE="${LOG_FILE:-/var/log/auth.log}"
REPORT_FILE="${REPORT_FILE:-/tmp/security-report-$(date +%Y%m%d).txt}"

readonly FAILED_PATTERN='Failed password'
# Extended regex: any authentication method (password, publickey, ...), so
# key-based logins are reported as well.
readonly ACCEPTED_PATTERN='Accepted [^ ]+ for '
readonly SUDO_PATTERN='sudo:'

#######################################
# Build the security report.
# Arguments: $1 - auth log to read
#            $2 - report file to (over)write
# Outputs:   the finished report on stdout, errors on stderr
# Returns:   0 on success, 1 on unreadable log or unwritable report
#######################################
analyze_auth_log() {
    local log_file=$1
    local report_file=$2
    local failed_count success_count sudo_count

    # Fail loudly instead of producing a report full of zeros.
    if [[ ! -r "$log_file" ]]; then
        printf 'Cannot read log file: %s\n' "$log_file" >&2
        return 1
    fi

    # grep -c still prints 0 when nothing matches (it only exits non-zero).
    failed_count=$(grep -c -- "$FAILED_PATTERN" "$log_file")
    success_count=$(grep -cE -- "$ACCEPTED_PATTERN" "$log_file")
    sudo_count=$(grep -c -- "$SUDO_PATTERN" "$log_file")

    {
        echo "Security Log Analysis - $(date)"
        echo "================================="

        # Failed SSH attempts
        echo "Failed SSH Login Attempts:"
        grep -- "$FAILED_PATTERN" "$log_file" | tail -20

        # Successful logins
        printf '\nSuccessful SSH Logins:\n'
        grep -E -- "$ACCEPTED_PATTERN" "$log_file" | tail -10

        # Sudo usage
        printf '\nSudo Commands:\n'
        grep -- "$SUDO_PATTERN" "$log_file" | tail -10

        # Summary statistics
        printf '\nSummary:\n'
        echo "Failed logins: $failed_count"
        echo "Successful logins: $success_count"
        echo "Sudo commands: $sudo_count"
    } > "$report_file" || return 1

    cat -- "$report_file"
}

analyze_auth_log "$LOG_FILE" "$REPORT_FILE"

Alerting and Notification Systems

Email Alerts Setup

# Install mail utilities (the alert scripts in this guide send mail with "mail")
sudo apt install mailutils postfix

# Configure postfix for local delivery (interactive configuration dialog)
sudo dpkg-reconfigure postfix

# Test email functionality (replace user@example.com with a real recipient)
echo "Test message" | mail -s "Test Subject" user@example.com

# Alert script for high CPU usage
#!/bin/bash
#
# Mails admin@example.com when overall CPU utilisation (100 - idle) reported
# by top is above CPU_THRESHOLD percent.
CPU_THRESHOLD=80

#######################################
# Extract CPU utilisation from top's batch output.
# Works from the idle column so it does not depend on field positions, which
# shift when top prints e.g. "%Cpu(s):100.0 us," without a separating space,
# and it accepts both the "98.0 id" and the older "98.0%id" layouts.
# Inputs:   top -b output on stdin
# Outputs:  utilisation percentage with one decimal on stdout
# Returns:  non-zero when no CPU summary line was found
#######################################
parse_cpu_usage() {
    awk '
        /Cpu\(s\)/ {
            if (match($0, /[0-9.]+[% ]*id/)) {
                idle = substr($0, RSTART, RLENGTH)
                sub(/[% ]*id$/, "", idle)
                printf "%.1f\n", 100 - idle
                found = 1
                exit
            }
        }
        END { exit !found }
    '
}

# LC_ALL=C keeps "." as the decimal separator in top's output.
CPU_USAGE=$(LC_ALL=C top -bn1 | parse_cpu_usage)

if [[ -z "$CPU_USAGE" ]]; then
    echo "Unable to determine CPU usage from top output" >&2
# awk performs the floating-point comparison, so bc is not required.
elif awk -v usage="$CPU_USAGE" -v limit="$CPU_THRESHOLD" 'BEGIN { exit !(usage > limit) }'; then
    echo "High CPU usage detected: ${CPU_USAGE}%" | \
        mail -s "CPU Alert - $(hostname)" admin@example.com
fi

Advanced Monitoring with Nagios

# Install Nagios (refresh the package index first)
sudo apt update
sudo apt install nagios4 nagios-plugins-contrib

# Start Nagios now and enable it at boot
sudo systemctl start nagios4
sudo systemctl enable nagios4

# Access web interface at http://localhost/nagios4

# Custom Nagios check script
#!/bin/bash
# /usr/local/nagios/libexec/check_disk_usage.sh
#
# Reports the used-space percentage of the filesystem holding <partition>,
# following the Nagios plugin exit-code convention.
#
# Usage:  check_disk_usage.sh <partition> <warning%> <critical%>
# Exits:  0 = OK, 1 = WARNING, 2 = CRITICAL,
#         3 = UNKNOWN (bad arguments, or usage could not be determined)

# Validate the argument count before touching the positional parameters.
if [ $# -ne 3 ]; then
    echo "Usage: $0 <partition> <warning%> <critical%>"
    exit 3
fi

PARTITION="$1"
WARNING="$2"
CRITICAL="$3"

# Thresholds feed the integer comparisons below; reject anything else.
if ! [[ "$WARNING" =~ ^[0-9]+$ && "$CRITICAL" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - Thresholds must be whole numbers (warning='$WARNING', critical='$CRITICAL')"
    exit 3
fi

# -P keeps every filesystem on one line even for long device names, so the
# capacity column is always field 5 of line 2.
USAGE=$(df -P -- "$PARTITION" 2>/dev/null | awk 'NR==2 {sub(/%/, "", $5); print $5}')

# An empty or non-numeric value means df failed (for example, no such path).
# Without this guard both comparisons below would error out and the check
# would fall through to "OK".
if ! [[ "$USAGE" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - Unable to determine disk usage for $PARTITION"
    exit 3
fi

if [ "$USAGE" -ge "$CRITICAL" ]; then
    echo "CRITICAL - Disk usage ${USAGE}% on $PARTITION"
    exit 2
elif [ "$USAGE" -ge "$WARNING" ]; then
    echo "WARNING - Disk usage ${USAGE}% on $PARTITION"
    exit 1
else
    echo "OK - Disk usage ${USAGE}% on $PARTITION"
    exit 0
fi

Prometheus and Grafana Setup

# Install Prometheus
sudo apt install prometheus

# Basic Prometheus configuration (/etc/prometheus/prometheus.yml)
# NOTE: the lines from "global:" to "targets:" are YAML for that file,
# not shell commands.
global:
  scrape_interval: 15s

scrape_configs:
  # One scrape job named 'node' with a single static target on this host.
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']

# Install Node Exporter for system metrics
sudo apt install prometheus-node-exporter

# Install Grafana
# apt-key is deprecated, so the repository signing key is stored in a
# dedicated keyring and referenced with signed-by instead of being added to
# the global trust store.
sudo apt install apt-transport-https wget gnupg
sudo mkdir -p /etc/apt/keyrings
wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | \
    sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null
# tee (without -a) rewrites the list file, so re-running does not duplicate the entry.
echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | \
    sudo tee /etc/apt/sources.list.d/grafana.list > /dev/null
sudo apt update
sudo apt install grafana

# Start services
sudo systemctl start prometheus
sudo systemctl start prometheus-node-exporter
sudo systemctl start grafana-server
# Only grafana-server is explicitly enabled at boot here.
sudo systemctl enable grafana-server

# Access Grafana at http://localhost:3000 (admin/admin)
# Change the default admin password after the first login.

System Health Monitoring

Automated Health Checks

#!/bin/bash
# Comprehensive system health check
#
# Writes load, memory, disk, service, connectivity and recent journal errors
# to a timestamped report, prints the report, and mails it to
# admin@example.com when at least one check failed.
#
# Environment overrides:
#   HEALTH_REPORT  report path (default: /tmp/system-health-YYYYMMDD-HHMM.txt)

HEALTH_REPORT="${HEALTH_REPORT:-/tmp/system-health-$(date +%Y%m%d-%H%M).txt}"

#######################################
# Count failed-check lines in a report.
# Only lines ending in ": Stopped" or ": Disconnected" count, so section
# headings such as "Recent Errors:" can never trigger an alert on their own.
# Arguments: $1 - report file
# Outputs:   number of matching lines on stdout
#######################################
count_failed_checks() {
    grep -cE ': (Stopped|Disconnected)$' -- "$1"
}

echo "System Health Report - $(date)" > "$HEALTH_REPORT"
echo "======================================" >> "$HEALTH_REPORT"

# Check system load: the 1-minute average is the first field of /proc/loadavg
# (avoids parsing uptime's locale-dependent output).
read -r LOAD _ < /proc/loadavg
echo "System Load: $LOAD" >> "$HEALTH_REPORT"

# Check memory usage (used / total, as a percentage)
MEMORY=$(free | awk '/^Mem:/ {printf "%.1f", $3/$2 * 100}')
echo "Memory Usage: ${MEMORY}%" >> "$HEALTH_REPORT"

# Check disk usage (real block devices only)
echo "Disk Usage:" >> "$HEALTH_REPORT"
df -h | grep -E '^/dev/' >> "$HEALTH_REPORT"

# Check running services
echo -e "\nCritical Services Status:" >> "$HEALTH_REPORT"
SERVICES=("ssh" "apache2" "mysql" "nginx")
for service in "${SERVICES[@]}"; do
    if systemctl is-active --quiet "$service"; then
        echo "$service: Running" >> "$HEALTH_REPORT"
    else
        echo "$service: Stopped" >> "$HEALTH_REPORT"
    fi
done

# Check network connectivity (-W 2 bounds the wait when replies are dropped)
echo -e "\nNetwork Connectivity:" >> "$HEALTH_REPORT"
if ping -c 1 -W 2 8.8.8.8 >/dev/null 2>&1; then
    echo "Internet: Connected" >> "$HEALTH_REPORT"
else
    echo "Internet: Disconnected" >> "$HEALTH_REPORT"
fi

# Check for errors in logs. Informational "-- ..." lines are filtered out so
# an empty result really means "no error entries in the last hour".
echo -e "\nRecent Errors:" >> "$HEALTH_REPORT"
RECENT_ERRORS=$(journalctl -q -p err --since "1 hour ago" --no-pager | grep -v '^-- ' | tail -5)
if [ -n "$RECENT_ERRORS" ]; then
    printf '%s\n' "$RECENT_ERRORS" >> "$HEALTH_REPORT"
fi

cat "$HEALTH_REPORT"

# Send alert if critical issues found: failed checks plus one for the
# presence of recent error-level journal entries.
CRITICAL_ISSUES=$(count_failed_checks "$HEALTH_REPORT")
CRITICAL_ISSUES=${CRITICAL_ISSUES:-0}
if [ -n "$RECENT_ERRORS" ]; then
    CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
fi
if [ "$CRITICAL_ISSUES" -gt 0 ]; then
    mail -s "System Health Alert - $(hostname)" admin@example.com < "$HEALTH_REPORT"
fi

Process Monitoring

# Monitor specific processes
#!/bin/bash
# Process monitoring script
#
# Checks each systemd unit listed in PROCESSES, logs any that are not active
# to LOG_FILE, tries to start them, and mails admin@example.com when a start
# attempt fails.
#
# State is queried from systemd rather than with pgrep: pgrep does substring
# matching (so "ssh" also matches ssh-agent or outgoing ssh sessions), and a
# daemon's process name need not equal its unit name.

PROCESSES=("apache2" "mysql" "nginx" "ssh")
LOG_FILE="/var/log/process-monitor.log"

# Append a timestamped message ($1) to LOG_FILE.
log_event() {
    echo "$(date): $1" >> "$LOG_FILE"
}

#######################################
# Make sure one unit is active, starting it if necessary.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - systemd unit name
# Returns:   0 if the unit is active or was started, 1 if the start failed
#######################################
ensure_service_running() {
    local unit=$1

    if systemctl is-active --quiet "$unit"; then
        return 0
    fi

    log_event "$unit is not running"

    # Attempt to restart service
    if systemctl start "$unit"; then
        log_event "Successfully restarted $unit"
        return 0
    fi

    log_event "Failed to restart $unit"
    echo "$unit failed to restart on $(hostname)" | \
        mail -s "Service Restart Failed" admin@example.com
    return 1
}

for process in "${PROCESSES[@]}"; do
    ensure_service_running "$process"
done

# Monitor process resource usage
# Save the first ten lines of the process list, ordered by the given ps sort
# key ($1: cpu or mem), into the file named by $2.
snapshot_top_processes() {
    ps aux --sort="-%$1" | head -n 10 > "$2"
}

snapshot_top_processes cpu /tmp/top-cpu-processes.txt
snapshot_top_processes mem /tmp/top-memory-processes.txt

Real-time Monitoring Dashboard

Simple Web Dashboard

#!/bin/bash
# Generate HTML dashboard
#
# Renders a static, auto-refreshing HTML page with load, memory, disk,
# network-interface and top-CPU-process information.
#
# Environment overrides:
#   DASHBOARD_FILE  output path (default: /var/www/html/dashboard.html)
#
# Exit status: 0 when the page was written, 1 otherwise.

DASHBOARD_FILE="${DASHBOARD_FILE:-/var/www/html/dashboard.html}"

# Escape &, < and > on stdin so command output (for example process command
# lines, "<defunct>" entries or the "S<" state in ps) is shown as text and
# cannot inject markup into the page.
html_escape() {
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

#######################################
# Write the dashboard page.
# Arguments: $1 - output file
# Outputs:   confirmation on stdout, failure message on stderr
# Returns:   0 on success, 1 if the file could not be written
#######################################
generate_dashboard() {
    local target=$1

    if ! cat > "$target" << EOF
<!DOCTYPE html>
<html>
<head>
    <title>System Dashboard</title>
    <meta http-equiv="refresh" content="30">
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .metric { background: #f0f0f0; padding: 10px; margin: 10px 0; border-radius: 5px; }
        .critical { background: #ffcccc; }
        .warning { background: #ffffcc; }
        .ok { background: #ccffcc; }
    </style>
</head>
<body>
    <h1>System Dashboard - $(hostname | html_escape)</h1>
    <p>Last Updated: $(date | html_escape)</p>

    <div class="metric">
        <h3>System Load</h3>
        <p>$(uptime | html_escape)</p>
    </div>

    <div class="metric">
        <h3>Memory Usage</h3>
        <pre>$(free -h | html_escape)</pre>
    </div>

    <div class="metric">
        <h3>Disk Usage</h3>
        <pre>$(df -h | grep -E '^/dev/' | html_escape)</pre>
    </div>

    <div class="metric">
        <h3>Network Interfaces</h3>
        <pre>$(ip addr show | grep -E '^[0-9]+:|inet ' | html_escape)</pre>
    </div>

    <div class="metric">
        <h3>Top Processes (CPU)</h3>
        <pre>$(ps aux --sort=-%cpu | head -10 | html_escape)</pre>
    </div>

</body>
</html>
EOF
    then
        # Do not claim success when the target could not be written.
        echo "Failed to write dashboard to $target" >&2
        return 1
    fi

    echo "Dashboard generated at $target"
}

generate_dashboard "$DASHBOARD_FILE"

Troubleshooting Common Issues

High CPU Usage Investigation

# Identify high CPU processes
top -bn1 | head -20
ps aux --sort=-%cpu | head -10

# Monitor CPU usage over time
# (60 one-second samples, written to cpu_usage.log in the current directory)
sar -u 1 60 > cpu_usage.log

# Check for CPU-intensive scripts
# (the final "grep -v grep" removes the grep command itself from the listing)
ps aux | grep -E "(python|php|perl|bash)" | grep -v grep

# Kill runaway processes
# (suspicious_process_name is a placeholder; -f matches against the full
# command line, so make the pattern specific)
pkill -f suspicious_process_name

# Check system load history
sar -q | tail -20

Memory Issues Diagnosis

# Check memory usage
free -h
cat /proc/meminfo

# Find memory-hungry processes
ps aux --sort=-%mem | head -10

# Check for memory leaks (./program is a placeholder for the binary under test)
valgrind --tool=memcheck --leak-check=full ./program

# Clear cache if needed (emergency only)
# Writing to drop_caches requires root. The redirection in
# "sudo echo 3 > file" would still run as the calling user, so the write is
# done through "sudo tee" instead.
sync && echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null

# Check swap usage
swapon --show
cat /proc/swaps

Disk Space Problems

# Find large files and directories
# (run as root for complete results; unreadable paths are otherwise skipped
# or reported as errors)
du -sh /* | sort -h
find / -type f -size +100M 2>/dev/null
ncdu /                        # Interactive disk usage analyzer

# Clean up common locations
sudo apt autoremove           # Remove unused packages
sudo apt autoclean           # Clean package cache
sudo journalctl --vacuum-time=7d  # Clean old journal entries (older than 7 days)

# Check for deleted files still open
lsof +L1                      # Files deleted but still open

# Monitor disk I/O
iotop -o                      # Only active I/O
iostat -x 1                   # Extended statistics

Network Connectivity Issues

# Basic connectivity tests
# (gateway_ip and destination are placeholders)
ping -c 4 8.8.8.8            # Test external connectivity
ping -c 4 gateway_ip          # Test gateway connectivity
traceroute destination        # Trace network path
mtr destination               # Combined ping/traceroute

# Check network configuration
ip addr show                  # IP addresses
ip route show                 # Routing table
ss -tuln                      # Open ports
netstat -rn                   # Routing table (legacy)

# DNS troubleshooting
# (domain.com is a placeholder)
nslookup domain.com
dig domain.com
resolvectl status             # systemd-resolved status (successor of "systemd-resolve --status")

# Firewall check
sudo iptables -L -n           # Firewall rules
sudo ufw status               # UFW status

Service Troubleshooting

# Service status and logs
# (service_name is a placeholder for a systemd unit name throughout)
systemctl status service_name
journalctl -u service_name -f
journalctl -u service_name --since "1 hour ago"

# Service dependencies
systemctl list-dependencies service_name
systemctl show service_name

# Restart and reload services
sudo systemctl restart service_name
sudo systemctl reload service_name
sudo systemctl daemon-reload

# Check service configuration
systemctl cat service_name
systemctl show service_name --property=ExecStart

This monitoring framework provides comprehensive system oversight, enabling proactive issue detection and rapid troubleshooting through continuous monitoring, log analysis, and automated alerting mechanisms.