Skip to content

Linux System Monitoring and Troubleshooting

Overview

System monitoring involves tracking performance metrics, analyzing logs, and setting up alerts to maintain system health and quickly identify issues.

Performance Monitoring Tools

htop - Interactive Process Viewer

# Install htop with apt, then launch the interactive viewer
sudo apt install htop
htop

# Key htop features:
# - Real-time process monitoring
# - Per-core CPU meters plus overall memory and swap meters
# - Process tree view (F5)
# - Kill processes (F9)
# - Search processes (F3)
# - Filter by user (u)

# Configure htop colors and layout from its Setup screen, for example:
# Setup -> Display options -> Detailed CPU time

System Resource Monitoring

# CPU monitoring
# A trailing number is the refresh interval in seconds; the tools keep
# reporting until interrupted unless a sample count follows the interval.
# (iostat, sar and mpstat are typically shipped in the sysstat package —
# verify for your distribution.)
top                           # Classic interactive process monitor
iostat 1                      # CPU and I/O statistics, every second
vmstat 1                      # Virtual memory stats, every second
sar -u 1 10                   # CPU utilization over time: 10 samples, 1 second apart
mpstat 1                      # Multi-processor statistics, every second

# Memory monitoring
free -h                       # Memory usage in human-readable units
watch -n 1 'free -h'         # Continuous memory monitoring (re-runs free every second)
cat /proc/meminfo             # Detailed memory information
pmap -x PID                   # Process memory mapping (PID is a placeholder for a process ID)

# Disk I/O monitoring
iotop                         # I/O usage by process
iotop -o                      # Only show processes with active I/O
iostat -x 1                   # Extended I/O statistics, every second
atop                          # Advanced system monitor

Network Monitoring

# Network traffic monitoring
iftop                         # Interface bandwidth usage
nethogs                       # Network usage by process
vnstat                        # Network traffic statistics
ss -tuln                      # Socket statistics: listening TCP/UDP sockets, numeric output
netstat -i                    # Interface statistics

# Real-time network monitoring
# Re-check sockets on port 80 every second. The plain pattern ":80" also
# matches ports that merely start with 80, such as :8080.
watch -n 1 'ss -tuln | grep :80'
tcpdump -i eth0               # Packet capture (eth0 is an example interface name)
wireshark                     # GUI packet analyzer

# Network performance testing
iperf3 -s                     # Server mode (run on the remote end first)
iperf3 -c server_ip           # Client test (server_ip is a placeholder for the iperf3 server)
ping -c 10 8.8.8.8           # Latency test: 10 echo requests
traceroute google.com         # Route tracing

Log Management and Analysis

System Logs Location

# Main log directories
/var/log/                     # Primary log directory
/var/log/syslog              # General system messages
/var/log/auth.log            # Authentication logs
/var/log/kern.log            # Kernel messages
/var/log/mail.log            # Mail server logs
/var/log/apache2/            # Apache web server logs
/var/log/nginx/              # Nginx web server logs

# Application-specific logs
/var/log/mysql/              # MySQL logs
/var/log/postgresql/         # PostgreSQL logs
/var/log/fail2ban.log        # Fail2ban logs

Log Analysis with journalctl

# Basic journalctl usage
journalctl                    # All journal entries
journalctl -f                 # Follow logs (tail -f style)
journalctl -n 50              # Last 50 entries
journalctl --since "1 hour ago"  # Entries from the last hour
journalctl --since "2024-01-01"  # Entries since a specific date

# Filter by service (systemd unit name; names vary between distributions,
# e.g. the SSH unit may be called sshd instead of ssh)
journalctl -u ssh             # SSH service logs
journalctl -u apache2         # Apache service logs
journalctl -u nginx           # Nginx service logs

# Filter by priority (shows the given level and everything more severe)
journalctl -p err             # Error level and above
journalctl -p warning         # Warning level and above
journalctl -p crit            # Critical level and above

# Advanced filtering
journalctl _COMM=sshd         # Messages logged by the sshd executable
journalctl _UID=1000          # Messages from the user with UID 1000
journalctl -k                 # Kernel messages only
journalctl --list-boots       # Available boot sessions

Log Analysis Scripts

#!/bin/bash
# Log analysis script for security events.
#
# Builds a dated report of SSH and sudo activity from the auth log, writes it
# to REPORT_FILE and prints it to stdout. The auth log is normally readable
# only by privileged users, so this usually has to run as root.

LOG_FILE="/var/log/auth.log"
REPORT_FILE="/tmp/security-report-$(date +%Y%m%d).txt"

# Extended regular expressions for the events of interest.
FAILED_PATTERN='Failed password'
# sshd logs "Accepted <method> for <user>"; matching any method also counts
# key-based (publickey) logins instead of password logins only.
ACCEPTED_PATTERN='Accepted [^ ]+ for '
# Only lines that record a command. The PAM session open/close lines that
# also carry the "sudo:" tag are not commands and would inflate the count.
SUDO_PATTERN='sudo:.*COMMAND='

# Fail loudly instead of producing an all-zero, misleading report.
if [ ! -r "$LOG_FILE" ]; then
    echo "Cannot read $LOG_FILE (try running as root)" >&2
    exit 1
fi

# recent_matches PATTERN LIMIT
# Print the last LIMIT lines of LOG_FILE that match PATTERN.
recent_matches() {
    grep -E -- "$1" "$LOG_FILE" | tail -n "$2"
}

# count_matches PATTERN
# Print the number of lines in LOG_FILE that match PATTERN.
# grep -c prints 0 but exits non-zero when nothing matches; that is not an
# error here, so the status is deliberately ignored.
count_matches() {
    grep -cE -- "$1" "$LOG_FILE" || true
}

{
    echo "Security Log Analysis - $(date)"
    echo "================================="

    # Failed SSH attempts
    echo "Failed SSH Login Attempts:"
    recent_matches "$FAILED_PATTERN" 20

    # Successful logins
    printf '\nSuccessful SSH Logins:\n'
    recent_matches "$ACCEPTED_PATTERN" 10

    # Sudo usage
    printf '\nSudo Commands:\n'
    recent_matches "$SUDO_PATTERN" 10

    # Summary statistics
    printf '\nSummary:\n'
    echo "Failed logins: $(count_matches "$FAILED_PATTERN")"
    echo "Successful logins: $(count_matches "$ACCEPTED_PATTERN")"
    echo "Sudo commands: $(count_matches "$SUDO_PATTERN")"
} > "$REPORT_FILE" || exit 1

cat "$REPORT_FILE"

Alerting and Notification Systems

Email Alerts Setup

# Install mail utilities: mailutils provides the `mail` command used by the
# alert scripts, postfix delivers the messages
sudo apt install mailutils postfix

# Configure postfix for local delivery (re-runs the package's interactive setup)
sudo dpkg-reconfigure postfix

# Test email functionality (replace user@example.com with a real recipient)
echo "Test message" | mail -s "Test Subject" user@example.com

#!/bin/bash
# Alert script for high CPU usage.
#
# Samples the aggregate "cpu" line of /proc/stat twice, SAMPLE_SECONDS apart,
# and mails an alert when the busy share of that interval (everything except
# idle and iowait) exceeds CPU_THRESHOLD percent.
#
# Scraping `top -bn1` is avoided on purpose: its summary line differs between
# versions and locales, and at 100.0 the value can fuse with the "%Cpu(s):"
# label, so field-based parsing breaks exactly when the alert matters most.

CPU_THRESHOLD=80     # percent
SAMPLE_SECONDS=1

# cpu_times
# Print "<busy> <total>": cumulative CPU time counters summed over all CPUs.
cpu_times() {
    local label user nice system idle iowait irq softirq steal rest
    read -r label user nice system idle iowait irq softirq steal rest < /proc/stat
    # Counters that are absent on very old kernels are treated as zero.
    local not_busy=$(( idle + ${iowait:-0} ))
    local total=$(( user + nice + system + not_busy + ${irq:-0} + ${softirq:-0} + ${steal:-0} ))
    echo "$(( total - not_busy )) $total"
}

read -r busy_before total_before < <(cpu_times)
sleep "$SAMPLE_SECONDS"
read -r busy_after total_after < <(cpu_times)

# The counters are cumulative, so usage is the ratio of the two deltas.
elapsed=$(( total_after - total_before ))
if (( elapsed > 0 )); then
    CPU_USAGE=$(( 100 * (busy_after - busy_before) / elapsed ))
else
    CPU_USAGE=0
fi

if (( CPU_USAGE > CPU_THRESHOLD )); then
    echo "High CPU usage detected: ${CPU_USAGE}%" | \
    mail -s "CPU Alert - $(hostname)" admin@example.com
fi

Advanced Monitoring with Nagios

# Install Nagios (refresh the package index first)
sudo apt update
sudo apt install nagios4 nagios-plugins-contrib

# Start Nagios now and enable it so it also starts at boot
sudo systemctl start nagios4
sudo systemctl enable nagios4

# Access web interface at http://localhost/nagios4

#!/bin/bash
# Custom Nagios check script
# /usr/local/nagios/libexec/check_disk_usage.sh
#
# Usage: check_disk_usage.sh <partition> <warning%> <critical%>
#
# Prints one status line and exits with the Nagios plugin status code:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN

if [ $# -ne 3 ]; then
    echo "Usage: $0 <partition> <warning%> <critical%>"
    exit 3
fi

PARTITION="$1"
# A trailing "%" on the thresholds is tolerated ("80" and "80%" are equal).
WARNING="${2%\%}"
CRITICAL="${3%\%}"

# Non-numeric thresholds would make the comparisons below error out and fall
# through to the OK branch, so reject them up front.
if ! [[ "$WARNING" =~ ^[0-9]+$ && "$CRITICAL" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - thresholds must be whole numbers (got '$2' and '$3')"
    exit 3
fi

# -P requests the POSIX one-line-per-filesystem layout, so the usage column
# is field 5 of line 2 even where df would otherwise wrap a long device name.
USAGE=$(df -P -- "$PARTITION" 2>/dev/null | awk 'NR==2 {sub(/%/, "", $5); print $5}')

# df fails for a missing or unreadable path. Report that as UNKNOWN instead
# of letting an empty value be treated as a healthy disk.
if ! [[ "$USAGE" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - could not determine disk usage for $PARTITION"
    exit 3
fi

if [ "$USAGE" -ge "$CRITICAL" ]; then
    echo "CRITICAL - Disk usage ${USAGE}% on $PARTITION"
    exit 2
elif [ "$USAGE" -ge "$WARNING" ]; then
    echo "WARNING - Disk usage ${USAGE}% on $PARTITION"
    exit 1
else
    echo "OK - Disk usage ${USAGE}% on $PARTITION"
    exit 0
fi

Prometheus and Grafana Setup

# Install Prometheus
sudo apt install prometheus

# Basic Prometheus configuration (/etc/prometheus/prometheus.yml)
# NOTE: the next block is YAML file content, not shell commands.
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']

# Install Node Exporter for system metrics
sudo apt install prometheus-node-exporter

# Install Grafana from the vendor APT repository.
# The signing key is stored in a dedicated keyring and referenced with
# signed-by, because apt-key is deprecated. The key is installed before the
# repository is added so the first `apt update` can verify it.
sudo apt install -y apt-transport-https software-properties-common wget
sudo mkdir -p /etc/apt/keyrings
wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null
echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee /etc/apt/sources.list.d/grafana.list
sudo apt update
sudo apt install grafana

# Start services now and enable all of them so the stack survives a reboot
sudo systemctl start prometheus
sudo systemctl start prometheus-node-exporter
sudo systemctl start grafana-server
sudo systemctl enable prometheus prometheus-node-exporter grafana-server

# Access Grafana at http://localhost:3000 (admin/admin)
# Change the default admin password at first login.

System Health Monitoring

Automated Health Checks

#!/bin/bash
# Comprehensive system health check.
#
# Writes a timestamped report (load, memory, disks, key services, internet
# reachability, recent journal errors) to HEALTH_REPORT, prints it, and mails
# it to the administrator when at least one critical issue was detected.

HEALTH_REPORT="/tmp/system-health-$(date +%Y%m%d-%H%M).txt"
SERVICES=("ssh" "apache2" "mysql" "nginx")

# Number of critical findings. It is counted while the checks run instead of
# grepping the finished report: the report's own "Recent Errors:" heading
# contains "Error", so a grep-based count is never zero and an alert would be
# mailed on every single run.
CRITICAL_ISSUES=0

# The brace group runs in the current shell, so the counter updates made
# inside it are still visible after the redirect.
{
    echo "System Health Report - $(date)"
    echo "======================================"

    # Check system load: the first field of /proc/loadavg is the 1-minute
    # average, always written with a decimal point regardless of locale.
    read -r LOAD _ < /proc/loadavg
    echo "System Load: $LOAD"

    # Check memory usage; LC_ALL=C keeps the "Mem:" row label untranslated.
    MEMORY=$(LC_ALL=C free | awk '/^Mem:/ {printf "%.1f", $3/$2 * 100}')
    echo "Memory Usage: ${MEMORY}%"

    # Check disk usage
    echo "Disk Usage:"
    df -h | grep -E '^/dev/'

    # Check running services
    printf '\nCritical Services Status:\n'
    for service in "${SERVICES[@]}"; do
        if systemctl is-active --quiet "$service"; then
            echo "$service: Running"
        else
            echo "$service: Stopped"
            CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
        fi
    done

    # Check network connectivity
    printf '\nNetwork Connectivity:\n'
    if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
        echo "Internet: Connected"
    else
        echo "Internet: Disconnected"
        CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
    fi

    # Check for errors in logs. Only the journal lines themselves are
    # inspected for "Error", never the heading printed above them.
    printf '\nRecent Errors:\n'
    RECENT_ERRORS=$(journalctl -p err --since "1 hour ago" --no-pager | tail -5)
    if [ -n "$RECENT_ERRORS" ]; then
        printf '%s\n' "$RECENT_ERRORS"
        if grep -q "Error" <<< "$RECENT_ERRORS"; then
            CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
        fi
    fi
} > "$HEALTH_REPORT"

cat "$HEALTH_REPORT"

# Send alert if critical issues found
if [ "$CRITICAL_ISSUES" -gt 0 ]; then
    mail -s "System Health Alert - $(hostname)" admin@example.com < "$HEALTH_REPORT"
fi

Process Monitoring

#!/bin/bash
# Process monitoring script
#
# Monitor specific services: every entry in PROCESSES is a systemd unit name.
# A unit that is not active is logged and started; if the start fails, the
# administrator is mailed. Must run with enough privilege to start services
# and to append to LOG_FILE.

PROCESSES=("apache2" "mysql" "nginx" "ssh")
LOG_FILE="/var/log/process-monitor.log"

for process in "${PROCESSES[@]}"; do
    # Ask systemd for the unit state instead of using pgrep. pgrep matches the
    # name as a substring of any process name ("ssh" also matches ssh-agent or
    # an outgoing ssh session), which can hide a dead service. A unit's daemon
    # may also run under a different process name than the unit.
    if ! systemctl is-active --quiet "$process"; then
        echo "$(date): $process is not running" >> "$LOG_FILE"

        # Attempt to restart service
        if systemctl start "$process"; then
            echo "$(date): Successfully restarted $process" >> "$LOG_FILE"
        else
            echo "$(date): Failed to restart $process" >> "$LOG_FILE"
            echo "$process failed to restart on $(hostname)" | \
            mail -s "Service Restart Failed" admin@example.com
        fi
    fi
done

# Monitor process resource usage: header line plus the nine heaviest
# processes by CPU and by memory
ps aux --sort=-%cpu | head -10 > /tmp/top-cpu-processes.txt
ps aux --sort=-%mem | head -10 > /tmp/top-memory-processes.txt

Real-time Monitoring Dashboard

Simple Web Dashboard

#!/bin/bash
# Generate HTML dashboard
#
# Renders a static, self-refreshing HTML page with a snapshot of load, memory,
# disk, network and the top CPU consumers, and writes it to DASHBOARD_FILE.
# Run it periodically (for example from cron) to keep the page current.

DASHBOARD_FILE="/var/www/html/dashboard.html"

# html_escape
# Filter stdin to stdout, replacing the HTML metacharacters & < >.
# Without this, the browser swallows output such as the interface flags
# "<BROADCAST,MULTICAST,UP>" from `ip addr` or "<defunct>" from `ps`,
# treating them as unknown tags.
html_escape() {
    # "&" must be replaced first so the entities added below stay intact.
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

# render_dashboard
# Print the complete HTML document to stdout.
render_dashboard() {
    cat << EOF
<!DOCTYPE html>
<html>
<head>
    <title>System Dashboard</title>
    <meta http-equiv="refresh" content="30">
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .metric { background: #f0f0f0; padding: 10px; margin: 10px 0; border-radius: 5px; }
        .critical { background: #ffcccc; }
        .warning { background: #ffffcc; }
        .ok { background: #ccffcc; }
    </style>
</head>
<body>
    <h1>System Dashboard - $(hostname | html_escape)</h1>
    <p>Last Updated: $(date | html_escape)</p>

    <div class="metric">
        <h3>System Load</h3>
        <p>$(uptime | html_escape)</p>
    </div>

    <div class="metric">
        <h3>Memory Usage</h3>
        <pre>$(free -h | html_escape)</pre>
    </div>

    <div class="metric">
        <h3>Disk Usage</h3>
        <pre>$(df -h | grep -E '^/dev/' | html_escape)</pre>
    </div>

    <div class="metric">
        <h3>Network Interfaces</h3>
        <pre>$(ip addr show | grep -E '^[0-9]+:|inet ' | html_escape)</pre>
    </div>

    <div class="metric">
        <h3>Top Processes (CPU)</h3>
        <pre>$(ps aux --sort=-%cpu | head -10 | html_escape)</pre>
    </div>

</body>
</html>
EOF
}

# Report success only if the file could actually be written.
if ! render_dashboard > "$DASHBOARD_FILE"; then
    echo "Failed to write $DASHBOARD_FILE" >&2
    exit 1
fi

echo "Dashboard generated at $DASHBOARD_FILE"

Troubleshooting Common Issues

High CPU Usage Investigation

# Identify high CPU processes (one-shot snapshots, suitable for pasting into a ticket)
top -bn1 | head -20
ps aux --sort=-%cpu | head -10

# Monitor CPU usage over time: 60 samples, 1 second apart, saved to cpu_usage.log
sar -u 1 60 > cpu_usage.log

# Check for CPU-intensive scripts (the final grep drops the grep command itself)
ps aux | grep -E "(python|php|perl|bash)" | grep -v grep

# Kill runaway processes
# -f matches against the full command line, so review what matches first;
# suspicious_process_name is a placeholder.
pkill -f suspicious_process_name

# Check system load history
sar -q | tail -20

Memory Issues Diagnosis

# Check memory usage
free -h
cat /proc/meminfo

# Find memory-hungry processes
ps aux --sort=-%mem | head -10

# Check for memory leaks (./program is a placeholder for the binary under test)
valgrind --tool=memcheck --leak-check=full ./program

# Clear cache if needed (emergency only)
# Writing to drop_caches requires root. `sudo echo 3 > file` does not work,
# because the redirect is performed by the calling, unprivileged shell, so
# the value is piped through `sudo tee` instead.
sync && echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null

# Check swap usage
swapon --show
cat /proc/swaps

Disk Space Problems

# Find large files and directories
# Total size of each top-level directory, sorted smallest to largest
du -sh /* | sort -h
# Regular files larger than 100 MiB; error messages are discarded
find / -type f -size +100M 2>/dev/null
ncdu /                        # Interactive disk usage analyzer

# Clean up common locations
sudo apt autoremove           # Remove packages that are no longer needed
sudo apt autoclean           # Remove outdated files from the package cache
sudo journalctl --vacuum-time=7d  # Remove journal entries older than 7 days

# Check for deleted files still open
# (their disk space is only released once the owning process closes them)
lsof +L1                      # Files deleted but still open

# Monitor disk I/O
iotop -o                      # Only processes with active I/O
iostat -x 1                   # Extended statistics, every second

Network Connectivity Issues

# Basic connectivity tests
# (gateway_ip and destination are placeholders for real addresses or hosts)
ping -c 4 8.8.8.8            # Test external connectivity
ping -c 4 gateway_ip          # Test gateway connectivity
traceroute destination        # Trace network path
mtr destination               # Combined ping/traceroute

# Check network configuration
ip addr show                  # IP addresses
ip route show                 # Routing table
ss -tuln                      # Open ports
netstat -rn                   # Routing table (legacy)

# DNS troubleshooting
nslookup domain.com
dig domain.com
# resolvectl replaces the older `systemd-resolve --status`, which is no
# longer installed on current systemd releases.
resolvectl status             # systemd-resolved status

# Firewall check
sudo iptables -L -n           # Firewall rules
sudo ufw status               # UFW status

Service Troubleshooting

# Service status and logs (service_name is a placeholder for the unit name)
systemctl status service_name
# Follow the unit's log output live
journalctl -u service_name -f
# Show the unit's log entries from the last hour
journalctl -u service_name --since "1 hour ago"

# Service dependencies
systemctl list-dependencies service_name
systemctl show service_name

# Restart and reload services
sudo systemctl restart service_name
sudo systemctl reload service_name
# Make systemd re-read unit files after they have been edited
sudo systemctl daemon-reload

# Check service configuration
systemctl cat service_name
systemctl show service_name --property=ExecStart

This monitoring framework provides comprehensive system oversight, enabling proactive issue detection and rapid troubleshooting through continuous monitoring, log analysis, and automated alerting mechanisms.