Talos/check-cluster-status.sh

#!/usr/bin/env bash

# Strict mode: stop on command failure, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Configuration
# IP addresses of the control-plane nodes inspected by this script.
CONTROL_PLANE_NODES=(
    "10.0.1.3"
    "10.0.1.4"
    "10.0.1.5"
)
# Keep a TALOSCONFIG supplied by the environment; otherwise fall back to the
# default path.
: "${TALOSCONFIG:=testing1/.talosconfig}"

# ANSI colour escape sequences used by the log_* helpers. They are stored as
# literal backslash sequences and only become control codes when printed
# with escape interpretation enabled.
NC='\033[0m'         # reset to the terminal default
RED='\033[0;31m'     # [ERROR] tag
GREEN='\033[0;32m'   # [SUCCESS] tag
YELLOW='\033[1;33m'  # [WARNING] tag
BLUE='\033[0;34m'    # [INFO] tag

# Print an informational message to stdout, prefixed with a blue [INFO] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_info() {
    local message=$1
    printf '%b\n' "${BLUE}[INFO]${NC} ${message}"
}

# Print a success message to stdout, prefixed with a green [SUCCESS] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_success() {
    local message=$1
    printf '%b\n' "${GREEN}[SUCCESS]${NC} ${message}"
}

# Print a warning message to stdout, prefixed with a yellow [WARNING] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_warning() {
    local message=$1
    printf '%b\n' "${YELLOW}[WARNING]${NC} ${message}"
}

# Print an error message to stdout, prefixed with a red [ERROR] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_error() {
    local message=$1
    printf '%b\n' "${RED}[ERROR]${NC} ${message}"
}

# Pre-flight: the script cannot continue without a talosconfig file.
if [[ -f "$TALOSCONFIG" ]]; then
    # Export the path so child processes started below inherit it.
    export TALOSCONFIG
else
    log_error "TALOSCONFIG not found at: $TALOSCONFIG"
    log_info "Have you run ./bootstrap-cluster.sh yet?"
    exit 1
fi

# Report header, followed by one blank line.
printf '%s\n' \
    "==========================================" \
    "Talos Cluster Status Check" \
    "==========================================" \
    ""

#######################################
# Print a status report for a single control-plane node.
#
# Every talosctl query whose output is inspected is captured into a variable
# first and then examined through a here-string. Piping talosctl directly
# into `grep -q` or `head` is unsafe under `set -o pipefail`: when the reader
# exits early the writer can die of SIGPIPE, the pipeline then reports
# failure, and a healthy node is reported as broken.
#
# Arguments: $1 - IP address of the node to inspect
# Outputs:   human-readable report on stdout
# Returns:   0 once the report for the node has been printed (including the
#            early exit taken when the node is not reachable)
#######################################
check_node() {
    local node_ip=$1
    local services_out etcd_out members_out member_rows
    local rc

    echo "==================== Node: $node_ip ===================="

    # Check if node is accessible; nothing else can be queried if it is not.
    log_info "Checking if node is accessible..."
    if talosctl --nodes "$node_ip" version &> /dev/null; then
        log_success "Node is accessible"
    else
        log_error "Node is NOT accessible"
        echo ""
        return 0
    fi

    # Check version
    echo ""
    log_info "Talos version:"
    talosctl --nodes "$node_ip" version --short 2>&1 || log_error "Could not get version"

    # Check if in maintenance mode: the script treats a failing
    # `get services` query as "configuration not applied yet".
    echo ""
    log_info "Checking if node is in maintenance mode..."
    if talosctl --nodes "$node_ip" get services &> /dev/null; then
        log_success "Node is OUT of maintenance mode (configured)"
    else
        log_error "Node is IN MAINTENANCE MODE - configuration not applied!"
        log_info "To apply config, run:"
        log_info "  talosctl apply-config --insecure --nodes $node_ip --file testing1/controlplane-${node_ip}.yaml"
    fi

    # Check services: show at most the first 20 lines of output and report
    # an error only when talosctl itself failed.
    echo ""
    log_info "Service status:"
    rc=0
    services_out=$(talosctl --nodes "$node_ip" services 2>&1) || rc=$?
    if [[ -n "$services_out" ]]; then
        head -n 20 <<<"$services_out"
    fi
    if (( rc != 0 )); then
        log_error "Could not get services"
    fi

    # Check etcd status: query once, then reuse the captured output for both
    # the Running test and the STATE line that is displayed.
    echo ""
    log_info "etcd status:"
    rc=0
    etcd_out=$(talosctl --nodes "$node_ip" service etcd status 2>&1) || rc=$?
    if (( rc == 0 )) && grep -q "STATE.*Running" <<<"$etcd_out"; then
        log_success "etcd is RUNNING"
        grep "STATE" <<<"$etcd_out"
    else
        log_warning "etcd is NOT running"
        grep "STATE" <<<"$etcd_out" || log_info "etcd not initialized yet"
    fi

    # Check if etcd members exist: any output line that is neither the table
    # header nor a "not found" message counts as a member row.
    # NOTE(review): `talosctl get members` appears to list cluster discovery
    # members; `talosctl etcd members` may be the intended query - confirm.
    echo ""
    log_info "etcd members:"
    rc=0
    members_out=$(talosctl --nodes "$node_ip" get members 2>&1) || rc=$?
    member_rows=$(grep -v -e "^NODE" -e "not found" <<<"$members_out") || true
    if [[ -n "$member_rows" ]]; then
        printf '%s\n' "$member_rows"
    fi
    if (( rc == 0 )) && [[ -n "$member_rows" ]]; then
        log_success "etcd members found"
    else
        log_warning "No etcd members - cluster needs bootstrap"
    fi

    echo ""
}

# Check each node
for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
    check_node "$NODE_IP"
done

#######################################
# Report whether etcd is in the Running state on a node.
# The talosctl output is captured before it is searched: piping straight
# into `grep -q` can fail under `set -o pipefail` when grep exits on the
# first match and talosctl is killed by SIGPIPE.
# Arguments: $1 - node IP address
# Returns:   0 if talosctl succeeds and its output matches "STATE.*Running",
#            non-zero otherwise
#######################################
etcd_running_on() {
    local status_out
    status_out=$(talosctl --nodes "$1" service etcd status 2>&1) || return 1
    grep -q "STATE.*Running" <<<"$status_out"
}

#######################################
# Retrieve a kubeconfig from a node and list the Kubernetes nodes with it.
# The kubeconfig is written into a private directory created by mktemp
# instead of a fixed file name in the current directory, so an existing file
# is never overwritten. The function body is a subshell so that the EXIT trap
# removing the directory is scoped to this call.
# Arguments: $1 - control-plane node IP to request the kubeconfig from
# Outputs:   talosctl/kubectl output and log messages on stdout
#######################################
show_kubernetes_nodes() (
    kubeconfig_dir=$(mktemp -d) || {
        log_warning "Could not retrieve kubeconfig"
        exit 0
    }
    trap 'rm -rf -- "${kubeconfig_dir:?}"' EXIT
    kubeconfig_file="$kubeconfig_dir/kubeconfig"

    if talosctl kubeconfig --nodes "$1" "$kubeconfig_file" --force 2>&1; then
        log_success "Kubeconfig retrieved successfully"

        log_info "Kubernetes node status:"
        KUBECONFIG="$kubeconfig_file" kubectl get nodes 2>&1 || log_error "Could not connect to Kubernetes"
    else
        log_warning "Could not retrieve kubeconfig"
    fi
)

# Overall cluster status
echo "==================== Overall Cluster Status ===================="

# Check if any node has etcd running; the first hit is enough.
ETCD_RUNNING=false
for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
    if etcd_running_on "$NODE_IP"; then
        ETCD_RUNNING=true
        break
    fi
done

echo ""
if [[ "$ETCD_RUNNING" == true ]]; then
    log_success "Cluster appears to be bootstrapped (etcd running)"

    # Try to get kubeconfig
    echo ""
    log_info "Attempting to retrieve kubeconfig..."
    show_kubernetes_nodes "${CONTROL_PLANE_NODES[0]}"
else
    log_warning "Cluster is NOT bootstrapped yet"
    log_info ""
    log_info "Next steps:"
    log_info "1. Ensure all nodes are out of maintenance mode (see checks above)"
    log_info "2. If nodes are in maintenance mode, apply configs:"
    for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
        log_info "   talosctl apply-config --insecure --nodes $NODE_IP --file testing1/controlplane-${NODE_IP}.yaml"
    done
    log_info "3. Wait for nodes to reboot and become ready (~2-5 minutes)"
    log_info "4. Bootstrap the cluster:"
    log_info "   talosctl bootstrap --nodes ${CONTROL_PLANE_NODES[0]}"
fi

echo ""
echo "=========================================="