Talos/check-cluster-status.sh
0xWheatyz 6c292da5f1 feat(scripts): add cluster bootstrap and status scripts
Add automated scripts for Talos cluster management:

bootstrap-cluster.sh:
- Automated cluster bootstrap from scratch
- Generates Talos secrets and machine configs
- Applies configs to all nodes (10.0.1.3-5)
- Bootstraps etcd and retrieves kubeconfig
- Verifies cluster health

check-cluster-status.sh:
- Comprehensive cluster health diagnostics
- Checks Talos services, etcd, and Kubernetes components
- Displays node status and running pods
- Useful for troubleshooting bootstrap issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-04 01:53:05 +00:00

149 lines
4.3 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Report the health of a Talos control plane. For every configured node the
# script checks API reachability, Talos version, maintenance mode, service
# state and etcd state, then summarises whether the cluster looks bootstrapped.
#
# Environment:
#   TALOSCONFIG  Path to the talosctl client configuration.
#                Default: testing1/.talosconfig (relative to the current
#                working directory).
set -euo pipefail

# Configuration
# Addresses of the control-plane nodes to inspect. Read-only so that no later
# code can silently change which nodes are being reported on.
readonly -a CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
# Honour a caller-supplied TALOSCONFIG; otherwise fall back to the default.
TALOSCONFIG="${TALOSCONFIG:-testing1/.talosconfig}"

# Colors for output. These are stored as unexpanded ANSI escape sequences and
# are turned into real escape characters by the logging helpers when printed.
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # "no colour": resets terminal attributes
#######################################
# Print an informational message prefixed with a blue "[INFO]" tag.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text, printed verbatim (optional; defaults to empty)
# Outputs:   one line on stdout
#######################################
log_info() {
  # %b expands the escape sequences stored in the colour variables, while %s
  # prints the message literally, so backslashes inside it (for example in a
  # file path) are not interpreted the way `echo -e` would interpret them.
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"
}
#######################################
# Print a success message prefixed with a green "[SUCCESS]" tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim (optional; defaults to empty)
# Outputs:   one line on stdout
#######################################
log_success() {
  # %b expands the colour escape sequences; %s keeps the message literal so
  # backslashes in it are not interpreted as escapes.
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"
}
#######################################
# Print a warning message prefixed with a yellow "[WARNING]" tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim (optional; defaults to empty)
# Outputs:   one line on stdout
#######################################
log_warning() {
  # %b expands the colour escape sequences; %s keeps the message literal so
  # backslashes in it are not interpreted as escapes.
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}"
}
#######################################
# Print an error message prefixed with a red "[ERROR]" tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim (optional; defaults to empty)
# Outputs:   one line on stdout (kept on stdout so the message stays in order
#            with the rest of the status report)
#######################################
log_error() {
  # %b expands the colour escape sequences; %s keeps the message literal so
  # backslashes in it are not interpreted as escapes.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}"
}
# Preflight: every check below shells out to talosctl. Without it each probe
# would fail and every node would be misreported as unreachable, so stop early
# with an explicit message instead.
if ! command -v talosctl > /dev/null 2>&1; then
  log_error "talosctl not found in PATH"
  log_info "Install talosctl and re-run this script"
  exit 1
fi

# Check if talosconfig exists
if [[ ! -f "$TALOSCONFIG" ]]; then
  log_error "TALOSCONFIG not found at: $TALOSCONFIG"
  log_info "Have you run ./bootstrap-cluster.sh yet?"
  exit 1
fi
# Export so that every talosctl invocation below uses the same client config.
export TALOSCONFIG

echo "=========================================="
echo "Talos Cluster Status Check"
echo "=========================================="
echo ""
# Check each node

#######################################
# Report that a node is still in maintenance mode and print the command that
# applies its machine configuration.
# Arguments: $1 - node address
# Outputs:   one error line and two info lines on stdout (via log_*)
#######################################
_report_maintenance_mode() {
  local node_ip=$1
  log_error "Node is IN MAINTENANCE MODE - configuration not applied!"
  log_info "To apply config, run:"
  log_info " talosctl apply-config --insecure --nodes $node_ip --file testing1/controlplane-${node_ip}.yaml"
}

#######################################
# Print a health report for a single control-plane node: reachability, Talos
# version, maintenance mode, service list, etcd service state and etcd members.
# Arguments: $1 - node address
# Outputs:   human-readable report on stdout
# Returns:   0 always; problems are reported in the output, not via the status
#######################################
check_node() {
  local node_ip=$1
  local services_out services_rc=0
  local etcd_out members_out member_rows=""

  echo "==================== Node: $node_ip ===================="

  # Check if node is accessible
  log_info "Checking if node is accessible..."
  if talosctl --nodes "$node_ip" version &> /dev/null; then
    log_success "Node is accessible"
  elif talosctl --nodes "$node_ip" version --insecure &> /dev/null; then
    # The authenticated call failed but the maintenance API (--insecure)
    # answers: the node is up and simply has not received its configuration.
    # Without this probe such a node would be reported as unreachable and the
    # maintenance-mode hint would never be shown.
    _report_maintenance_mode "$node_ip"
    echo ""
    return 0
  else
    log_error "Node is NOT accessible"
    echo ""
    return 0
  fi

  # Check version
  echo ""
  log_info "Talos version:"
  talosctl --nodes "$node_ip" version --short 2>&1 || log_error "Could not get version"

  # Check if in maintenance mode
  echo ""
  log_info "Checking if node is in maintenance mode..."
  if talosctl --nodes "$node_ip" get services &> /dev/null; then
    log_success "Node is OUT of maintenance mode (configured)"
  else
    _report_maintenance_mode "$node_ip"
  fi

  # Check services
  echo ""
  log_info "Service status:"
  # Capture first, truncate afterwards: piping talosctl straight into `head`
  # under `pipefail` turns a SIGPIPE on long output into a bogus failure.
  services_out=$(talosctl --nodes "$node_ip" services 2>&1) || services_rc=$?
  if [[ -n "$services_out" ]]; then
    head -n 20 <<< "$services_out"
  fi
  if ((services_rc != 0)); then
    log_error "Could not get services"
  fi

  # Check etcd status
  echo ""
  log_info "etcd status:"
  # Query once and reuse the output, so the verdict and the printed STATE line
  # come from the same snapshot and `grep -q` cannot SIGPIPE talosctl.
  if etcd_out=$(talosctl --nodes "$node_ip" service etcd status 2>&1) \
    && grep -q "STATE.*Running" <<< "$etcd_out"; then
    log_success "etcd is RUNNING"
    grep "STATE" <<< "$etcd_out"
  else
    log_warning "etcd is NOT running"
    grep "STATE" <<< "$etcd_out" || log_info "etcd not initialized yet"
  fi

  # Check if etcd members exist
  echo ""
  log_info "etcd members:"
  # `talosctl etcd members` lists the etcd cluster membership; `get members`
  # would list cluster-discovery members, which can exist before bootstrap.
  if members_out=$(talosctl --nodes "$node_ip" etcd members 2>&1); then
    # Drop the table header and any "not found" notices; what is left are rows.
    member_rows=$(grep -v -e "^NODE" -e "not found" <<< "$members_out") || true
  fi
  if [[ -n "$member_rows" ]]; then
    printf '%s\n' "$member_rows"
    log_success "etcd members found"
  else
    log_warning "No etcd members - cluster needs bootstrap"
  fi
  echo ""
}

for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
  check_node "$NODE_IP"
done
# Overall cluster status

# Directory holding the temporary kubeconfig while it is in use; empty when
# nothing needs cleaning up.
KUBECONFIG_TMP_DIR=""

#######################################
# Remove the temporary kubeconfig directory, if one exists.
# Globals:   KUBECONFIG_TMP_DIR (read, reset to empty)
#######################################
cleanup_kubeconfig() {
  if [[ -n "${KUBECONFIG_TMP_DIR:-}" ]]; then
    rm -rf -- "$KUBECONFIG_TMP_DIR"
    KUBECONFIG_TMP_DIR=""
  fi
}

#######################################
# Find the first control-plane node whose etcd service reports STATE Running.
# Globals:   CONTROL_PLANE_NODES (read)
# Outputs:   the node address on stdout; nothing when no node qualifies
# Returns:   0 if a node was found, 1 otherwise
#######################################
find_etcd_node() {
  local node status_out
  for node in "${CONTROL_PLANE_NODES[@]}"; do
    # Capture the output before matching: with `pipefail`, `talosctl | grep -q`
    # can fail through SIGPIPE even though the pattern matched.
    if status_out=$(talosctl --nodes "$node" service etcd status 2>&1) \
      && grep -q "STATE.*Running" <<< "$status_out"; then
      printf '%s\n' "$node"
      return 0
    fi
  done
  return 1
}

#######################################
# Fetch a kubeconfig from the given node and list the Kubernetes nodes.
# The kubeconfig carries cluster credentials, so it is written to a private
# temporary directory that is removed afterwards, including when the script
# exits early.
# Globals:   KUBECONFIG_TMP_DIR (written)
# Arguments: $1 - node address to fetch the kubeconfig from
# Outputs:   progress messages and the `kubectl get nodes` listing on stdout
# Returns:   0 always; failures are reported in the output
#######################################
show_kubernetes_nodes() {
  local source_node=$1
  local kubeconfig_path

  if ! KUBECONFIG_TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/talos-kubeconfig.XXXXXX"); then
    KUBECONFIG_TMP_DIR=""
    log_warning "Could not retrieve kubeconfig"
    return 0
  fi
  trap cleanup_kubeconfig EXIT
  kubeconfig_path="$KUBECONFIG_TMP_DIR/kubeconfig"

  if talosctl kubeconfig --nodes "$source_node" "$kubeconfig_path" --force 2>&1; then
    log_success "Kubeconfig retrieved successfully"
    log_info "Kubernetes node status:"
    KUBECONFIG="$kubeconfig_path" kubectl get nodes 2>&1 || log_error "Could not connect to Kubernetes"
  else
    log_warning "Could not retrieve kubeconfig"
  fi
  cleanup_kubeconfig
}

#######################################
# Print the cluster-wide summary: whether etcd is running anywhere and, if so,
# the Kubernetes node list; otherwise the steps needed to bootstrap.
# Globals:   CONTROL_PLANE_NODES (read), ETCD_RUNNING (written: true/false)
# Outputs:   human-readable summary on stdout
#######################################
report_cluster_status() {
  local etcd_node node

  echo "==================== Overall Cluster Status ===================="
  # Check if any node has etcd running
  ETCD_RUNNING=false
  if etcd_node=$(find_etcd_node); then
    ETCD_RUNNING=true
  fi
  echo ""
  if [[ "$ETCD_RUNNING" == true ]]; then
    log_success "Cluster appears to be bootstrapped (etcd running)"
    # Try to get kubeconfig from the node that was just seen running etcd,
    # rather than always the first node, which may be the one that is down.
    echo ""
    log_info "Attempting to retrieve kubeconfig..."
    show_kubernetes_nodes "$etcd_node"
  else
    log_warning "Cluster is NOT bootstrapped yet"
    log_info ""
    log_info "Next steps:"
    log_info "1. Ensure all nodes are out of maintenance mode (see checks above)"
    log_info "2. If nodes are in maintenance mode, apply configs:"
    for node in "${CONTROL_PLANE_NODES[@]}"; do
      log_info " talosctl apply-config --insecure --nodes $node --file testing1/controlplane-${node}.yaml"
    done
    log_info "3. Wait for nodes to reboot and become ready (~2-5 minutes)"
    log_info "4. Bootstrap the cluster:"
    log_info " talosctl bootstrap --nodes ${CONTROL_PLANE_NODES[0]}"
  fi
  echo ""
  echo "=========================================="
}

report_cluster_status