#!/usr/bin/env bash
#
# Talos cluster status check.
#
# Probes every control-plane node with talosctl (reachability, version,
# maintenance mode, services, etcd) and then summarises whether the cluster
# looks bootstrapped. When etcd is running on at least one node, a kubeconfig
# is fetched into a private temporary directory and `kubectl get nodes` is run.
#
# Environment:
#   TALOSCONFIG  Path to the talosconfig file (default: testing1/.talosconfig).
#
# Exit status:
#   1 when the talosconfig file or the talosctl binary is missing.
#   Per-node problems are reported in the output and do not abort the script.

set -euo pipefail

# Configuration
readonly CLUSTER_DIR="testing1"
readonly CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
TALOSCONFIG="${TALOSCONFIG:-${CLUSTER_DIR}/.talosconfig}"

# Colors for output (expanded by printf's %b)
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly NC='\033[0m'

# Set to "true" by check_node as soon as one node reports etcd as Running.
ETCD_RUNNING=false
# Temporary directory holding the fetched kubeconfig; removed by cleanup().
KUBECONFIG_TMP_DIR=""

# Logging helpers. $1 is the message; it is printed verbatim (no escape
# processing) after a coloured severity tag.
log_info() {
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"
}

log_success() {
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"
}

log_warning() {
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}"
}

log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}"
}

# EXIT trap: remove the temporary kubeconfig directory, if one was created.
cleanup() {
  if [[ -n "$KUBECONFIG_TMP_DIR" ]]; then
    rm -rf -- "$KUBECONFIG_TMP_DIR"
  fi
}

# Prints the apply-config command for one node.
# Arguments: $1 - node IP
print_apply_config_hint() {
  log_info " talosctl apply-config --insecure --nodes $1 --file ${CLUSTER_DIR}/controlplane-$1.yaml"
}

#######################################
# Runs all per-node checks and prints the results.
# Globals:   ETCD_RUNNING (set to true when etcd is Running on this node)
# Arguments: $1 - node IP
# Returns:   0 always; failures are reported in the output.
#######################################
check_node() {
  local node_ip=$1

  echo "==================== Node: $node_ip ===================="

  # Check if node is accessible
  log_info "Checking if node is accessible..."
  if talosctl --nodes "$node_ip" version &> /dev/null; then
    log_success "Node is accessible"
  else
    log_error "Node is NOT accessible"
    echo ""
    return 0
  fi

  # Check version
  echo ""
  log_info "Talos version:"
  talosctl --nodes "$node_ip" version --short 2>&1 || log_error "Could not get version"

  # Check if in maintenance mode
  echo ""
  log_info "Checking if node is in maintenance mode..."
  if talosctl --nodes "$node_ip" get services &> /dev/null; then
    log_success "Node is OUT of maintenance mode (configured)"
  else
    log_error "Node is IN MAINTENANCE MODE - configuration not applied!"
    log_info "To apply config, run:"
    print_apply_config_hint "$node_ip"
  fi

  # Check services.
  # The output is captured before truncation: piping talosctl straight into
  # `head` under pipefail reports a failure whenever head exits early and
  # talosctl is killed by SIGPIPE.
  echo ""
  log_info "Service status:"
  local services_out services_rc=0
  services_out=$(talosctl --nodes "$node_ip" services 2>&1) || services_rc=$?
  if [[ -n "$services_out" ]]; then
    head -n 20 <<< "$services_out"
  fi
  (( services_rc == 0 )) || log_error "Could not get services"

  # Check etcd status.
  # Queried once and captured; `grep -q` on a live pipe has the same
  # SIGPIPE/pipefail problem as `head` above.
  echo ""
  log_info "etcd status:"
  local etcd_out etcd_rc=0 state_lines
  etcd_out=$(talosctl --nodes "$node_ip" service etcd status 2>&1) || etcd_rc=$?
  state_lines=$(grep 'STATE' <<< "$etcd_out") || true
  if (( etcd_rc == 0 )) && grep -q 'STATE.*Running' <<< "$etcd_out"; then
    ETCD_RUNNING=true
    log_success "etcd is RUNNING"
    printf '%s\n' "$state_lines"
  else
    log_warning "etcd is NOT running"
    if [[ -n "$state_lines" ]]; then
      printf '%s\n' "$state_lines"
    else
      log_info "etcd not initialized yet"
    fi
  fi

  # Check if etcd members exist.
  # NOTE(review): this queries `talosctl get members`; if that resource is not
  # the etcd member list, `talosctl etcd members` may be the intended command.
  # Confirm before relying on this check.
  echo ""
  log_info "etcd members:"
  local members_out members_rc=0 member_rows
  members_out=$(talosctl --nodes "$node_ip" get members 2>&1) || members_rc=$?
  # Drop the table header and "not found" lines; grep exits 1 when nothing
  # remains, which is an expected outcome here.
  member_rows=$(grep -v '^NODE' <<< "$members_out" | grep -v 'not found') || true
  if [[ -n "$member_rows" ]]; then
    printf '%s\n' "$member_rows"
  fi
  if (( members_rc == 0 )) && [[ -n "$member_rows" ]]; then
    log_success "etcd members found"
  else
    log_warning "No etcd members - cluster needs bootstrap"
  fi

  echo ""
}

#######################################
# Fetches a kubeconfig from the first control-plane node into a private
# temporary directory and lists the Kubernetes nodes with it.
# Globals:   KUBECONFIG_TMP_DIR (written), CONTROL_PLANE_NODES (read)
# Returns:   0 always; failures are reported in the output.
#######################################
show_kubernetes_nodes() {
  log_info "Attempting to retrieve kubeconfig..."

  # A mktemp directory keeps the credentials out of the working directory and
  # avoids overwriting/deleting an unrelated ./kubeconfig-test file.
  if ! KUBECONFIG_TMP_DIR=$(mktemp -d); then
    KUBECONFIG_TMP_DIR=""
    log_warning "Could not retrieve kubeconfig"
    return 0
  fi
  local kubeconfig_path="${KUBECONFIG_TMP_DIR}/kubeconfig"

  if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "$kubeconfig_path" --force 2>&1; then
    log_success "Kubeconfig retrieved successfully"
    log_info "Kubernetes node status:"
    KUBECONFIG="$kubeconfig_path" kubectl get nodes 2>&1 \
      || log_error "Could not connect to Kubernetes"
  else
    log_warning "Could not retrieve kubeconfig"
  fi
}

# Prints the manual steps needed to bootstrap the cluster.
print_bootstrap_steps() {
  local node_ip

  log_info ""
  log_info "Next steps:"
  log_info "1. Ensure all nodes are out of maintenance mode (see checks above)"
  log_info "2. If nodes are in maintenance mode, apply configs:"
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    print_apply_config_hint "$node_ip"
  done
  log_info "3. Wait for nodes to reboot and become ready (~2-5 minutes)"
  log_info "4. Bootstrap the cluster:"
  log_info " talosctl bootstrap --nodes ${CONTROL_PLANE_NODES[0]}"
}

main() {
  local node_ip

  # Check if talosconfig exists
  if [[ ! -f "$TALOSCONFIG" ]]; then
    log_error "TALOSCONFIG not found at: $TALOSCONFIG"
    log_info "Have you run ./bootstrap-cluster.sh yet?"
    exit 1
  fi

  # Without this check a missing binary would make every node look unreachable.
  if ! command -v talosctl > /dev/null 2>&1; then
    log_error "talosctl not found in PATH"
    exit 1
  fi

  export TALOSCONFIG
  trap cleanup EXIT

  echo "=========================================="
  echo "Talos Cluster Status Check"
  echo "=========================================="
  echo ""

  # Check each node
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    check_node "$node_ip"
  done

  # Overall cluster status; ETCD_RUNNING was set by the per-node checks.
  echo "==================== Overall Cluster Status ===================="
  echo ""
  if [[ "$ETCD_RUNNING" == true ]]; then
    log_success "Cluster appears to be bootstrapped (etcd running)"
    echo ""
    show_kubernetes_nodes
  else
    log_warning "Cluster is NOT bootstrapped yet"
    print_bootstrap_steps
  fi

  echo ""
  echo "=========================================="
}

main "$@"