Add automated scripts for Talos cluster management:

bootstrap-cluster.sh:
- Automated cluster bootstrap from scratch
- Generates Talos secrets and machine configs
- Applies configs to all nodes (10.0.1.3-5)
- Bootstraps etcd and retrieves kubeconfig
- Verifies cluster health

check-cluster-status.sh:
- Comprehensive cluster health diagnostics
- Checks Talos services, etcd, and Kubernetes components
- Displays node status and running pods
- Useful for troubleshooting bootstrap issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
149 lines
4.3 KiB
Bash
Executable File
149 lines
4.3 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Talos cluster status check: prints per-node diagnostics for every control
# plane node, followed by a cluster-wide summary.
#
# Environment:
#   TALOSCONFIG  Path to the talosctl client configuration
#                (default: testing1/.talosconfig, relative to the current
#                working directory).

set -euo pipefail

# Configuration
# IP addresses of the control plane nodes to inspect.
readonly -a CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
# Honour a caller-supplied TALOSCONFIG, otherwise fall back to the default.
TALOSCONFIG="${TALOSCONFIG:-testing1/.talosconfig}"

# Colors for output
# Stored as backslash escape text (not raw ESC bytes); whatever prints them
# must expand the escapes (e.g. `echo -e` or printf's %b).
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly NC='\033[0m'
|
|
|
|
#######################################
# Print an informational message prefixed with a blue [INFO] tag.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
#######################################
log_info() {
  # %b expands the escape sequences held in the colour variables; %s keeps the
  # message literal so backslashes inside it are not reinterpreted.
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"
}
|
|
|
|
#######################################
# Print a success message prefixed with a green [SUCCESS] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
#######################################
log_success() {
  # %b expands the escape sequences held in the colour variables; %s keeps the
  # message literal so backslashes inside it are not reinterpreted.
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1"
}
|
|
|
|
#######################################
# Print a warning message prefixed with a yellow [WARNING] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout (kept on stdout so the report stays in order)
#######################################
log_warning() {
  # %b expands the escape sequences held in the colour variables; %s keeps the
  # message literal so backslashes inside it are not reinterpreted.
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
|
|
|
|
#######################################
# Print an error message prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout (kept on stdout so the report stays in order)
#######################################
log_error() {
  # %b expands the escape sequences held in the colour variables; %s keeps the
  # message literal so backslashes inside it are not reinterpreted.
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1"
}
|
|
|
|
# Preflight checks: abort early with a clear message instead of letting every
# later probe fail for the same underlying reason.

# Check if talosconfig exists
if [[ ! -f "$TALOSCONFIG" ]]; then
  log_error "TALOSCONFIG not found at: $TALOSCONFIG"
  log_info "Have you run ./bootstrap-cluster.sh yet?"
  exit 1
fi

# Without the talosctl binary every node probe below would fail and each node
# would be reported as unreachable, which hides the real problem.
if ! command -v talosctl > /dev/null 2>&1; then
  log_error "talosctl not found in PATH"
  exit 1
fi

# Export so every talosctl invocation below picks up the same config file.
export TALOSCONFIG

echo "=========================================="
echo "Talos Cluster Status Check"
echo "=========================================="
echo ""
|
|
|
|
#######################################
# Print a health report for a single Talos node: reachability, Talos version,
# maintenance-mode probe, service list, etcd service state and member list.
# Globals:   none written; calls log_info/log_success/log_warning/log_error
# Arguments: $1 - IP address of the node to inspect
# Outputs:   human-readable report on stdout
# Returns:   0; an unreachable node is reported and the remaining checks for
#            it are skipped rather than treated as a script failure
#######################################
check_node() {
  local node=$1
  local services_out services_rc=0
  local etcd_out etcd_rc=0

  echo "==================== Node: $node ===================="

  # Check if node is accessible
  # NOTE(review): this probe uses the authenticated API; a node that is still
  # in maintenance mode may fail here and never reach the maintenance-mode
  # check below - confirm against talosctl behaviour.
  log_info "Checking if node is accessible..."
  if talosctl --nodes "$node" version &> /dev/null; then
    log_success "Node is accessible"
  else
    log_error "Node is NOT accessible"
    echo ""
    return 0
  fi

  # Check version
  echo ""
  log_info "Talos version:"
  talosctl --nodes "$node" version --short 2>&1 || log_error "Could not get version"

  # Check if in maintenance mode
  echo ""
  log_info "Checking if node is in maintenance mode..."
  if talosctl --nodes "$node" get services &> /dev/null; then
    log_success "Node is OUT of maintenance mode (configured)"
  else
    log_error "Node is IN MAINTENANCE MODE - configuration not applied!"
    log_info "To apply config, run:"
    log_info " talosctl apply-config --insecure --nodes $node --file testing1/controlplane-${node}.yaml"
  fi

  # Check services
  # Capture the output before truncating it: piping talosctl straight into
  # `head` lets head close the pipe early, and with `set -o pipefail` the
  # resulting SIGPIPE would be misreported as "Could not get services".
  echo ""
  log_info "Service status:"
  services_out=$(talosctl --nodes "$node" services 2>&1) || services_rc=$?
  if [[ -n "$services_out" ]]; then
    head -n 20 <<< "$services_out"
  fi
  if ((services_rc != 0)); then
    log_error "Could not get services"
  fi

  # Check etcd status
  # Queried once and matched from the captured text, for the same pipefail /
  # early-exit reason as above (`grep -q` stops reading at the first match).
  echo ""
  log_info "etcd status:"
  etcd_out=$(talosctl --nodes "$node" service etcd status 2>&1) || etcd_rc=$?
  if ((etcd_rc == 0)) && grep -q "STATE.*Running" <<< "$etcd_out"; then
    log_success "etcd is RUNNING"
    grep "STATE" <<< "$etcd_out"
  else
    log_warning "etcd is NOT running"
    # Show whatever state was reported; add the hint when there is none or
    # when the query itself failed.
    if ! grep "STATE" <<< "$etcd_out" || ((etcd_rc != 0)); then
      log_info "etcd not initialized yet"
    fi
  fi

  # Check if etcd members exist
  # NOTE(review): `talosctl get members` may list cluster discovery members
  # rather than etcd members (`talosctl etcd members`) - confirm which one is
  # intended before relying on this check.
  # Both greps read their whole input, so this pipeline has no early-exit
  # hazard; any surviving line is printed and counts as "members found".
  echo ""
  log_info "etcd members:"
  if talosctl --nodes "$node" get members 2>&1 | grep -v "^NODE" | grep -v "not found"; then
    log_success "etcd members found"
  else
    log_warning "No etcd members - cluster needs bootstrap"
  fi

  echo ""
}

# Check each node
for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
  check_node "$NODE_IP"
done
|
|
|
|
# Overall cluster status
echo "==================== Overall Cluster Status ===================="

#######################################
# Report whether etcd is running on at least one of the given nodes.
# Arguments: node IP addresses to probe, in order
# Returns:   0 as soon as one node reports the etcd service as Running,
#            1 if none do (or no nodes were given)
#######################################
any_etcd_running() {
  local node status_out
  for node in "$@"; do
    # Capture first, then match: piping talosctl straight into `grep -q` lets
    # grep close the pipe at the first match, and under `set -o pipefail` the
    # resulting SIGPIPE could turn a positive match into a failure.
    status_out=$(talosctl --nodes "$node" service etcd status 2>&1) || continue
    if grep -q "STATE.*Running" <<< "$status_out"; then
      return 0
    fi
  done
  return 1
}

# Check if any node has etcd running
ETCD_RUNNING=false
if any_etcd_running "${CONTROL_PLANE_NODES[@]}"; then
  ETCD_RUNNING=true
fi

echo ""
if [[ "$ETCD_RUNNING" == true ]]; then
  log_success "Cluster appears to be bootstrapped (etcd running)"

  # Try to get kubeconfig
  echo ""
  log_info "Attempting to retrieve kubeconfig..."

  # The kubeconfig is written to a private temporary directory instead of a
  # fixed name in the working directory, so no existing file is overwritten
  # and nothing is left behind if the script is interrupted.
  kubeconfig_dir=$(mktemp -d) || {
    log_error "Could not create temporary directory for kubeconfig"
    exit 1
  }
  trap 'rm -rf -- "$kubeconfig_dir"' EXIT

  if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "$kubeconfig_dir/kubeconfig" --force 2>&1; then
    log_success "Kubeconfig retrieved successfully"

    log_info "Kubernetes node status:"
    KUBECONFIG="$kubeconfig_dir/kubeconfig" kubectl get nodes 2>&1 || log_error "Could not connect to Kubernetes"
  else
    log_warning "Could not retrieve kubeconfig"
  fi

  # Normal-path cleanup; the trap above only matters on early exit.
  rm -rf -- "$kubeconfig_dir"
  trap - EXIT
else
  log_warning "Cluster is NOT bootstrapped yet"
  log_info ""
  log_info "Next steps:"
  log_info "1. Ensure all nodes are out of maintenance mode (see checks above)"
  log_info "2. If nodes are in maintenance mode, apply configs:"
  for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
    log_info " talosctl apply-config --insecure --nodes $NODE_IP --file testing1/controlplane-${NODE_IP}.yaml"
  done
  log_info "3. Wait for nodes to reboot and become ready (~2-5 minutes)"
  log_info "4. Bootstrap the cluster:"
  log_info " talosctl bootstrap --nodes ${CONTROL_PLANE_NODES[0]}"
fi

echo ""
echo "=========================================="
|