From 6c292da5f1216a89aba49237a4700e134d4b09e9 Mon Sep 17 00:00:00 2001 From: 0xWheatyz Date: Wed, 4 Mar 2026 01:53:05 +0000 Subject: [PATCH] feat(scripts): add cluster bootstrap and status scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add automated scripts for Talos cluster management: bootstrap-cluster.sh: - Automated cluster bootstrap from scratch - Generates Talos secrets and machine configs - Applies configs to all nodes (10.0.1.3-5) - Bootstraps etcd and retrieves kubeconfig - Verifies cluster health check-cluster-status.sh: - Comprehensive cluster health diagnostics - Checks Talos services, etcd, and Kubernetes components - Displays node status and running pods - Useful for troubleshooting bootstrap issues 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- bootstrap-cluster.sh | 367 ++++++++++++++++++++++++++++++++++++++++ check-cluster-status.sh | 148 ++++++++++++++++ 2 files changed, 515 insertions(+) create mode 100755 bootstrap-cluster.sh create mode 100755 check-cluster-status.sh diff --git a/bootstrap-cluster.sh b/bootstrap-cluster.sh new file mode 100755 index 0000000..3d0a56d --- /dev/null +++ b/bootstrap-cluster.sh @@ -0,0 +1,367 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Configuration +CLUSTER_NAME="talos-cluster" +CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5") +CLUSTER_ENDPOINT="https://10.0.1.3:6443" +KUBERNETES_VERSION="1.33.0" +OUTPUT_DIR="testing1" + +# Colors for output +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check prerequisites +check_prerequisites() { + log_info "Checking prerequisites..." + + if ! 
command -v talosctl &> /dev/null; then + log_error "talosctl not found. Please run 'nix-shell' first." + exit 1 + fi + + if ! command -v kubectl &> /dev/null; then + log_error "kubectl not found. Please run 'nix-shell' first." + exit 1 + fi + + log_success "All prerequisites met" +} + +# Generate Talos secrets and configurations +generate_configs() { + log_info "Generating Talos secrets for cluster: ${CLUSTER_NAME}" + + # Create output directory if it doesn't exist + mkdir -p "${OUTPUT_DIR}" + + # Generate secrets + talosctl gen secrets --force -o "${OUTPUT_DIR}/secrets.yaml" + log_success "Secrets generated" + + # Generate configs for all 3 control plane nodes + log_info "Generating machine configurations..." + + for i in "${!CONTROL_PLANE_NODES[@]}"; do + NODE_IP="${CONTROL_PLANE_NODES[$i]}" + log_info "Generating config for control plane node: ${NODE_IP}" + + talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \ + --with-secrets "${OUTPUT_DIR}/secrets.yaml" \ + --kubernetes-version="${KUBERNETES_VERSION}" \ + --output-types controlplane \ + --output "${OUTPUT_DIR}/controlplane-${NODE_IP}.yaml" \ + --force \ + --config-patch @<(cat < /dev/null 2>&1; then + log_success "Node ${NODE_IP} is responding" + break + fi + + attempt=$((attempt + 1)) + sleep 5 + done + + if [ $attempt -eq $max_attempts ]; then + log_error "Node ${NODE_IP} did not become accessible in time" + exit 1 + fi + done + + # Wait for all nodes to be out of maintenance mode and services ready + log_info "Checking that all nodes are out of maintenance mode..." + + for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do + local max_attempts=60 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + log_info "Checking services on ${NODE_IP} (attempt $((attempt + 1))/${max_attempts})..." 
+ + # Get service state - if this succeeds, node is configured + if talosctl --nodes "${NODE_IP}" get services 2>&1 | grep -q "apid"; then + log_success "Node ${NODE_IP} is out of maintenance mode" + break + fi + + attempt=$((attempt + 1)) + sleep 5 + done + + if [ $attempt -eq $max_attempts ]; then + log_error "Node ${NODE_IP} did not exit maintenance mode" + log_error "Try checking node console or running: talosctl --nodes ${NODE_IP} get services" + exit 1 + fi + done + + # Additional wait to ensure etcd service is ready for bootstrap + log_info "Waiting for etcd to be ready for bootstrap on ${CONTROL_PLANE_NODES[0]}..." + sleep 10 + + log_success "All nodes are ready for bootstrapping" +} + +# Check if etcd is already bootstrapped +check_etcd_status() { + export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig" + + log_info "Checking if etcd is already bootstrapped..." + + # Check if etcd service is running + if talosctl --nodes "${CONTROL_PLANE_NODES[0]}" service etcd status 2>&1 | grep -q "STATE.*Running"; then + log_warning "etcd is already running - cluster appears to be bootstrapped" + return 1 + fi + + return 0 +} + +# Bootstrap etcd on the first control plane node +bootstrap_cluster() { + log_info "Bootstrapping etcd on first control plane node: ${CONTROL_PLANE_NODES[0]}" + + export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig" + + # Check if already bootstrapped + if ! check_etcd_status; then + log_warning "Skipping bootstrap as cluster is already bootstrapped" + return 0 + fi + + # Verify the node is ready for bootstrap + log_info "Verifying node ${CONTROL_PLANE_NODES[0]} is ready for bootstrap..." + if ! talosctl --nodes "${CONTROL_PLANE_NODES[0]}" get members &> /dev/null; then + log_warning "etcd members not yet initialized, proceeding with bootstrap..." + fi + + # Perform bootstrap + log_info "Running bootstrap command..." 
+ if talosctl bootstrap --nodes "${CONTROL_PLANE_NODES[0]}"; then + log_success "Bootstrap command executed successfully" + else + log_error "Failed to bootstrap etcd" + log_error "This may be because:" + log_error " 1. The node is still in maintenance mode (check with: talosctl --nodes ${CONTROL_PLANE_NODES[0]} get services)" + log_error " 2. The configuration was not properly applied" + log_error " 3. etcd is already bootstrapped" + exit 1 + fi + + # Wait for etcd to come up + log_info "Waiting for etcd to start..." + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + if talosctl --nodes "${CONTROL_PLANE_NODES[0]}" service etcd status 2>&1 | grep -q "STATE.*Running"; then + log_success "etcd is running" + break + fi + + attempt=$((attempt + 1)) + sleep 5 + done + + if [ $attempt -eq $max_attempts ]; then + log_warning "etcd did not start in expected time, but continuing..." + fi + + log_info "Waiting for Kubernetes to initialize..." + sleep 30 +} + +# Retrieve kubeconfig +get_kubeconfig() { + log_info "Retrieving kubeconfig..." + + export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig" + + local max_attempts=20 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + log_info "Attempting to retrieve kubeconfig (attempt $((attempt + 1))/${max_attempts})..." + + if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "${OUTPUT_DIR}/kubeconfig" --force; then + log_success "Kubeconfig saved to ${OUTPUT_DIR}/kubeconfig" + break + fi + + attempt=$((attempt + 1)) + sleep 10 + done + + if [ $attempt -eq $max_attempts ]; then + log_error "Failed to retrieve kubeconfig" + exit 1 + fi +} + +# Verify cluster health +verify_cluster() { + log_info "Verifying cluster health..." + + export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig" + export KUBECONFIG="${OUTPUT_DIR}/kubeconfig" + + log_info "Checking Talos health..." 
+ if talosctl health --wait-timeout 5m; then + log_success "Talos cluster is healthy" + else + log_warning "Talos health check reported issues" + fi + + log_info "Checking Kubernetes nodes..." + kubectl get nodes -o wide + + log_info "Checking system pods..." + kubectl get pods -A + + log_success "Cluster verification complete" +} + +# Print summary +print_summary() { + echo "" + echo "==========================================" + log_success "Talos Cluster Bootstrap Complete!" + echo "==========================================" + echo "" + echo "Cluster Name: ${CLUSTER_NAME}" + echo "Control Plane Nodes:" + for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do + echo " - ${NODE_IP}" + done + echo "" + echo "Configuration Files:" + echo " - TALOSCONFIG: ${OUTPUT_DIR}/.talosconfig" + echo " - KUBECONFIG: ${OUTPUT_DIR}/kubeconfig" + echo "" + echo "To use the cluster, export these variables:" + echo " export TALOSCONFIG=\"\$(pwd)/${OUTPUT_DIR}/.talosconfig\"" + echo " export KUBECONFIG=\"\$(pwd)/${OUTPUT_DIR}/kubeconfig\"" + echo "" + echo "Or run: nix-shell (which sets these automatically)" + echo "" + echo "Useful commands:" + echo " talosctl health" + echo " kubectl get nodes" + echo " kubectl get pods -A" + echo "==========================================" +} + +# Main execution +main() { + log_info "Starting Talos Cluster Bootstrap" + log_info "Cluster: ${CLUSTER_NAME}" + log_info "Nodes: ${CONTROL_PLANE_NODES[*]}" + echo "" + + check_prerequisites + generate_configs + apply_configs + wait_for_nodes + bootstrap_cluster + get_kubeconfig + verify_cluster + print_summary +} + +# Run main function +main diff --git a/check-cluster-status.sh b/check-cluster-status.sh new file mode 100755 index 0000000..f0dff10 --- /dev/null +++ b/check-cluster-status.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Configuration +CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5") +TALOSCONFIG="${TALOSCONFIG:-testing1/.talosconfig}" + +# Colors for output 
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+
+# Logging helpers: print a colored severity tag followed by the message ($1).
+# Only the tag goes through %b, so backslashes in a message are printed as-is.
+log_info() { printf '%b %s\n' "${BLUE}[INFO]${NC}" "${1-}"; }
+log_success() { printf '%b %s\n' "${GREEN}[SUCCESS]${NC}" "${1-}"; }
+log_warning() { printf '%b %s\n' "${YELLOW}[WARNING]${NC}" "${1-}"; }
+log_error() { printf '%b %s\n' "${RED}[ERROR]${NC}" "${1-}"; }
+
+# Check if talosconfig exists
+if [[ ! -f "$TALOSCONFIG" ]]; then
+  log_error "TALOSCONFIG not found at: $TALOSCONFIG"
+  log_info "Have you run ./bootstrap-cluster.sh yet?"
+  exit 1
+fi
+
+export TALOSCONFIG
+
+# Scratch directory for the temporary kubeconfig (it holds cluster admin
+# credentials); the EXIT trap removes it when the script exits.
+SCRATCH_DIR="$(mktemp -d)"
+trap 'rm -rf -- "$SCRATCH_DIR"' EXIT
+KUBECONFIG_TMP="${SCRATCH_DIR}/kubeconfig"
+
+# Set to true as soon as any node reports a running etcd service.
+ETCD_RUNNING=false
+
+echo "=========================================="
+echo "Talos Cluster Status Check"
+echo "=========================================="
+echo ""
+
+# Check each node; unreachable nodes are reported and skipped so one dead
+# machine does not hide the status of the others
+for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+  echo "==================== Node: $NODE_IP ===================="
+
+  # Check if node is accessible
+  log_info "Checking if node is accessible..."
+  if talosctl --nodes "$NODE_IP" version &> /dev/null; then
+    log_success "Node is accessible"
+  else
+    log_error "Node is NOT accessible"
+    echo ""
+    continue
+  fi
+
+  # Check version
+  echo ""
+  log_info "Talos version:"
+  talosctl --nodes "$NODE_IP" version --short 2>&1 || log_error "Could not get version"
+
+  # Check if in maintenance mode (this script treats a successful
+  # `get services` call as proof that the machine config was applied)
+  echo ""
+  log_info "Checking if node is in maintenance mode..."
+  if talosctl --nodes "$NODE_IP" get services &> /dev/null; then
+    log_success "Node is OUT of maintenance mode (configured)"
+  else
+    log_error "Node is IN MAINTENANCE MODE - configuration not applied!"
+    log_info "To apply config, run:"
+    log_info "  talosctl apply-config --insecure --nodes $NODE_IP --file testing1/controlplane-${NODE_IP}.yaml"
+  fi
+
+  # Check services (awk drains the whole stream, so talosctl never receives
+  # SIGPIPE and pipefail cannot report a false failure the way `head` could)
+  echo ""
+  log_info "Service status:"
+  talosctl --nodes "$NODE_IP" services 2>&1 | awk 'NR <= 20' || log_error "Could not get services"
+
+  # Check etcd status. The output is captured once and reused: piping talosctl
+  # straight into `grep -q` can trip pipefail (SIGPIPE once grep exits early),
+  # and a second unguarded query could abort the script under `set -e`.
+  echo ""
+  log_info "etcd status:"
+  ETCD_STATUS="$(talosctl --nodes "$NODE_IP" service etcd status 2>&1 || true)"
+  if grep -q "STATE.*Running" <<< "$ETCD_STATUS"; then
+    ETCD_RUNNING=true
+    log_success "etcd is RUNNING"
+    grep "STATE" <<< "$ETCD_STATUS"
+  else
+    log_warning "etcd is NOT running"
+    grep "STATE" <<< "$ETCD_STATUS" || log_info "etcd not initialized yet"
+  fi
+
+  # Check if etcd members exist. `talosctl get members` lists cluster discovery
+  # members rather than etcd membership, so query etcd directly and rely on the
+  # exit status instead of filtering error text with grep.
+  echo ""
+  log_info "etcd members:"
+  if talosctl --nodes "$NODE_IP" etcd members 2>&1; then
+    log_success "etcd members found"
+  else
+    log_warning "No etcd members - cluster needs bootstrap"
+  fi
+
+  echo ""
+done
+
+# Overall cluster status (ETCD_RUNNING was recorded during the per-node checks,
+# so no node has to be queried a second time)
+echo "==================== Overall Cluster Status ===================="
+
+echo ""
+if [[ "$ETCD_RUNNING" == true ]]; then
+  log_success "Cluster appears to be bootstrapped (etcd running)"
+
+  # Try to get kubeconfig (written into the scratch directory so nothing in the
+  # working directory is overwritten or left behind)
+  echo ""
+  log_info "Attempting to retrieve kubeconfig..."
+  if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "$KUBECONFIG_TMP" --force 2>&1; then
+    log_success "Kubeconfig retrieved successfully"
+
+    log_info "Kubernetes node status:"
+    KUBECONFIG="$KUBECONFIG_TMP" kubectl get nodes 2>&1 || log_error "Could not connect to Kubernetes"
+  else
+    log_warning "Could not retrieve kubeconfig"
+  fi
+else
+  log_warning "Cluster is NOT bootstrapped yet"
+  log_info ""
+  log_info "Next steps:"
+  log_info "1. Ensure all nodes are out of maintenance mode (see checks above)"
+  log_info "2. If nodes are in maintenance mode, apply configs:"
+  for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+    log_info "   talosctl apply-config --insecure --nodes $NODE_IP --file testing1/controlplane-${NODE_IP}.yaml"
+  done
+  log_info "3. Wait for nodes to reboot and become ready (~2-5 minutes)"
+  log_info "4. Bootstrap the cluster:"
+  log_info "   talosctl bootstrap --nodes ${CONTROL_PLANE_NODES[0]}"
+fi
+
+echo ""
+echo "=========================================="