From 6c292da5f1216a89aba49237a4700e134d4b09e9 Mon Sep 17 00:00:00 2001
From: 0xWheatyz <wyatt@leeworks.dev>
Date: Wed, 4 Mar 2026 01:53:05 +0000
Subject: [PATCH] feat(scripts): add cluster bootstrap and status scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add automated scripts for Talos cluster management:

bootstrap-cluster.sh:
- Automated cluster bootstrap from scratch
- Generates Talos secrets and machine configs
- Applies configs to all nodes (10.0.1.3-5)
- Bootstraps etcd and retrieves kubeconfig
- Verifies cluster health

check-cluster-status.sh:
- Comprehensive cluster health diagnostics
- Checks Talos services, etcd, and Kubernetes components
- Displays node status and running pods
- Useful for troubleshooting bootstrap issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 bootstrap-cluster.sh    | 367 ++++++++++++++++++++++++++++++++++++++++
 check-cluster-status.sh | 148 ++++++++++++++++
 2 files changed, 515 insertions(+)
 create mode 100755 bootstrap-cluster.sh
 create mode 100755 check-cluster-status.sh

diff --git a/bootstrap-cluster.sh b/bootstrap-cluster.sh
new file mode 100755
index 0000000..3d0a56d
--- /dev/null
+++ b/bootstrap-cluster.sh
@@ -0,0 +1,367 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Configuration
+CLUSTER_NAME="talos-cluster"
+CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
+CLUSTER_ENDPOINT="https://10.0.1.3:6443"
+KUBERNETES_VERSION="1.33.0"
+OUTPUT_DIR="testing1"
+
+# Colors for output
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+# Logging helpers: each prints a colored level tag followed by the message
+# given as $1 to stdout, expanding backslash escapes like `echo -e`.
+
+log_info() {
+    printf '%b\n' "${BLUE}[INFO]${NC} $1"
+}
+log_success() {
+    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
+}
+log_warning() {
+    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
+}
+log_error() {
+    printf '%b\n' "${RED}[ERROR]${NC} $1"
+}
+
+# Check prerequisites
+check_prerequisites() {
+    log_info "Checking prerequisites..."
+
+    if ! command -v talosctl &> /dev/null; then
+        log_error "talosctl not found. Please run 'nix-shell' first."
+        exit 1
+    fi
+
+    if ! command -v kubectl &> /dev/null; then
+        log_error "kubectl not found. Please run 'nix-shell' first."
+        exit 1
+    fi
+
+    log_success "All prerequisites met"
+}
+
+# Generate Talos secrets and configurations
+generate_configs() {
+    log_info "Generating Talos secrets for cluster: ${CLUSTER_NAME}"
+
+    # Create output directory if it doesn't exist
+    mkdir -p "${OUTPUT_DIR}"
+
+    # Generate secrets
+    talosctl gen secrets --force -o "${OUTPUT_DIR}/secrets.yaml"
+    log_success "Secrets generated"
+
+    # Generate configs for all 3 control plane nodes
+    log_info "Generating machine configurations..."
+
+    for i in "${!CONTROL_PLANE_NODES[@]}"; do
+        NODE_IP="${CONTROL_PLANE_NODES[$i]}"
+        log_info "Generating config for control plane node: ${NODE_IP}"
+
+        talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
+            --with-secrets "${OUTPUT_DIR}/secrets.yaml" \
+            --kubernetes-version="${KUBERNETES_VERSION}" \
+            --output-types controlplane \
+            --output "${OUTPUT_DIR}/controlplane-${NODE_IP}.yaml" \
+            --force \
+            --config-patch @<(cat <<EOF
+machine:
+  network:
+    hostname: cp-${i}
+  certSANs:
+    - ${NODE_IP}
+    - 10.0.1.3
+    - 10.0.1.4
+    - 10.0.1.5
+cluster:
+  allowSchedulingOnControlPlanes: true
+  controlPlane:
+    endpoint: ${CLUSTER_ENDPOINT}
+EOF
+)
+    done
+
+    # Generate talosconfig
+    talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
+        --with-secrets "${OUTPUT_DIR}/secrets.yaml" \
+        --output-types talosconfig \
+        --force \
+        --output "${OUTPUT_DIR}/.talosconfig"
+
+    # Configure talosctl to use the new config
+    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
+
+    # Add all endpoints to talosconfig
+    talosctl config endpoint "${CONTROL_PLANE_NODES[@]}"
+    talosctl config node "${CONTROL_PLANE_NODES[0]}"
+
+    log_success "All configurations generated in ${OUTPUT_DIR}/"
+}
+
+# Apply the generated machine config to every control plane node.
+# Exits with status 1 as soon as talosctl fails for one node.
+apply_configs() {
+    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
+
+    log_info "Applying configurations to nodes..."
+
+    for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+        log_info "Applying config to ${NODE_IP}..."
+
+        # --insecure: used for the initial bootstrap apply
+        if ! talosctl apply-config \
+            --insecure \
+            --nodes "${NODE_IP}" \
+            --file "${OUTPUT_DIR}/controlplane-${NODE_IP}.yaml"; then
+            log_error "Failed to apply configuration to ${NODE_IP}"
+            exit 1
+        fi
+        log_success "Configuration applied to ${NODE_IP}"
+
+        # Short pause before moving on to the next node
+        sleep 2
+    done
+
+    log_success "Configurations applied to all nodes"
+}
+
+# Wait for nodes to be ready
+wait_for_nodes() {
+    log_info "Waiting for nodes to reboot and be ready..."
+
+    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
+
+    # Wait for each node to be accessible
+    for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+        log_info "Waiting for node ${NODE_IP} to be accessible..."
+
+        local max_attempts=60
+        local attempt=0
+
+        while [ $attempt -lt $max_attempts ]; do
+            if talosctl --nodes "${NODE_IP}" version &> /dev/null 2>&1; then
+                log_success "Node ${NODE_IP} is responding"
+                break
+            fi
+
+            attempt=$((attempt + 1))
+            sleep 5
+        done
+
+        if [ $attempt -eq $max_attempts ]; then
+            log_error "Node ${NODE_IP} did not become accessible in time"
+            exit 1
+        fi
+    done
+
+    # Wait for all nodes to be out of maintenance mode and services ready
+    log_info "Checking that all nodes are out of maintenance mode..."
+
+    for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+        local max_attempts=60
+        local attempt=0
+
+        while [ $attempt -lt $max_attempts ]; do
+            log_info "Checking services on ${NODE_IP} (attempt $((attempt + 1))/${max_attempts})..."
+
+            # Get service state - if this succeeds, node is configured
+            if talosctl --nodes "${NODE_IP}" get services 2>&1 | grep -q "apid"; then
+                log_success "Node ${NODE_IP} is out of maintenance mode"
+                break
+            fi
+
+            attempt=$((attempt + 1))
+            sleep 5
+        done
+
+        if [ $attempt -eq $max_attempts ]; then
+            log_error "Node ${NODE_IP} did not exit maintenance mode"
+            log_error "Try checking node console or running: talosctl --nodes ${NODE_IP} get services"
+            exit 1
+        fi
+    done
+
+    # Additional wait to ensure etcd service is ready for bootstrap
+    log_info "Waiting for etcd to be ready for bootstrap on ${CONTROL_PLANE_NODES[0]}..."
+    sleep 10
+
+    log_success "All nodes are ready for bootstrapping"
+}
+
+# Return 1 if etcd already runs on the first control plane node, else 0.
+check_etcd_status() {
+    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
+    local etcd_status
+
+    log_info "Checking if etcd is already bootstrapped..."
+    # Capture, then match: `cmd | grep -q` can fail via SIGPIPE under pipefail.
+    if etcd_status="$(talosctl --nodes "${CONTROL_PLANE_NODES[0]}" service etcd status 2>&1)" &&
+        grep -q "STATE.*Running" <<< "${etcd_status}"; then
+        log_warning "etcd is already running - cluster appears to be bootstrapped"
+        return 1
+    fi
+    return 0
+}
+
+# Bootstrap etcd on the first control plane node and wait for it to start.
+# Skips when etcd already runs; exits 1 if `talosctl bootstrap` fails.
+bootstrap_cluster() {
+    local -r first_node="${CONTROL_PLANE_NODES[0]}" max_attempts=30
+    local attempt=0 etcd_status
+
+    log_info "Bootstrapping etcd on first control plane node: ${first_node}"
+    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
+
+    if ! check_etcd_status; then
+        log_warning "Skipping bootstrap as cluster is already bootstrapped"
+        return 0
+    fi
+
+    # Advisory probe only: its failure is logged and never blocks bootstrap.
+    # NOTE(review): `get members` may list discovery members rather than etcd
+    # members - confirm the warning text matches what is being probed.
+    log_info "Verifying node ${first_node} is ready for bootstrap..."
+    if ! talosctl --nodes "${first_node}" get members &> /dev/null; then
+        log_warning "etcd members not yet initialized, proceeding with bootstrap..."
+    fi
+
+    log_info "Running bootstrap command..."
+    if ! talosctl bootstrap --nodes "${first_node}"; then
+        log_error "Failed to bootstrap etcd"
+        log_error "This may be because:"
+        log_error "  1. The node is still in maintenance mode (check with: talosctl --nodes ${first_node} get services)"
+        log_error "  2. The configuration was not properly applied"
+        log_error "  3. etcd is already bootstrapped"
+        exit 1
+    fi
+    log_success "Bootstrap command executed successfully"
+
+    log_info "Waiting for etcd to start..."
+    while (( attempt < max_attempts )); do
+        # Capture, then match: piping into `grep -q` under pipefail can fail
+        # spuriously when grep exits at the first match (SIGPIPE upstream).
+        if etcd_status="$(talosctl --nodes "${first_node}" service etcd status 2>&1)" &&
+            grep -q "STATE.*Running" <<< "${etcd_status}"; then
+            log_success "etcd is running"
+            break
+        fi
+        attempt=$((attempt + 1))
+        sleep 5
+    done
+
+    if (( attempt == max_attempts )); then
+        log_warning "etcd did not start in expected time, but continuing..."
+    fi
+
+    log_info "Waiting for Kubernetes to initialize..."
+    sleep 30
+}
+
+# Retrieve kubeconfig
+get_kubeconfig() {
+    log_info "Retrieving kubeconfig..."
+
+    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
+
+    local max_attempts=20
+    local attempt=0
+
+    while [ $attempt -lt $max_attempts ]; do
+        log_info "Attempting to retrieve kubeconfig (attempt $((attempt + 1))/${max_attempts})..."
+
+        if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "${OUTPUT_DIR}/kubeconfig" --force; then
+            log_success "Kubeconfig saved to ${OUTPUT_DIR}/kubeconfig"
+            break
+        fi
+
+        attempt=$((attempt + 1))
+        sleep 10
+    done
+
+    if [ $attempt -eq $max_attempts ]; then
+        log_error "Failed to retrieve kubeconfig"
+        exit 1
+    fi
+}
+
+# Verify cluster health
+verify_cluster() {
+    log_info "Verifying cluster health..."
+
+    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
+    export KUBECONFIG="${OUTPUT_DIR}/kubeconfig"
+
+    log_info "Checking Talos health..."
+    if talosctl health --wait-timeout 5m; then
+        log_success "Talos cluster is healthy"
+    else
+        log_warning "Talos health check reported issues"
+    fi
+
+    log_info "Checking Kubernetes nodes..."
+    kubectl get nodes -o wide
+
+    log_info "Checking system pods..."
+    kubectl get pods -A
+
+    log_success "Cluster verification complete"
+}
+
+# Print the closing report: cluster name, control plane nodes, generated
+# config file paths and a few follow-up commands.
+print_summary() {
+    local -r rule="=========================================="
+
+    printf '\n%s\n' "${rule}"
+    log_success "Talos Cluster Bootstrap Complete!"
+    printf '%s\n\n' "${rule}"
+
+    printf 'Cluster Name: %s\n' "${CLUSTER_NAME}"
+    printf '%s\n' "Control Plane Nodes:"
+    for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+        printf '  - %s\n' "${NODE_IP}"
+    done
+
+    printf '\n%s\n' "Configuration Files:"
+    printf '  - TALOSCONFIG: %s\n' "${OUTPUT_DIR}/.talosconfig"
+    printf '  - KUBECONFIG: %s\n\n' "${OUTPUT_DIR}/kubeconfig"
+
+    # "$(pwd)" is printed literally so the user's shell expands it later.
+    printf '%s\n' "To use the cluster, export these variables:"
+    printf '  export TALOSCONFIG="$(pwd)/%s/.talosconfig"\n' "${OUTPUT_DIR}"
+    printf '  export KUBECONFIG="$(pwd)/%s/kubeconfig"\n\n' "${OUTPUT_DIR}"
+    printf '%s\n\n' "Or run: nix-shell (which sets these automatically)"
+
+    printf '%s\n' "Useful commands:" \
+        "  talosctl health" "  kubectl get nodes" \
+        "  kubectl get pods -A" "${rule}"
+}
+
+# Main execution
+main() {
+    log_info "Starting Talos Cluster Bootstrap"
+    log_info "Cluster: ${CLUSTER_NAME}"
+    log_info "Nodes: ${CONTROL_PLANE_NODES[*]}"
+    echo ""
+
+    check_prerequisites
+    generate_configs
+    apply_configs
+    wait_for_nodes
+    bootstrap_cluster
+    get_kubeconfig
+    verify_cluster
+    print_summary
+}
+
+# Run main function
+main
diff --git a/check-cluster-status.sh b/check-cluster-status.sh
new file mode 100755
index 0000000..f0dff10
--- /dev/null
+++ b/check-cluster-status.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Configuration
+CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
+TALOSCONFIG="${TALOSCONFIG:-testing1/.talosconfig}"
+
+# Colors for output
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+
+# Logging helpers: each prints a colored level tag followed by the message
+# given as $1 to stdout, expanding backslash escapes like `echo -e`.
+
+log_info() {
+    printf '%b\n' "${BLUE}[INFO]${NC} $1"
+}
+log_success() {
+    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
+}
+log_warning() {
+    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
+}
+log_error() {
+    printf '%b\n' "${RED}[ERROR]${NC} $1"
+}
+
+# Abort early when the talosctl client config file is missing
+if [[ ! -f "${TALOSCONFIG}" ]]; then
+    log_error "TALOSCONFIG not found at: $TALOSCONFIG"
+    log_info "Have you run ./bootstrap-cluster.sh yet?"
+    exit 1
+fi
+
+export TALOSCONFIG
+
+printf '%s\n' "==========================================" \
+    "Talos Cluster Status Check" \
+    "==========================================" \
+    ""
+
+# Check each node
+for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+    echo "==================== Node: $NODE_IP ===================="
+
+    # Check if node is accessible
+    log_info "Checking if node is accessible..."
+    if talosctl --nodes "$NODE_IP" version &> /dev/null; then
+        log_success "Node is accessible"
+    else
+        log_error "Node is NOT accessible"
+        echo ""
+        continue
+    fi
+
+    # Check version
+    echo ""
+    log_info "Talos version:"
+    talosctl --nodes "$NODE_IP" version --short 2>&1 || log_error "Could not get version"
+
+    # Check if in maintenance mode
+    echo ""
+    log_info "Checking if node is in maintenance mode..."
+    if talosctl --nodes "$NODE_IP" get services &> /dev/null; then
+        log_success "Node is OUT of maintenance mode (configured)"
+    else
+        log_error "Node is IN MAINTENANCE MODE - configuration not applied!"
+        log_info "To apply config, run:"
+        log_info "  talosctl apply-config --insecure --nodes $NODE_IP --file testing1/controlplane-${NODE_IP}.yaml"
+    fi
+
+    # Check services
+    echo ""
+    log_info "Service status:"
+    talosctl --nodes "$NODE_IP" services 2>&1 | head -20 || log_error "Could not get services"
+
+    # Check etcd status
+    echo ""
+    log_info "etcd status:"
+    if talosctl --nodes "$NODE_IP" service etcd status 2>&1 | grep -q "STATE.*Running"; then
+        log_success "etcd is RUNNING"
+        talosctl --nodes "$NODE_IP" service etcd status 2>&1 | grep "STATE"
+    else
+        log_warning "etcd is NOT running"
+        talosctl --nodes "$NODE_IP" service etcd status 2>&1 | grep "STATE" || log_info "etcd not initialized yet"
+    fi
+
+    # Check if etcd members exist
+    echo ""
+    log_info "etcd members:"
+    if talosctl --nodes "$NODE_IP" get members 2>&1 | grep -v "^NODE" | grep -v "not found"; then
+        log_success "etcd members found"
+    else
+        log_warning "No etcd members - cluster needs bootstrap"
+    fi
+
+    echo ""
+done
+
+# Overall cluster status
+echo "==================== Overall Cluster Status ===================="
+
+# Check if any node has etcd running
+ETCD_RUNNING=false
+for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+    if talosctl --nodes "$NODE_IP" service etcd status 2>&1 | grep -q "STATE.*Running"; then
+        ETCD_RUNNING=true
+        break
+    fi
+done
+
+echo ""
+if $ETCD_RUNNING; then
+    log_success "Cluster appears to be bootstrapped (etcd running)"
+
+    # Try to get kubeconfig
+    echo ""
+    log_info "Attempting to retrieve kubeconfig..."
+    if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" ./kubeconfig-test --force 2>&1; then
+        log_success "Kubeconfig retrieved successfully"
+
+        log_info "Kubernetes node status:"
+        KUBECONFIG=./kubeconfig-test kubectl get nodes 2>&1 || log_error "Could not connect to Kubernetes"
+
+        rm -f ./kubeconfig-test
+    else
+        log_warning "Could not retrieve kubeconfig"
+    fi
+else
+    log_warning "Cluster is NOT bootstrapped yet"
+    log_info ""
+    log_info "Next steps:"
+    log_info "1. Ensure all nodes are out of maintenance mode (see checks above)"
+    log_info "2. If nodes are in maintenance mode, apply configs:"
+    for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
+        log_info "   talosctl apply-config --insecure --nodes $NODE_IP --file testing1/controlplane-${NODE_IP}.yaml"
+    done
+    log_info "3. Wait for nodes to reboot and become ready (~2-5 minutes)"
+    log_info "4. Bootstrap the cluster:"
+    log_info "   talosctl bootstrap --nodes ${CONTROL_PLANE_NODES[0]}"
+fi
+
+echo ""
+echo "=========================================="