#!/usr/bin/env bash
#
# Talos/bootstrap-cluster.sh

set -euo pipefail

# Configuration
# Kubernetes release passed to `talosctl gen config --kubernetes-version`.
KUBERNETES_VERSION='1.33.0'
# Directory receiving secrets, machine configs, talosconfig and kubeconfig.
OUTPUT_DIR='testing1'
# Cluster name handed to `talosctl gen config` and shown in the summary.
CLUSTER_NAME='talos-cluster'
# Control plane node IPs; the first entry is the node that gets bootstrapped.
CONTROL_PLANE_NODES=(
    '10.0.1.3'
    '10.0.1.4'
    '10.0.1.5'
)
# Endpoint passed to `talosctl gen config` and written into the config patch.
CLUSTER_ENDPOINT='https://10.0.1.3:6443'

# Colors for output
# The values are stored as literal backslash sequences; the log_* helpers
# turn them into real escape codes through `echo -e`.
NC="\033[0m"        # No Color (reset)
RED="\033[0;31m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
GREEN="\033[0;32m"

# Print an informational message on stdout, prefixed with a blue "[INFO]" tag.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_info() {
    local prefix="${BLUE}[INFO]${NC}"
    echo -e "${prefix} $1"
}

# Print a success message on stdout, prefixed with a green "[SUCCESS]" tag.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_success() {
    local message=$1
    echo -e "${GREEN}[SUCCESS]${NC} ${message}"
}

# Print a warning, prefixed with a yellow "[WARNING]" tag.
# Warnings are diagnostics, so they go to stderr and never mix into stdout
# that a caller may capture or pipe.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}

# Print an error message, prefixed with a red "[ERROR]" tag.
# Errors go to stderr so they stay visible when stdout is redirected and do
# not pollute output that a caller captures.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}

# Check prerequisites
#
# Confirms that every CLI this script drives can be resolved by the shell.
# Exits with status 1 at the first missing tool; tools are probed in the
# order talosctl, kubectl.
check_prerequisites() {
    local tool

    log_info "Checking prerequisites..."

    for tool in talosctl kubectl; do
        # Happy path first: a resolvable tool needs no further handling.
        command -v "${tool}" &> /dev/null && continue

        log_error "${tool} not found. Please run 'nix-shell' first."
        exit 1
    done

    log_success "All prerequisites met"
}

# Generate Talos secrets and configurations
#
# Creates ${OUTPUT_DIR}, writes a fresh secrets bundle (overwriting any
# existing one because of --force), renders one control plane machine config
# per entry of CONTROL_PLANE_NODES, renders the talosconfig, and finally
# points that talosconfig at the control plane nodes.
#
# Globals:  CLUSTER_NAME, CLUSTER_ENDPOINT, KUBERNETES_VERSION, OUTPUT_DIR,
#           CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Outputs:  files under ${OUTPUT_DIR}/ and progress messages via log_*
generate_configs() {
    local idx node_ip cert_sans

    log_info "Generating Talos secrets for cluster: ${CLUSTER_NAME}"

    # Create output directory if it doesn't exist
    mkdir -p "${OUTPUT_DIR}"

    # Generate secrets
    talosctl gen secrets --force -o "${OUTPUT_DIR}/secrets.yaml"
    log_success "Secrets generated"

    # Generate configs for every control plane node
    log_info "Generating machine configurations..."

    # Build the certSANs YAML list from CONTROL_PLANE_NODES so the SANs always
    # match the configured nodes instead of a hard-coded copy of their IPs.
    # Command substitution drops the trailing newline, so the block can be
    # spliced into the patch below as-is.
    cert_sans=$(printf '    - %s\n' "${CONTROL_PLANE_NODES[@]}")

    for idx in "${!CONTROL_PLANE_NODES[@]}"; do
        node_ip="${CONTROL_PLANE_NODES[$idx]}"
        log_info "Generating config for control plane node: ${node_ip}"

        # The per-node patch is fed to talosctl through process substitution;
        # only the hostname (cp-<index>) differs between nodes.
        talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
            --with-secrets "${OUTPUT_DIR}/secrets.yaml" \
            --kubernetes-version="${KUBERNETES_VERSION}" \
            --output-types controlplane \
            --output "${OUTPUT_DIR}/controlplane-${node_ip}.yaml" \
            --force \
            --config-patch @<(cat <<EOF
machine:
  network:
    hostname: cp-${idx}
  certSANs:
${cert_sans}
cluster:
  allowSchedulingOnControlPlanes: true
  controlPlane:
    endpoint: ${CLUSTER_ENDPOINT}
EOF
)
    done

    # Generate talosconfig
    talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
        --with-secrets "${OUTPUT_DIR}/secrets.yaml" \
        --output-types talosconfig \
        --force \
        --output "${OUTPUT_DIR}/.talosconfig"

    # Configure talosctl to use the new config
    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

    # Add all endpoints to talosconfig; the first node is the default target
    talosctl config endpoint "${CONTROL_PLANE_NODES[@]}"
    talosctl config node "${CONTROL_PLANE_NODES[0]}"

    log_success "All configurations generated in ${OUTPUT_DIR}/"
}

# Apply configurations to nodes
#
# Pushes ${OUTPUT_DIR}/controlplane-<ip>.yaml to every control plane node,
# one node at a time. Exits with status 1 as soon as one node rejects its
# config, so later nodes are not touched after a failure.
apply_configs() {
    local machine_config

    log_info "Applying configurations to nodes..."

    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

    for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
        machine_config="${OUTPUT_DIR}/controlplane-${NODE_IP}.yaml"
        log_info "Applying config to ${NODE_IP}..."

        # --insecure is used for the initial bootstrap of the node.
        if ! talosctl apply-config \
            --insecure \
            --nodes "${NODE_IP}" \
            --file "${machine_config}"; then
            log_error "Failed to apply configuration to ${NODE_IP}"
            exit 1
        fi
        log_success "Configuration applied to ${NODE_IP}"

        # Short breather before moving on to the next node
        sleep 2
    done

    log_success "Configurations applied to all nodes"
}

# Wait for nodes to be ready
#
# Phase 1: for each control plane node, poll `talosctl version` until it
#          succeeds.
# Phase 2: for each node, poll `talosctl get services` until the output
#          mentions "apid", which this script treats as proof that the node
#          has left maintenance mode.
# Both phases allow 60 attempts per node, 5 seconds apart, and exit with
# status 1 when a node exhausts them. A fixed 10 second pause follows.
#
# Globals:  OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
wait_for_nodes() {
    local -r max_attempts=60
    local node_ip attempt services

    log_info "Waiting for nodes to reboot and be ready..."

    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

    # Wait for each node to be accessible
    for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
        log_info "Waiting for node ${node_ip} to be accessible..."

        attempt=0
        while (( attempt < max_attempts )); do
            if talosctl --nodes "${node_ip}" version &> /dev/null; then
                log_success "Node ${node_ip} is responding"
                break
            fi

            attempt=$((attempt + 1))
            sleep 5
        done

        # The counter only reaches max_attempts when no probe succeeded.
        if (( attempt == max_attempts )); then
            log_error "Node ${node_ip} did not become accessible in time"
            exit 1
        fi
    done

    # Wait for all nodes to be out of maintenance mode and services ready
    log_info "Checking that all nodes are out of maintenance mode..."

    for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
        attempt=0
        while (( attempt < max_attempts )); do
            log_info "Checking services on ${node_ip} (attempt $((attempt + 1))/${max_attempts})..."

            # Capture the output before searching it. Piping straight into
            # `grep -q` lets grep exit on the first match, which can kill the
            # producer with SIGPIPE and, under `set -o pipefail`, turn a
            # successful match into a failed pipeline.
            if services=$(talosctl --nodes "${node_ip}" get services 2>&1) \
                && grep -q "apid" <<< "${services}"; then
                log_success "Node ${node_ip} is out of maintenance mode"
                break
            fi

            attempt=$((attempt + 1))
            sleep 5
        done

        if (( attempt == max_attempts )); then
            log_error "Node ${node_ip} did not exit maintenance mode"
            log_error "Try checking node console or running: talosctl --nodes ${node_ip} get services"
            exit 1
        fi
    done

    # Fixed grace period before bootstrap; nothing is probed here.
    log_info "Waiting for etcd to be ready for bootstrap on ${CONTROL_PLANE_NODES[0]}..."
    sleep 10

    log_success "All nodes are ready for bootstrapping"
}

# Check if etcd is already bootstrapped
#
# Looks at the etcd service status on the first control plane node.
# Returns: 0 when no "STATE ... Running" line is reported (bootstrap needed),
#          1 when etcd is reported as running (already bootstrapped).
check_etcd_status() {
    local first_node="${CONTROL_PLANE_NODES[0]}"

    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

    log_info "Checking if etcd is already bootstrapped..."

    # Nothing reported as running (or the query itself failed): the caller
    # may go ahead and bootstrap.
    if ! talosctl --nodes "${first_node}" service etcd status 2>&1 | grep -q "STATE.*Running"; then
        return 0
    fi

    log_warning "etcd is already running - cluster appears to be bootstrapped"
    return 1
}

# Bootstrap etcd on the first control plane node
#
# Skips the bootstrap when check_etcd_status reports etcd as running.
# Otherwise runs `talosctl bootstrap` against the first control plane node
# (exiting with status 1 if that command fails), polls the etcd service up
# to 30 times at 5 second intervals, and finally pauses 30 seconds.
bootstrap_cluster() {
    local first_cp="${CONTROL_PLANE_NODES[0]}"
    local etcd_running=false
    local remaining

    log_info "Bootstrapping etcd on first control plane node: ${first_cp}"

    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

    # A non-zero status from check_etcd_status means "already bootstrapped".
    if ! check_etcd_status; then
        log_warning "Skipping bootstrap as cluster is already bootstrapped"
        return 0
    fi

    # Best-effort probe: a failing `get members` only produces a warning and
    # the bootstrap proceeds either way.
    log_info "Verifying node ${first_cp} is ready for bootstrap..."
    if ! talosctl --nodes "${first_cp}" get members &> /dev/null; then
        log_warning "etcd members not yet initialized, proceeding with bootstrap..."
    fi

    log_info "Running bootstrap command..."
    if ! talosctl bootstrap --nodes "${first_cp}"; then
        log_error "Failed to bootstrap etcd"
        log_error "This may be because:"
        log_error "  1. The node is still in maintenance mode (check with: talosctl --nodes ${first_cp} get services)"
        log_error "  2. The configuration was not properly applied"
        log_error "  3. etcd is already bootstrapped"
        exit 1
    fi
    log_success "Bootstrap command executed successfully"

    # Poll the etcd service; each failed probe is followed by a 5s pause.
    log_info "Waiting for etcd to start..."
    for (( remaining = 30; remaining > 0; remaining-- )); do
        if talosctl --nodes "${first_cp}" service etcd status 2>&1 | grep -q "STATE.*Running"; then
            log_success "etcd is running"
            etcd_running=true
            break
        fi

        sleep 5
    done

    # Running out of attempts is deliberately not fatal.
    if [[ "${etcd_running}" != true ]]; then
        log_warning "etcd did not start in expected time, but continuing..."
    fi

    log_info "Waiting for Kubernetes to initialize..."
    sleep 30
}

# Retrieve kubeconfig
#
# Asks the first control plane node for the kubeconfig and stores it at
# ${OUTPUT_DIR}/kubeconfig. Tries up to 20 times with a 10 second pause
# after every failed attempt; exits with status 1 when all attempts fail.
get_kubeconfig() {
    local -r retries=20
    local kubeconfig_path="${OUTPUT_DIR}/kubeconfig"
    local try

    log_info "Retrieving kubeconfig..."

    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

    for (( try = 1; try <= retries; try++ )); do
        log_info "Attempting to retrieve kubeconfig (attempt ${try}/${retries})..."

        if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "${kubeconfig_path}" --force; then
            log_success "Kubeconfig saved to ${kubeconfig_path}"
            return 0
        fi

        sleep 10
    done

    # Only reached when every attempt failed.
    log_error "Failed to retrieve kubeconfig"
    exit 1
}

# Verify cluster health
#
# Runs `talosctl health` with a 5 minute wait timeout (a failure there is
# reported as a warning only), then lists Kubernetes nodes and all pods.
# Exports TALOSCONFIG and KUBECONFIG pointing into ${OUTPUT_DIR}.
verify_cluster() {
    log_info "Verifying cluster health..."

    export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig" \
        KUBECONFIG="${OUTPUT_DIR}/kubeconfig"

    log_info "Checking Talos health..."
    if ! talosctl health --wait-timeout 5m; then
        # Health problems are surfaced but do not stop the verification.
        log_warning "Talos health check reported issues"
    else
        log_success "Talos cluster is healthy"
    fi

    log_info "Checking Kubernetes nodes..."
    kubectl get nodes -o wide

    log_info "Checking system pods..."
    kubectl get pods -A

    log_success "Cluster verification complete"
}

# Print summary
#
# Writes the closing report to stdout: cluster name, control plane nodes,
# where the generated talosconfig/kubeconfig live, and a few follow-up
# commands. The banner headline goes through log_success.
print_summary() {
    local rule="=========================================="

    printf '\n%s\n' "${rule}"
    log_success "Talos Cluster Bootstrap Complete!"
    printf '%s\n' "${rule}" "" "Cluster Name: ${CLUSTER_NAME}" "Control Plane Nodes:"

    for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
        printf '  - %s\n' "${NODE_IP}"
    done

    # \$(pwd) is escaped on purpose: the reader should see the literal
    # command substitution, not this shell's working directory.
    cat <<EOF

Configuration Files:
  - TALOSCONFIG: ${OUTPUT_DIR}/.talosconfig
  - KUBECONFIG: ${OUTPUT_DIR}/kubeconfig

To use the cluster, export these variables:
  export TALOSCONFIG="\$(pwd)/${OUTPUT_DIR}/.talosconfig"
  export KUBECONFIG="\$(pwd)/${OUTPUT_DIR}/kubeconfig"

Or run: nix-shell (which sets these automatically)

Useful commands:
  talosctl health
  kubectl get nodes
  kubectl get pods -A
${rule}
EOF
}

# Main execution
#
# Announces the run, then executes every bootstrap stage in a fixed order:
# prerequisites, config generation, config application, node readiness,
# etcd bootstrap, kubeconfig retrieval, verification, summary.
main() {
    local -a stages=(
        check_prerequisites
        generate_configs
        apply_configs
        wait_for_nodes
        bootstrap_cluster
        get_kubeconfig
        verify_cluster
        print_summary
    )
    local stage

    log_info "Starting Talos Cluster Bootstrap"
    log_info "Cluster: ${CLUSTER_NAME}"
    log_info "Nodes: ${CONTROL_PLANE_NODES[*]}"
    echo ""

    for stage in "${stages[@]}"; do
        "${stage}"
    done
}

# Entry point: run main, handing over the script's command-line arguments
# (main does not read them at present).
main "$@"