#!/usr/bin/env bash
#
# Bootstraps a Talos Linux control plane:
#   1. generates secrets and per-node machine configs,
#   2. applies the configs to the nodes,
#   3. waits for the nodes to leave maintenance mode,
#   4. bootstraps etcd on the first control plane node,
#   5. fetches the kubeconfig and runs health checks.
#
# Requires: talosctl, kubectl (both on PATH).
#
# NOTE(review): the copy of this script that was reviewed had lost a span of
# text (the inline config patch, the end of generate_configs, all of
# apply_configs and the head of wait_for_nodes). Those parts are minimal
# reconstructions and are individually marked "NOTE(review)" below; compare
# them against version control before relying on this file.

set -euo pipefail

# Configuration
readonly CLUSTER_NAME="talos-cluster"
readonly -a CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
readonly CLUSTER_ENDPOINT="https://10.0.1.3:6443"
readonly KUBERNETES_VERSION="1.33.0"
readonly OUTPUT_DIR="testing1"

# Colors for output
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color

# Logging helpers. Informational output goes to stdout; warnings and errors
# go to stderr so they survive stdout redirection.
log_info() {
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$*"
}

log_success() {
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$*"
}

log_warning() {
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$*" >&2
}

log_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$*" >&2
}

#######################################
# Runs talosctl and checks its combined stdout/stderr against a regex.
# The output is captured first instead of piping into `grep -q`: under
# `set -o pipefail`, grep exiting on the first match can make the writer
# die of SIGPIPE, which would turn a real match into a failure.
# Arguments: $1 - grep pattern; remaining args - talosctl arguments
# Returns:   0 if talosctl exited 0 and the output matches, non-zero otherwise
#######################################
talos_output_matches() {
  local -r pattern=$1
  shift
  local output
  output=$(talosctl "$@" 2>&1) || return 1
  grep -q -- "${pattern}" <<<"${output}"
}

# Succeeds when the etcd service on the first control plane node reports
# a Running state.
etcd_is_running() {
  talos_output_matches "STATE.*Running" \
    --nodes "${CONTROL_PLANE_NODES[0]}" service etcd status
}

# Check prerequisites: exits 1 if talosctl or kubectl is not on PATH.
check_prerequisites() {
  log_info "Checking prerequisites..."

  if ! command -v talosctl &> /dev/null; then
    log_error "talosctl not found. Please run 'nix-shell' first."
    exit 1
  fi

  if ! command -v kubectl &> /dev/null; then
    log_error "kubectl not found. Please run 'nix-shell' first."
    exit 1
  fi

  log_success "All prerequisites met"
}

#######################################
# Generate Talos secrets and configurations.
# Globals: CLUSTER_NAME, CLUSTER_ENDPOINT, KUBERNETES_VERSION, OUTPUT_DIR,
#          CONTROL_PLANE_NODES (all read)
# Outputs: ${OUTPUT_DIR}/secrets.yaml, one controlplane-<ip>.yaml per node,
#          and ${OUTPUT_DIR}/.talosconfig
#######################################
generate_configs() {
  local -r secrets_file="${OUTPUT_DIR}/secrets.yaml"
  local -r talosconfig_file="${OUTPUT_DIR}/.talosconfig"
  local i node_ip

  log_info "Generating Talos secrets for cluster: ${CLUSTER_NAME}"

  # Create output directory if it doesn't exist
  mkdir -p "${OUTPUT_DIR}"

  # Reuse an existing secrets bundle. Regenerating it would produce a new
  # cluster PKI, so the talosconfig written below could no longer
  # authenticate to nodes that were configured on an earlier run.
  if [[ -f "${secrets_file}" ]]; then
    log_info "Reusing existing secrets: ${secrets_file}"
  else
    talosctl gen secrets --force -o "${secrets_file}"
    log_success "Secrets generated"
  fi

  # Generate configs for all control plane nodes
  log_info "Generating machine configurations..."

  for i in "${!CONTROL_PLANE_NODES[@]}"; do
    node_ip="${CONTROL_PLANE_NODES[$i]}"
    log_info "Generating config for control plane node: ${node_ip}"

    # NOTE(review): the original inline patch body was lost. The heredoc
    # below is a deliberately harmless placeholder (it only lists the node's
    # own IP as a certificate SAN). Restore the real per-node patch here.
    talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
      --with-secrets "${secrets_file}" \
      --kubernetes-version="${KUBERNETES_VERSION}" \
      --output-types controlplane \
      --output "${OUTPUT_DIR}/controlplane-${node_ip}.yaml" \
      --force \
      --config-patch @<(
        cat <<EOF
machine:
  certSANs:
    - ${node_ip}
EOF
      )
  done

  # NOTE(review): reconstructed. Every later step exports
  # TALOSCONFIG=${OUTPUT_DIR}/.talosconfig, and verify_cluster calls
  # `talosctl health` without --nodes, so a talosconfig with endpoints and a
  # default node must be produced here. Confirm against the original.
  log_info "Generating talosconfig..."
  talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
    --with-secrets "${secrets_file}" \
    --kubernetes-version="${KUBERNETES_VERSION}" \
    --output-types talosconfig \
    --output "${talosconfig_file}" \
    --force
  talosctl --talosconfig "${talosconfig_file}" \
    config endpoint "${CONTROL_PLANE_NODES[@]}"
  talosctl --talosconfig "${talosconfig_file}" \
    config node "${CONTROL_PLANE_NODES[0]}"

  log_success "Machine configurations generated"
}

#######################################
# Apply the generated machine config to every control plane node.
# NOTE(review): this whole function was lost and is reconstructed. The
# surviving wait_for_nodes messages refer to nodes leaving maintenance mode,
# so the configs are assumed to be applied with --insecure. Confirm.
#######################################
apply_configs() {
  local node_ip

  log_info "Applying machine configurations..."

  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    log_info "Applying config to node: ${node_ip}"
    talosctl apply-config --insecure \
      --nodes "${node_ip}" \
      --file "${OUTPUT_DIR}/controlplane-${node_ip}.yaml"
    log_success "Config applied to ${node_ip}"
  done
}

#######################################
# Block until every node answers the Talos API and has left maintenance
# mode. Exits 1 if a node does not become ready in time.
#######################################
wait_for_nodes() {
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  local node_ip max_attempts attempt

  # NOTE(review): the head of this first loop was lost. The probe command
  # (`talosctl version`), the attempt budget (60) and the log line below are
  # reconstructed; only the `> /dev/null 2>&1` redirection, the success
  # message and the timeout handling survived.
  log_info "Waiting for nodes to become accessible..."
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    max_attempts=60
    attempt=0

    while ((attempt < max_attempts)); do
      if talosctl --nodes "${node_ip}" version > /dev/null 2>&1; then
        log_success "Node ${node_ip} is responding"
        break
      fi
      attempt=$((attempt + 1))
      sleep 5
    done

    if ((attempt == max_attempts)); then
      log_error "Node ${node_ip} did not become accessible in time"
      exit 1
    fi
  done

  # Wait for all nodes to be out of maintenance mode and services ready
  log_info "Checking that all nodes are out of maintenance mode..."
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    max_attempts=60
    attempt=0

    while ((attempt < max_attempts)); do
      log_info "Checking services on ${node_ip} (attempt $((attempt + 1))/${max_attempts})..."

      # Get service state - if this succeeds, node is configured
      if talos_output_matches "apid" --nodes "${node_ip}" get services; then
        log_success "Node ${node_ip} is out of maintenance mode"
        break
      fi
      attempt=$((attempt + 1))
      sleep 5
    done

    if ((attempt == max_attempts)); then
      log_error "Node ${node_ip} did not exit maintenance mode"
      log_error "Try checking node console or running: talosctl --nodes ${node_ip} get services"
      exit 1
    fi
  done

  # Additional wait to ensure etcd service is ready for bootstrap
  log_info "Waiting for etcd to be ready for bootstrap on ${CONTROL_PLANE_NODES[0]}..."
  sleep 10

  log_success "All nodes are ready for bootstrapping"
}

#######################################
# Check if etcd is already bootstrapped.
# Returns: 1 if etcd is already running (cluster bootstrapped), 0 otherwise.
#######################################
check_etcd_status() {
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  log_info "Checking if etcd is already bootstrapped..."

  # Check if etcd service is running
  if etcd_is_running; then
    log_warning "etcd is already running - cluster appears to be bootstrapped"
    return 1
  fi

  return 0
}

#######################################
# Bootstrap etcd on the first control plane node, then wait for it to start.
# Skips the bootstrap when etcd is already running. Exits 1 if the bootstrap
# command fails.
#######################################
bootstrap_cluster() {
  log_info "Bootstrapping etcd on first control plane node: ${CONTROL_PLANE_NODES[0]}"

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Check if already bootstrapped
  if ! check_etcd_status; then
    log_warning "Skipping bootstrap as cluster is already bootstrapped"
    return 0
  fi

  # Verify the node is ready for bootstrap (advisory only: a failure here
  # is expected before the first bootstrap and does not stop the run).
  log_info "Verifying node ${CONTROL_PLANE_NODES[0]} is ready for bootstrap..."
  if ! talosctl --nodes "${CONTROL_PLANE_NODES[0]}" get members &> /dev/null; then
    log_warning "etcd members not yet initialized, proceeding with bootstrap..."
  fi

  # Perform bootstrap
  log_info "Running bootstrap command..."
  if talosctl bootstrap --nodes "${CONTROL_PLANE_NODES[0]}"; then
    log_success "Bootstrap command executed successfully"
  else
    log_error "Failed to bootstrap etcd"
    log_error "This may be because:"
    log_error "  1. The node is still in maintenance mode (check with: talosctl --nodes ${CONTROL_PLANE_NODES[0]} get services)"
    log_error "  2. The configuration was not properly applied"
    log_error "  3. etcd is already bootstrapped"
    exit 1
  fi

  # Wait for etcd to come up
  log_info "Waiting for etcd to start..."
  local -r max_attempts=30
  local attempt=0

  while ((attempt < max_attempts)); do
    if etcd_is_running; then
      log_success "etcd is running"
      break
    fi
    attempt=$((attempt + 1))
    sleep 5
  done

  # Best effort: a slow etcd start is reported but does not abort the run.
  if ((attempt == max_attempts)); then
    log_warning "etcd did not start in expected time, but continuing..."
  fi

  log_info "Waiting for Kubernetes to initialize..."
  sleep 30
}

#######################################
# Retrieve kubeconfig from the first control plane node, retrying until the
# API is able to serve it. Exits 1 after 20 failed attempts.
#######################################
get_kubeconfig() {
  log_info "Retrieving kubeconfig..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  local -r max_attempts=20
  local attempt=0

  while ((attempt < max_attempts)); do
    log_info "Attempting to retrieve kubeconfig (attempt $((attempt + 1))/${max_attempts})..."

    if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "${OUTPUT_DIR}/kubeconfig" --force; then
      log_success "Kubeconfig saved to ${OUTPUT_DIR}/kubeconfig"
      break
    fi

    attempt=$((attempt + 1))
    sleep 10
  done

  if ((attempt == max_attempts)); then
    log_error "Failed to retrieve kubeconfig"
    exit 1
  fi
}

#######################################
# Verify cluster health. A failing Talos health check only produces a
# warning; a failing kubectl call aborts the script (set -e).
#######################################
verify_cluster() {
  log_info "Verifying cluster health..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
  export KUBECONFIG="${OUTPUT_DIR}/kubeconfig"

  log_info "Checking Talos health..."
  if talosctl health --wait-timeout 5m; then
    log_success "Talos cluster is healthy"
  else
    log_warning "Talos health check reported issues"
  fi

  log_info "Checking Kubernetes nodes..."
  kubectl get nodes -o wide

  log_info "Checking system pods..."
  kubectl get pods -A

  log_success "Cluster verification complete"
}

# Print summary of the cluster and how to start using it.
print_summary() {
  local node_ip

  echo ""
  echo "=========================================="
  log_success "Talos Cluster Bootstrap Complete!"
  echo "=========================================="
  echo ""
  echo "Cluster Name: ${CLUSTER_NAME}"
  echo "Control Plane Nodes:"
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    echo "  - ${node_ip}"
  done
  echo ""
  echo "Configuration Files:"
  echo "  - TALOSCONFIG: ${OUTPUT_DIR}/.talosconfig"
  echo "  - KUBECONFIG: ${OUTPUT_DIR}/kubeconfig"
  echo ""
  echo "To use the cluster, export these variables:"
  echo "  export TALOSCONFIG=\"\$(pwd)/${OUTPUT_DIR}/.talosconfig\""
  echo "  export KUBECONFIG=\"\$(pwd)/${OUTPUT_DIR}/kubeconfig\""
  echo ""
  echo "Or run: nix-shell (which sets these automatically)"
  echo ""
  echo "Useful commands:"
  echo "  talosctl health"
  echo "  kubectl get nodes"
  echo "  kubectl get pods -A"
  echo "=========================================="
}

# Main execution: runs every bootstrap phase in order.
main() {
  log_info "Starting Talos Cluster Bootstrap"
  log_info "Cluster: ${CLUSTER_NAME}"
  log_info "Nodes: ${CONTROL_PLANE_NODES[*]}"
  echo ""

  check_prerequisites
  generate_configs
  apply_configs
  wait_for_nodes
  bootstrap_cluster
  get_kubeconfig
  verify_cluster
  print_summary
}

# Run main function
main "$@"