Talos/bootstrap-cluster.sh
0xWheatyz 6c292da5f1 feat(scripts): add cluster bootstrap and status scripts
Add automated scripts for Talos cluster management:

bootstrap-cluster.sh:
- Automated cluster bootstrap from scratch
- Generates Talos secrets and machine configs
- Applies configs to all nodes (10.0.1.3-5)
- Bootstraps etcd and retrieves kubeconfig
- Verifies cluster health

check-cluster-status.sh:
- Comprehensive cluster health diagnostics
- Checks Talos services, etcd, and Kubernetes components
- Displays node status and running pods
- Useful for troubleshooting bootstrap issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-04 01:53:05 +00:00

368 lines
10 KiB
Bash
Executable File

#!/usr/bin/env bash
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ---------------------------------------------------------------------------
# Cluster settings
# ---------------------------------------------------------------------------
# Name passed to `talosctl gen config`.
CLUSTER_NAME="talos-cluster"
# Kubernetes version requested when generating machine configs.
KUBERNETES_VERSION="1.33.0"
# Kubernetes API endpoint written into the generated configs.
CLUSTER_ENDPOINT="https://10.0.1.3:6443"
# IPs of the control plane nodes; the first entry is the bootstrap node.
CONTROL_PLANE_NODES=(
  "10.0.1.3"
  "10.0.1.4"
  "10.0.1.5"
)
# Directory (relative to the current working directory) that receives the
# secrets, machine configs, talosconfig and kubeconfig.
OUTPUT_DIR="testing1"

# ---------------------------------------------------------------------------
# ANSI colour sequences used by the log_* helpers. They are stored as literal
# backslash sequences and expanded at print time.
# ---------------------------------------------------------------------------
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
#######################################
# Print an informational message to stdout, prefixed with a blue "[INFO]" tag.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text (backslash escapes in it are expanded)
#######################################
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
#######################################
# Print a success message to stdout, prefixed with a green "[SUCCESS]" tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text (backslash escapes in it are expanded)
#######################################
log_success() {
  local line="${GREEN}[SUCCESS]${NC} $1"
  printf '%b\n' "${line}"
}
#######################################
# Print a warning message to stdout, prefixed with a yellow "[WARNING]" tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text (backslash escapes in it are expanded)
#######################################
log_warning() {
  local tag="${YELLOW}[WARNING]${NC}"
  printf '%b\n' "${tag} $1"
}
#######################################
# Print an error message, prefixed with a red "[ERROR]" tag.
# The message is written to stderr so that error diagnostics stay separate
# from regular output when stdout is piped or captured.
# Globals:   RED, NC (read)
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Outputs:   one line on stderr
#######################################
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
#######################################
# Verify that the command-line tools this script drives are available.
# Checks talosctl first, then kubectl; on the first one that cannot be
# resolved it logs an error and terminates the script with status 1.
# Arguments: none
#######################################
check_prerequisites() {
  log_info "Checking prerequisites..."

  local tool
  for tool in talosctl kubectl; do
    if ! command -v "${tool}" > /dev/null 2>&1; then
      log_error "${tool} not found. Please run 'nix-shell' first."
      exit 1
    fi
  done

  log_success "All prerequisites met"
}
#######################################
# Generate Talos secrets, one control plane machine config per node, and the
# talosconfig client file, all inside OUTPUT_DIR.
#
# Each node gets its own config file (controlplane-<ip>.yaml) patched with a
# per-node hostname (cp-<index>). The certificate SAN list is derived from
# CONTROL_PLANE_NODES so it cannot drift from the configured node list.
#
# Globals:
#   CLUSTER_NAME, CLUSTER_ENDPOINT, KUBERNETES_VERSION, OUTPUT_DIR,
#   CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Arguments: none
# Returns:   0 on success; under `set -e` any failing talosctl/mkdir call
#            aborts the script.
#######################################
generate_configs() {
  log_info "Generating Talos secrets for cluster: ${CLUSTER_NAME}"

  # Create output directory if it doesn't exist
  mkdir -p "${OUTPUT_DIR}"

  # Generate secrets
  talosctl gen secrets --force -o "${OUTPUT_DIR}/secrets.yaml"
  log_success "Secrets generated"

  # YAML list items for machine.certSANs: every control plane IP, one per
  # line, pre-indented to sit under the "certSANs:" key in the patch below.
  local cert_sans
  cert_sans="$(printf '    - %s\n' "${CONTROL_PLANE_NODES[@]}")"

  # Generate configs for all control plane nodes
  log_info "Generating machine configurations..."
  local i node_ip
  for i in "${!CONTROL_PLANE_NODES[@]}"; do
    node_ip="${CONTROL_PLANE_NODES[$i]}"
    log_info "Generating config for control plane node: ${node_ip}"

    # The patch is handed to talosctl as "@<file>", where the file is a
    # process substitution fed by the here-document. The here-document body
    # and its terminator must stay at column 0: the indentation is YAML.
    talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
      --with-secrets "${OUTPUT_DIR}/secrets.yaml" \
      --kubernetes-version="${KUBERNETES_VERSION}" \
      --output-types controlplane \
      --output "${OUTPUT_DIR}/controlplane-${node_ip}.yaml" \
      --force \
      --config-patch @<(cat <<EOF
machine:
  network:
    hostname: cp-${i}
  certSANs:
${cert_sans}
cluster:
  allowSchedulingOnControlPlanes: true
  controlPlane:
    endpoint: ${CLUSTER_ENDPOINT}
EOF
)
  done

  # Generate talosconfig
  talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
    --with-secrets "${OUTPUT_DIR}/secrets.yaml" \
    --output-types talosconfig \
    --force \
    --output "${OUTPUT_DIR}/.talosconfig"

  # Point talosctl at the freshly generated client config
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Register every node as an endpoint; use the first node as default target
  talosctl config endpoint "${CONTROL_PLANE_NODES[@]}"
  talosctl config node "${CONTROL_PLANE_NODES[0]}"

  log_success "All configurations generated in ${OUTPUT_DIR}/"
}
#######################################
# Push each node's generated machine config to that node.
# Nodes are processed in CONTROL_PLANE_NODES order with a 2-second pause
# after each one. The first failed apply logs an error and terminates the
# script with status 1.
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported);
#            NODE_IP (written, loop variable)
# Arguments: none
#######################################
apply_configs() {
  log_info "Applying configurations to nodes..."
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
    log_info "Applying config to ${NODE_IP}..."

    # --insecure is used for the initial bootstrap apply.
    if ! talosctl apply-config \
      --insecure \
      --nodes "${NODE_IP}" \
      --file "${OUTPUT_DIR}/controlplane-${NODE_IP}.yaml"; then
      log_error "Failed to apply configuration to ${NODE_IP}"
      exit 1
    fi
    log_success "Configuration applied to ${NODE_IP}"

    # Short breather before moving on to the next node.
    sleep 2
  done

  log_success "Configurations applied to all nodes"
}
#######################################
# Block until every control plane node answers the Talos API and reports its
# services, i.e. is ready to be bootstrapped.
#
# Phase 1: poll `talosctl version` on each node (up to 60 tries, 5s apart).
# Phase 2: poll `talosctl get services` on each node until the call succeeds
#          and its output mentions "apid" (up to 60 tries, 5s apart).
# Then pause 10 seconds before returning.
#
# The service listing is captured into a variable before matching. Piping
# straight into `grep -q` under `set -o pipefail` can fail spuriously: grep
# exits on the first match and the producer may then die with SIGPIPE.
#
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Arguments: none
# Returns:   0 when all nodes are ready; terminates the script with status 1
#            if any node exhausts its attempts in either phase.
#######################################
wait_for_nodes() {
  log_info "Waiting for nodes to reboot and be ready..."
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  local max_attempts=60
  local node_ip attempt services

  # Phase 1: wait for each node to be accessible
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    log_info "Waiting for node ${node_ip} to be accessible..."
    attempt=0
    while (( attempt < max_attempts )); do
      if talosctl --nodes "${node_ip}" version &> /dev/null; then
        log_success "Node ${node_ip} is responding"
        break
      fi
      attempt=$((attempt + 1))
      sleep 5
    done

    # The counter only reaches max_attempts when no try succeeded.
    if (( attempt == max_attempts )); then
      log_error "Node ${node_ip} did not become accessible in time"
      exit 1
    fi
  done

  # Phase 2: wait for all nodes to be out of maintenance mode
  log_info "Checking that all nodes are out of maintenance mode..."
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    attempt=0
    while (( attempt < max_attempts )); do
      log_info "Checking services on ${node_ip} (attempt $((attempt + 1))/${max_attempts})..."

      # Require both a successful call and an "apid" entry in its output.
      if services="$(talosctl --nodes "${node_ip}" get services 2>&1)" \
        && grep -q "apid" <<< "${services}"; then
        log_success "Node ${node_ip} is out of maintenance mode"
        break
      fi
      attempt=$((attempt + 1))
      sleep 5
    done

    if (( attempt == max_attempts )); then
      log_error "Node ${node_ip} did not exit maintenance mode"
      log_error "Try checking node console or running: talosctl --nodes ${node_ip} get services"
      exit 1
    fi
  done

  # Fixed extra delay before the caller attempts the etcd bootstrap
  log_info "Waiting for etcd to be ready for bootstrap on ${CONTROL_PLANE_NODES[0]}..."
  sleep 10

  log_success "All nodes are ready for bootstrapping"
}
#######################################
# Report whether etcd already appears to be running on the first control
# plane node.
#
# The status text is captured before matching. Piping straight into
# `grep -q` under `set -o pipefail` can fail spuriously: grep exits on the
# first match and the producer may then die with SIGPIPE.
#
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Arguments: none
# Returns:   1 if the status call succeeds and its output matches
#            "STATE.*Running" (already bootstrapped); 0 otherwise.
#            Note the inverted sense: 0 means "go ahead and bootstrap".
#######################################
check_etcd_status() {
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  log_info "Checking if etcd is already bootstrapped..."

  local etcd_status
  if etcd_status="$(talosctl --nodes "${CONTROL_PLANE_NODES[0]}" service etcd status 2>&1)" \
    && grep -q "STATE.*Running" <<< "${etcd_status}"; then
    log_warning "etcd is already running - cluster appears to be bootstrapped"
    return 1
  fi

  return 0
}
#######################################
# Bootstrap etcd on the first control plane node, unless check_etcd_status
# reports that it is already running.
#
# After a successful bootstrap command, polls the etcd service status (up to
# 30 tries, 5s apart) and then sleeps 30 seconds. An etcd that does not
# report Running in time produces only a warning; the script continues.
#
# The status text is captured before matching. Piping straight into
# `grep -q` under `set -o pipefail` can fail spuriously: grep exits on the
# first match and the producer may then die with SIGPIPE.
#
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Arguments: none
# Returns:   0 on success or when bootstrap is skipped; terminates the
#            script with status 1 if the bootstrap command itself fails.
#######################################
bootstrap_cluster() {
  local first_node="${CONTROL_PLANE_NODES[0]}"

  log_info "Bootstrapping etcd on first control plane node: ${first_node}"
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # check_etcd_status returns non-zero when etcd is already running.
  if ! check_etcd_status; then
    log_warning "Skipping bootstrap as cluster is already bootstrapped"
    return 0
  fi

  # Informational probe only: a failure here is expected before bootstrap
  # and does not stop the run.
  log_info "Verifying node ${first_node} is ready for bootstrap..."
  if ! talosctl --nodes "${first_node}" get members &> /dev/null; then
    log_warning "etcd members not yet initialized, proceeding with bootstrap..."
  fi

  log_info "Running bootstrap command..."
  if ! talosctl bootstrap --nodes "${first_node}"; then
    log_error "Failed to bootstrap etcd"
    log_error "This may be because:"
    log_error " 1. The node is still in maintenance mode (check with: talosctl --nodes ${first_node} get services)"
    log_error " 2. The configuration was not properly applied"
    log_error " 3. etcd is already bootstrapped"
    exit 1
  fi
  log_success "Bootstrap command executed successfully"

  log_info "Waiting for etcd to start..."
  local max_attempts=30
  local attempt=0
  local etcd_status
  while (( attempt < max_attempts )); do
    if etcd_status="$(talosctl --nodes "${first_node}" service etcd status 2>&1)" \
      && grep -q "STATE.*Running" <<< "${etcd_status}"; then
      log_success "etcd is running"
      break
    fi
    attempt=$((attempt + 1))
    sleep 5
  done

  # Deliberately best-effort: warn and carry on if etcd never showed Running.
  if (( attempt == max_attempts )); then
    log_warning "etcd did not start in expected time, but continuing..."
  fi

  log_info "Waiting for Kubernetes to initialize..."
  sleep 30
}
#######################################
# Fetch the cluster kubeconfig from the first control plane node into
# OUTPUT_DIR/kubeconfig, retrying on failure.
# Makes up to 20 attempts and sleeps 10 seconds after every failed one.
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Arguments: none
# Returns:   0 once the kubeconfig is written; terminates the script with
#            status 1 when every attempt has failed.
#######################################
get_kubeconfig() {
  log_info "Retrieving kubeconfig..."
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  local max_attempts=20
  local try
  for (( try = 1; try <= max_attempts; try++ )); do
    log_info "Attempting to retrieve kubeconfig (attempt ${try}/${max_attempts})..."

    if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "${OUTPUT_DIR}/kubeconfig" --force; then
      log_success "Kubeconfig saved to ${OUTPUT_DIR}/kubeconfig"
      return 0
    fi

    sleep 10
  done

  # Only reached when no attempt succeeded.
  log_error "Failed to retrieve kubeconfig"
  exit 1
}
#######################################
# Run post-bootstrap checks: a Talos health check (5 minute wait timeout),
# then list Kubernetes nodes and all pods.
# An unhealthy Talos result is reported as a warning and does not stop the
# function; the kubectl listings still run afterwards.
# Globals:   OUTPUT_DIR (read); TALOSCONFIG, KUBECONFIG (exported)
# Arguments: none
#######################################
verify_cluster() {
  log_info "Verifying cluster health..."

  TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
  KUBECONFIG="${OUTPUT_DIR}/kubeconfig"
  export TALOSCONFIG KUBECONFIG

  log_info "Checking Talos health..."
  if ! talosctl health --wait-timeout 5m; then
    log_warning "Talos health check reported issues"
  else
    log_success "Talos cluster is healthy"
  fi

  log_info "Checking Kubernetes nodes..."
  kubectl get nodes -o wide

  log_info "Checking system pods..."
  kubectl get pods -A

  log_success "Cluster verification complete"
}
#######################################
# Print the closing report: cluster name, control plane node list, locations
# of the generated talosconfig/kubeconfig, the export lines needed to use
# them, and a few follow-up commands.
# Globals:   CLUSTER_NAME, CONTROL_PLANE_NODES, OUTPUT_DIR (read);
#            NODE_IP (written, loop variable)
# Arguments: none
# Outputs:   the report on stdout (the headline goes through log_success)
#######################################
print_summary() {
  local rule="=========================================="

  printf '%s\n' "" "${rule}"
  log_success "Talos Cluster Bootstrap Complete!"
  printf '%s\n' \
    "${rule}" \
    "" \
    "Cluster Name: ${CLUSTER_NAME}" \
    "Control Plane Nodes:"

  for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
    printf '%s\n' " - ${NODE_IP}"
  done

  # "\$(pwd)" is escaped on purpose: the user's shell should expand it when
  # the printed line is pasted, not this script.
  printf '%s\n' \
    "" \
    "Configuration Files:" \
    " - TALOSCONFIG: ${OUTPUT_DIR}/.talosconfig" \
    " - KUBECONFIG: ${OUTPUT_DIR}/kubeconfig" \
    "" \
    "To use the cluster, export these variables:" \
    " export TALOSCONFIG=\"\$(pwd)/${OUTPUT_DIR}/.talosconfig\"" \
    " export KUBECONFIG=\"\$(pwd)/${OUTPUT_DIR}/kubeconfig\"" \
    "" \
    "Or run: nix-shell (which sets these automatically)" \
    "" \
    "Useful commands:" \
    " talosctl health" \
    " kubectl get nodes" \
    " kubectl get pods -A" \
    "${rule}"
}
#######################################
# Entry point: announce the run, then execute every bootstrap stage in
# order. Under the script's `set -e`, a stage that fails stops the run.
# Globals:   CLUSTER_NAME, CONTROL_PLANE_NODES (read)
# Arguments: none
#######################################
main() {
  log_info "Starting Talos Cluster Bootstrap"
  log_info "Cluster: ${CLUSTER_NAME}"
  log_info "Nodes: ${CONTROL_PLANE_NODES[*]}"
  printf '\n'

  # Stages run strictly in this order; each depends on the previous one.
  local -a stages=(
    check_prerequisites
    generate_configs
    apply_configs
    wait_for_nodes
    bootstrap_cluster
    get_kubeconfig
    verify_cluster
    print_summary
  )

  local stage
  for stage in "${stages[@]}"; do
    "${stage}"
  done
}
# Script entry point: run the full bootstrap sequence defined in main().
# No command-line arguments are forwarded to main.
main