Add automated scripts for Talos cluster management:

bootstrap-cluster.sh:
- Automated cluster bootstrap from scratch
- Generates Talos secrets and machine configs
- Applies configs to all nodes (10.0.1.3-5)
- Bootstraps etcd and retrieves kubeconfig
- Verifies cluster health

check-cluster-status.sh:
- Comprehensive cluster health diagnostics
- Checks Talos services, etcd, and Kubernetes components
- Displays node status and running pods
- Useful for troubleshooting bootstrap issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
368 lines
10 KiB
Bash
Executable File
368 lines
10 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Bootstraps a Talos control plane: generates secrets and machine configs,
# applies them to every node in CONTROL_PLANE_NODES, bootstraps etcd on the
# first node, retrieves a kubeconfig and runs health checks.
# Requires talosctl and kubectl on PATH.

set -euo pipefail

# Configuration
readonly CLUSTER_NAME="talos-cluster"
CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
readonly CONTROL_PLANE_NODES
# The Kubernetes API endpoint is served by the first control plane node;
# deriving it keeps it in sync with the node list.
readonly CLUSTER_ENDPOINT="https://${CONTROL_PLANE_NODES[0]}:6443"
readonly KUBERNETES_VERSION="1.33.0"
# Directory (relative to the working directory) receiving all generated files.
readonly OUTPUT_DIR="testing1"

# Colors for output (backslash escapes are expanded by the log helpers)
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color
|
|
|
|
#######################################
# Prints an informational message to stdout, tagged with a blue [INFO].
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text; printed verbatim (backslashes are not
#                 interpreted). A missing argument prints an empty message.
#######################################
log_info() {
  # %b expands the escape sequences stored in the color variables only.
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "${1-}"
}
|
|
|
|
#######################################
# Prints a success message to stdout, tagged with a green [SUCCESS].
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text; printed verbatim (backslashes are not
#                 interpreted). A missing argument prints an empty message.
#######################################
log_success() {
  # %b expands the escape sequences stored in the color variables only.
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}
|
|
|
|
#######################################
# Prints a warning to stderr, tagged with a yellow [WARNING].
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text; printed verbatim (backslashes are not
#                 interpreted). A missing argument prints an empty message.
#######################################
log_warning() {
  # Diagnostics go to stderr so they are not mixed into captured stdout.
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "${1-}" >&2
}
|
|
|
|
#######################################
# Prints an error message to stderr, tagged with a red [ERROR].
# Globals:   RED, NC (read)
# Arguments: $1 - message text; printed verbatim (backslashes are not
#                 interpreted). A missing argument prints an empty message.
#######################################
log_error() {
  # Errors belong on stderr so they stay visible when stdout is redirected.
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}" >&2
}
|
|
|
|
#######################################
# Check prerequisites: verifies that the required CLI tools are available.
# Every missing tool is reported before exiting, so a single run shows the
# complete list.
# Outputs:   progress and error messages via the log_* helpers
# Returns:   0 when talosctl and kubectl are both found; otherwise exits the
#            script with status 1
#######################################
check_prerequisites() {
  local tool
  local missing=0

  log_info "Checking prerequisites..."

  for tool in talosctl kubectl; do
    if ! command -v "${tool}" > /dev/null 2>&1; then
      log_error "${tool} not found. Please run 'nix-shell' first."
      missing=1
    fi
  done

  if ((missing)); then
    exit 1
  fi

  log_success "All prerequisites met"
}
|
|
|
|
#######################################
# Generate Talos secrets and configurations.
# Writes secrets.yaml, one controlplane-<ip>.yaml per control plane node and
# a .talosconfig into OUTPUT_DIR, then points talosctl at the new config.
# Globals:   CLUSTER_NAME, CLUSTER_ENDPOINT, KUBERNETES_VERSION, OUTPUT_DIR,
#            CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Outputs:   progress messages via the log_* helpers
# Returns:   non-zero if mkdir or a talosctl invocation fails
#######################################
generate_configs() {
  local secrets_file="${OUTPUT_DIR}/secrets.yaml"
  local idx node_ip san_entries patch

  log_info "Generating Talos secrets for cluster: ${CLUSTER_NAME}"

  # Create output directory if it doesn't exist
  mkdir -p "${OUTPUT_DIR}"

  # Regenerating secrets invalidates the credentials of a cluster that was
  # created from the previous bundle, so an existing non-empty bundle is kept.
  if [[ -s "${secrets_file}" ]]; then
    log_warning "Reusing existing secrets in ${secrets_file} (delete the file to generate new ones)"
  else
    talosctl gen secrets --force -o "${secrets_file}"
    log_success "Secrets generated"
  fi

  # Generate one config per control plane node
  log_info "Generating machine configurations..."

  # Every control plane IP is listed as a certificate SAN on every node.
  san_entries="$(printf '    - %s\n' "${CONTROL_PLANE_NODES[@]}")"

  for idx in "${!CONTROL_PLANE_NODES[@]}"; do
    node_ip="${CONTROL_PLANE_NODES[${idx}]}"
    log_info "Generating config for control plane node: ${node_ip}"

    # Per-node patch: hostname is derived from the node's index in the list.
    patch="$(
      cat <<EOF
machine:
  network:
    hostname: cp-${idx}
  certSANs:
${san_entries}
cluster:
  allowSchedulingOnControlPlanes: true
  controlPlane:
    endpoint: ${CLUSTER_ENDPOINT}
EOF
    )"

    talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
      --with-secrets "${secrets_file}" \
      --kubernetes-version="${KUBERNETES_VERSION}" \
      --output-types controlplane \
      --output "${OUTPUT_DIR}/controlplane-${node_ip}.yaml" \
      --force \
      --config-patch @<(printf '%s\n' "${patch}")
  done

  # Generate talosconfig
  talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
    --with-secrets "${secrets_file}" \
    --output-types talosconfig \
    --force \
    --output "${OUTPUT_DIR}/.talosconfig"

  # Configure talosctl to use the new config
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Add all endpoints to talosconfig; the first node is the default target
  talosctl config endpoint "${CONTROL_PLANE_NODES[@]}"
  talosctl config node "${CONTROL_PLANE_NODES[0]}"

  log_success "All configurations generated in ${OUTPUT_DIR}/"
}
|
|
|
|
#######################################
# Apply configurations to nodes.
# Pushes OUTPUT_DIR/controlplane-<ip>.yaml to each control plane node using
# talosctl apply-config --insecure.
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Outputs:   progress and error messages via the log_* helpers
# Returns:   0 when every node accepted its config; exits the script with
#            status 1 if a config file is missing or an apply fails
#######################################
apply_configs() {
  local node_ip config_file

  log_info "Applying configurations to nodes..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Validate every input first so a missing file cannot leave the cluster
  # half-configured.
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    config_file="${OUTPUT_DIR}/controlplane-${node_ip}.yaml"
    if [[ ! -f "${config_file}" ]]; then
      log_error "Machine config not found for ${node_ip}: ${config_file}"
      exit 1
    fi
  done

  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    config_file="${OUTPUT_DIR}/controlplane-${node_ip}.yaml"
    log_info "Applying config to ${node_ip}..."

    # Apply config with --insecure flag for initial bootstrap
    if ! talosctl apply-config \
      --insecure \
      --nodes "${node_ip}" \
      --file "${config_file}"; then
      log_error "Failed to apply configuration to ${node_ip}"
      exit 1
    fi
    log_success "Configuration applied to ${node_ip}"

    # Brief pause between nodes
    sleep 2
  done

  log_success "Configurations applied to all nodes"
}
|
|
|
|
#######################################
# Wait for nodes to be ready.
# Phase 1 polls `talosctl version` on each node until it answers.
# Phase 2 polls `talosctl get services` on each node until the output
# mentions apid, which this script treats as "out of maintenance mode".
# Each poll is tried up to 60 times with a 5 second pause after a failure.
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Outputs:   progress and error messages via the log_* helpers
# Returns:   0 when all nodes passed both phases; exits the script with
#            status 1 when a node exhausts its attempts
#######################################
wait_for_nodes() {
  local -r max_attempts=60
  local node_ip attempt ready services

  log_info "Waiting for nodes to reboot and be ready..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Wait for each node to be accessible
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    log_info "Waiting for node ${node_ip} to be accessible..."

    ready=0
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
      if talosctl --nodes "${node_ip}" version > /dev/null 2>&1; then
        ready=1
        break
      fi
      sleep 5
    done

    if ((!ready)); then
      log_error "Node ${node_ip} did not become accessible in time"
      exit 1
    fi
    log_success "Node ${node_ip} is responding"
  done

  # Wait for all nodes to be out of maintenance mode and services ready
  log_info "Checking that all nodes are out of maintenance mode..."

  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    ready=0
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
      log_info "Checking services on ${node_ip} (attempt ${attempt}/${max_attempts})..."

      # Capture the output before grepping: with `set -o pipefail`, piping
      # straight into `grep -q` can fail spuriously when grep exits early
      # and talosctl receives SIGPIPE.
      if services="$(talosctl --nodes "${node_ip}" get services 2>&1)" \
        && grep -q "apid" <<< "${services}"; then
        ready=1
        break
      fi
      sleep 5
    done

    if ((!ready)); then
      log_error "Node ${node_ip} did not exit maintenance mode"
      log_error "Try checking node console or running: talosctl --nodes ${node_ip} get services"
      exit 1
    fi
    log_success "Node ${node_ip} is out of maintenance mode"
  done

  # Fixed grace period before the caller bootstraps etcd on the first node
  log_info "Waiting for etcd to be ready for bootstrap on ${CONTROL_PLANE_NODES[0]}..."
  sleep 10

  log_success "All nodes are ready for bootstrapping"
}
|
|
|
|
#######################################
# Check if etcd is already bootstrapped.
# Queries `talosctl service etcd status` on the first control plane node and
# looks for a STATE ... Running line.
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Outputs:   progress and warning messages via the log_* helpers
# Returns:   1 when etcd reports Running (already bootstrapped);
#            0 otherwise, including when the query itself fails
#######################################
check_etcd_status() {
  local etcd_status

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  log_info "Checking if etcd is already bootstrapped..."

  # Capture first, then grep: avoids a spurious pipefail/SIGPIPE failure
  # when `grep -q` exits before talosctl has finished writing.
  if etcd_status="$(talosctl --nodes "${CONTROL_PLANE_NODES[0]}" service etcd status 2>&1)" \
    && grep -q "STATE.*Running" <<< "${etcd_status}"; then
    log_warning "etcd is already running - cluster appears to be bootstrapped"
    return 1
  fi

  return 0
}
|
|
|
|
#######################################
# Bootstrap etcd on the first control plane node.
# Skips the bootstrap when check_etcd_status reports etcd as running.
# After a successful `talosctl bootstrap`, polls the etcd service state up
# to 30 times (5 seconds apart) and then pauses 30 seconds.
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Outputs:   progress, warning and error messages via the log_* helpers
# Returns:   0 on success or when the bootstrap is skipped; exits the script
#            with status 1 when `talosctl bootstrap` fails. etcd not
#            reaching Running in time only produces a warning.
#######################################
bootstrap_cluster() {
  local -r first_node="${CONTROL_PLANE_NODES[0]}"
  local -r max_attempts=30
  local attempt etcd_status
  local etcd_running=0

  log_info "Bootstrapping etcd on first control plane node: ${first_node}"

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Check if already bootstrapped (check_etcd_status returns 1 in that case)
  if ! check_etcd_status; then
    log_warning "Skipping bootstrap as cluster is already bootstrapped"
    return 0
  fi

  # Informational probe only: a failure here does not block the bootstrap
  log_info "Verifying node ${first_node} is ready for bootstrap..."
  if ! talosctl --nodes "${first_node}" get members > /dev/null 2>&1; then
    log_warning "etcd members not yet initialized, proceeding with bootstrap..."
  fi

  # Perform bootstrap
  log_info "Running bootstrap command..."
  if ! talosctl bootstrap --nodes "${first_node}"; then
    log_error "Failed to bootstrap etcd"
    log_error "This may be because:"
    log_error " 1. The node is still in maintenance mode (check with: talosctl --nodes ${first_node} get services)"
    log_error " 2. The configuration was not properly applied"
    log_error " 3. etcd is already bootstrapped"
    exit 1
  fi
  log_success "Bootstrap command executed successfully"

  # Wait for etcd to come up
  log_info "Waiting for etcd to start..."
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # Capture first, then grep: avoids a spurious pipefail/SIGPIPE failure
    # when `grep -q` exits before talosctl has finished writing.
    if etcd_status="$(talosctl --nodes "${first_node}" service etcd status 2>&1)" \
      && grep -q "STATE.*Running" <<< "${etcd_status}"; then
      etcd_running=1
      log_success "etcd is running"
      break
    fi
    sleep 5
  done

  if ((!etcd_running)); then
    log_warning "etcd did not start in expected time, but continuing..."
  fi

  log_info "Waiting for Kubernetes to initialize..."
  sleep 30
}
|
|
|
|
#######################################
# Retrieve kubeconfig.
# Asks the first control plane node for a kubeconfig and writes it to
# OUTPUT_DIR/kubeconfig, retrying up to 20 times with 10 seconds between
# attempts.
# Globals:   OUTPUT_DIR, CONTROL_PLANE_NODES (read); TALOSCONFIG (exported)
# Outputs:   progress and error messages via the log_* helpers
# Returns:   0 once the kubeconfig was written; exits the script with
#            status 1 after the last failed attempt
#######################################
get_kubeconfig() {
  local -r max_attempts=20
  local attempt

  log_info "Retrieving kubeconfig..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    log_info "Attempting to retrieve kubeconfig (attempt ${attempt}/${max_attempts})..."

    if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "${OUTPUT_DIR}/kubeconfig" --force; then
      log_success "Kubeconfig saved to ${OUTPUT_DIR}/kubeconfig"
      return 0
    fi

    # Waiting is only useful when another attempt follows.
    if ((attempt < max_attempts)); then
      sleep 10
    fi
  done

  log_error "Failed to retrieve kubeconfig"
  exit 1
}
|
|
|
|
#######################################
# Verify cluster health.
# Runs `talosctl health` (5 minute wait timeout), then lists Kubernetes nodes
# and all pods.
# Globals:   OUTPUT_DIR (read); TALOSCONFIG, KUBECONFIG (exported)
# Outputs:   talosctl/kubectl output plus messages via the log_* helpers
# Returns:   0 when both kubectl queries succeed (a failed Talos health
#            check only produces a warning); exits the script with status 1
#            when a kubectl query fails
#######################################
verify_cluster() {
  log_info "Verifying cluster health..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
  export KUBECONFIG="${OUTPUT_DIR}/kubeconfig"

  log_info "Checking Talos health..."
  if talosctl health --wait-timeout 5m; then
    log_success "Talos cluster is healthy"
  else
    log_warning "Talos health check reported issues"
  fi

  # Report kubectl failures explicitly instead of letting `set -e` abort the
  # script without any explanation.
  log_info "Checking Kubernetes nodes..."
  if ! kubectl get nodes -o wide; then
    log_error "Failed to list Kubernetes nodes"
    exit 1
  fi

  log_info "Checking system pods..."
  if ! kubectl get pods -A; then
    log_error "Failed to list pods"
    exit 1
  fi

  log_success "Cluster verification complete"
}
|
|
|
|
#######################################
# Print summary: cluster name, control plane nodes, generated file locations
# and follow-up commands.
# Globals:   CLUSTER_NAME, CONTROL_PLANE_NODES, OUTPUT_DIR (read)
# Outputs:   the summary on stdout; the headline goes through log_success
#######################################
print_summary() {
  local -r rule="=========================================="
  local node_ip

  printf '\n%s\n' "${rule}"
  log_success "Talos Cluster Bootstrap Complete!"
  printf '%s\n\n' "${rule}"

  printf 'Cluster Name: %s\n' "${CLUSTER_NAME}"
  printf 'Control Plane Nodes:\n'
  for node_ip in "${CONTROL_PLANE_NODES[@]}"; do
    printf ' - %s\n' "${node_ip}"
  done

  printf '\nConfiguration Files:\n'
  printf ' - TALOSCONFIG: %s/.talosconfig\n' "${OUTPUT_DIR}"
  printf ' - KUBECONFIG: %s/kubeconfig\n' "${OUTPUT_DIR}"

  printf '\nTo use the cluster, export these variables:\n'
  # $(pwd) is printed literally on purpose: the user's shell expands it when
  # the line is pasted.
  # shellcheck disable=SC2016
  printf ' export TALOSCONFIG="$(pwd)/%s/.talosconfig"\n' "${OUTPUT_DIR}"
  # shellcheck disable=SC2016
  printf ' export KUBECONFIG="$(pwd)/%s/kubeconfig"\n' "${OUTPUT_DIR}"

  printf '\nOr run: nix-shell (which sets these automatically)\n'

  printf '\nUseful commands:\n'
  printf ' %s\n' "talosctl health" "kubectl get nodes" "kubectl get pods -A"
  printf '%s\n' "${rule}"
}
|
|
|
|
#######################################
# Main execution: runs the bootstrap steps in order. Under `set -e`, a step
# that returns non-zero stops the sequence.
# Globals:   CLUSTER_NAME, CONTROL_PLANE_NODES (read)
# Arguments: none are used; any that are passed are ignored
# Outputs:   a start banner via log_info, then each step's own output
#######################################
main() {
  log_info "Starting Talos Cluster Bootstrap"
  log_info "Cluster: ${CLUSTER_NAME}"
  log_info "Nodes: ${CONTROL_PLANE_NODES[*]}"
  echo ""

  check_prerequisites
  generate_configs
  apply_configs
  wait_for_nodes
  bootstrap_cluster
  get_kubeconfig
  verify_cluster
  print_summary
}
|
|
|
|
# Run main function, forwarding the script's command-line arguments
main "$@"
|