feat(scripts): add cluster bootstrap and status scripts
Add automated scripts for Talos cluster management:

bootstrap-cluster.sh:
- Automated cluster bootstrap from scratch
- Generates Talos secrets and machine configs
- Applies configs to all nodes (10.0.1.3-5)
- Bootstraps etcd and retrieves kubeconfig
- Verifies cluster health

check-cluster-status.sh:
- Comprehensive cluster health diagnostics
- Checks Talos services, etcd, and Kubernetes components
- Displays node status and running pods
- Useful for troubleshooting bootstrap issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
2ed1e82953
commit
6c292da5f1
367
bootstrap-cluster.sh
Executable file
367
bootstrap-cluster.sh
Executable file
@@ -0,0 +1,367 @@
|
||||
#!/usr/bin/env bash
#
# bootstrap-cluster.sh — bootstrap a 3-node Talos control plane from scratch:
# generate secrets and machine configs, apply them to each node, bootstrap
# etcd, retrieve the kubeconfig, and verify cluster health.

set -euo pipefail

# Configuration
# readonly: these are constants; nothing below reassigns them.
readonly CLUSTER_NAME="talos-cluster"
readonly CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
# NOTE(review): the endpoint pins the first node; a VIP or load balancer in
# front of all three control-plane nodes would avoid a single point of failure.
readonly CLUSTER_ENDPOINT="https://10.0.1.3:6443"
readonly KUBERNETES_VERSION="1.33.0"
readonly OUTPUT_DIR="testing1"

# Colors for output (ANSI escapes consumed by the log_* helpers)
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color
|
||||
|
||||
#######################################
# Logging helpers.
# %b expands the ANSI color escapes; the message itself goes through %s so it
# is printed verbatim (unlike echo -e, which also interprets backslash escapes
# inside the message and mishandles messages that look like echo options,
# e.g. "-n"). Warnings and errors go to stderr so piped stdout stays clean.
# Arguments: $1 - message text
#######################################
log_info() {
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"
}

log_success() {
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1"
}

log_warning() {
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1" >&2
}

log_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
|
||||
|
||||
# Check prerequisites
#
# Verifies that every required CLI tool is on PATH before any cluster work
# starts; exits 1 with a hint about nix-shell if one is missing.
check_prerequisites() {
  log_info "Checking prerequisites..."

  local tool
  for tool in talosctl kubectl; do
    if ! command -v "${tool}" &> /dev/null; then
      log_error "${tool} not found. Please run 'nix-shell' first."
      exit 1
    fi
  done

  log_success "All prerequisites met"
}
|
||||
|
||||
# Generate Talos secrets and configurations
#
# Produces, under ${OUTPUT_DIR}:
#   - secrets.yaml             shared cluster secrets
#   - controlplane-<ip>.yaml   per-node machine config (patched per node)
#   - .talosconfig             client config pointing at all endpoints
# Exports TALOSCONFIG for the remainder of this script's process.
generate_configs() {
  log_info "Generating Talos secrets for cluster: ${CLUSTER_NAME}"

  # Create output directory if it doesn't exist
  mkdir -p "${OUTPUT_DIR}"

  # Generate secrets
  # --force overwrites secrets left over from a previous run.
  talosctl gen secrets --force -o "${OUTPUT_DIR}/secrets.yaml"
  log_success "Secrets generated"

  # Generate configs for all 3 control plane nodes
  log_info "Generating machine configurations..."

  for i in "${!CONTROL_PLANE_NODES[@]}"; do
    NODE_IP="${CONTROL_PLANE_NODES[$i]}"
    log_info "Generating config for control plane node: ${NODE_IP}"

    # Per-node patch: unique hostname (cp-<index>) and every control-plane IP
    # as a certificate SAN so talosctl can reach any node over TLS.
    # The @<(...) form feeds the heredoc to --config-patch as a file path.
    # NOTE(review): YAML indentation below was reconstructed (the paste lost
    # it); certSANs is assumed to sit at machine level — verify against the
    # Talos machine-config schema.
    talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
      --with-secrets "${OUTPUT_DIR}/secrets.yaml" \
      --kubernetes-version="${KUBERNETES_VERSION}" \
      --output-types controlplane \
      --output "${OUTPUT_DIR}/controlplane-${NODE_IP}.yaml" \
      --force \
      --config-patch @<(cat <<EOF
machine:
  network:
    hostname: cp-${i}
  certSANs:
    - ${NODE_IP}
    - 10.0.1.3
    - 10.0.1.4
    - 10.0.1.5
cluster:
  allowSchedulingOnControlPlanes: true
  controlPlane:
    endpoint: ${CLUSTER_ENDPOINT}
EOF
)
  done

  # Generate talosconfig
  talosctl gen config "${CLUSTER_NAME}" "${CLUSTER_ENDPOINT}" \
    --with-secrets "${OUTPUT_DIR}/secrets.yaml" \
    --output-types talosconfig \
    --force \
    --output "${OUTPUT_DIR}/.talosconfig"

  # Configure talosctl to use the new config
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Add all endpoints to talosconfig
  # NOTE(review): subcommand spelling 'endpoint'/'node' — newer talosctl
  # releases document these as 'endpoints'/'nodes'; confirm against the
  # installed version.
  talosctl config endpoint "${CONTROL_PLANE_NODES[@]}"
  talosctl config node "${CONTROL_PLANE_NODES[0]}"

  log_success "All configurations generated in ${OUTPUT_DIR}/"
}
|
||||
|
||||
# Apply configurations to nodes
#
# Pushes each node's machine config over the insecure (maintenance-mode)
# API, aborting the script on the first failure. A short pause separates
# the nodes so they do not all reboot at the exact same instant.
apply_configs() {
  log_info "Applying configurations to nodes..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  local node
  for node in "${CONTROL_PLANE_NODES[@]}"; do
    log_info "Applying config to ${node}..."

    # Apply config with --insecure flag for initial bootstrap
    talosctl apply-config \
      --insecure \
      --nodes "${node}" \
      --file "${OUTPUT_DIR}/controlplane-${node}.yaml" \
      || { log_error "Failed to apply configuration to ${node}"; exit 1; }
    log_success "Configuration applied to ${node}"

    # Brief pause between nodes
    sleep 2
  done

  log_success "Configurations applied to all nodes"
}
|
||||
|
||||
# Wait for nodes to be ready
#
# Phase 1: poll each node (5s interval, up to 60 tries) until the Talos API
# answers a 'version' call at all.
# Phase 2: poll each node until 'get services' lists apid, i.e. the node has
# left maintenance mode and is running with an applied machine config.
# Exits 1 if any node fails either phase; ends with a short grace period so
# etcd is ready for the bootstrap step.
wait_for_nodes() {
  log_info "Waiting for nodes to reboot and be ready..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Wait for each node to be accessible
  for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
    log_info "Waiting for node ${NODE_IP} to be accessible..."

    local max_attempts=60
    local attempt=0

    while [ $attempt -lt $max_attempts ]; do
      # Fix: the original '&> /dev/null 2>&1' was redundant — '&>' already
      # redirects both stdout and stderr.
      if talosctl --nodes "${NODE_IP}" version &> /dev/null; then
        log_success "Node ${NODE_IP} is responding"
        break
      fi

      attempt=$((attempt + 1))
      sleep 5
    done

    # attempt only reaches max_attempts when the loop never broke out.
    if [ $attempt -eq $max_attempts ]; then
      log_error "Node ${NODE_IP} did not become accessible in time"
      exit 1
    fi
  done

  # Wait for all nodes to be out of maintenance mode and services ready
  log_info "Checking that all nodes are out of maintenance mode..."

  for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
    local max_attempts=60
    local attempt=0

    while [ $attempt -lt $max_attempts ]; do
      log_info "Checking services on ${NODE_IP} (attempt $((attempt + 1))/${max_attempts})..."

      # Get service state - if this succeeds, node is configured
      # (a node in maintenance mode does not expose the apid service entry)
      if talosctl --nodes "${NODE_IP}" get services 2>&1 | grep -q "apid"; then
        log_success "Node ${NODE_IP} is out of maintenance mode"
        break
      fi

      attempt=$((attempt + 1))
      sleep 5
    done

    if [ $attempt -eq $max_attempts ]; then
      log_error "Node ${NODE_IP} did not exit maintenance mode"
      log_error "Try checking node console or running: talosctl --nodes ${NODE_IP} get services"
      exit 1
    fi
  done

  # Additional wait to ensure etcd service is ready for bootstrap
  log_info "Waiting for etcd to be ready for bootstrap on ${CONTROL_PLANE_NODES[0]}..."
  sleep 10

  log_success "All nodes are ready for bootstrapping"
}
|
||||
|
||||
# Check if etcd is already bootstrapped
#
# Return convention (deliberately inverted): 0 means "etcd is NOT running,
# safe to bootstrap"; 1 means etcd already runs and bootstrap must be skipped.
check_etcd_status() {
  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  log_info "Checking if etcd is already bootstrapped..."

  # Check if etcd service is running
  local etcd_report
  etcd_report="$(talosctl --nodes "${CONTROL_PLANE_NODES[0]}" service etcd status 2>&1 || true)"
  if grep -q "STATE.*Running" <<<"${etcd_report}"; then
    log_warning "etcd is already running - cluster appears to be bootstrapped"
    return 1
  fi

  return 0
}
|
||||
|
||||
# Bootstrap etcd on the first control plane node
#
# Skips the bootstrap entirely if check_etcd_status reports that etcd is
# already running (its inverted return convention: non-zero == running).
# Exits 1 when the bootstrap command itself fails; a slow etcd start after a
# successful bootstrap is only a warning. Ends with a fixed 30s grace period
# for the Kubernetes control plane to come up.
bootstrap_cluster() {
  log_info "Bootstrapping etcd on first control plane node: ${CONTROL_PLANE_NODES[0]}"

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  # Check if already bootstrapped
  # (check_etcd_status returns non-zero exactly when etcd is already running)
  if ! check_etcd_status; then
    log_warning "Skipping bootstrap as cluster is already bootstrapped"
    return 0
  fi

  # Verify the node is ready for bootstrap
  # Informational only: a missing member list is expected pre-bootstrap and
  # does not abort the run.
  log_info "Verifying node ${CONTROL_PLANE_NODES[0]} is ready for bootstrap..."
  if ! talosctl --nodes "${CONTROL_PLANE_NODES[0]}" get members &> /dev/null; then
    log_warning "etcd members not yet initialized, proceeding with bootstrap..."
  fi

  # Perform bootstrap
  log_info "Running bootstrap command..."
  if talosctl bootstrap --nodes "${CONTROL_PLANE_NODES[0]}"; then
    log_success "Bootstrap command executed successfully"
  else
    log_error "Failed to bootstrap etcd"
    log_error "This may be because:"
    log_error " 1. The node is still in maintenance mode (check with: talosctl --nodes ${CONTROL_PLANE_NODES[0]} get services)"
    log_error " 2. The configuration was not properly applied"
    log_error " 3. etcd is already bootstrapped"
    exit 1
  fi

  # Wait for etcd to come up
  # Poll the service state every 5s, for up to ~2.5 minutes total.
  log_info "Waiting for etcd to start..."
  local max_attempts=30
  local attempt=0

  while [ $attempt -lt $max_attempts ]; do
    if talosctl --nodes "${CONTROL_PLANE_NODES[0]}" service etcd status 2>&1 | grep -q "STATE.*Running"; then
      log_success "etcd is running"
      break
    fi

    attempt=$((attempt + 1))
    sleep 5
  done

  if [ $attempt -eq $max_attempts ]; then
    # Non-fatal: the kubeconfig retrieval step retries on its own.
    log_warning "etcd did not start in expected time, but continuing..."
  fi

  log_info "Waiting for Kubernetes to initialize..."
  sleep 30
}
|
||||
|
||||
# Retrieve kubeconfig
#
# Retries the fetch up to 20 times, 10 seconds apart, writing the result to
# ${OUTPUT_DIR}/kubeconfig; exits 1 if it can never be retrieved.
get_kubeconfig() {
  log_info "Retrieving kubeconfig..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"

  local max_attempts=20
  local attempt

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    log_info "Attempting to retrieve kubeconfig (attempt ${attempt}/${max_attempts})..."

    if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" "${OUTPUT_DIR}/kubeconfig" --force; then
      log_success "Kubeconfig saved to ${OUTPUT_DIR}/kubeconfig"
      return 0
    fi

    sleep 10
  done

  log_error "Failed to retrieve kubeconfig"
  exit 1
}
|
||||
|
||||
# Verify cluster health
#
# Runs 'talosctl health' (problems are only warned about, not fatal) and then
# prints Kubernetes node and pod listings for a quick visual confirmation.
verify_cluster() {
  log_info "Verifying cluster health..."

  export TALOSCONFIG="${OUTPUT_DIR}/.talosconfig"
  export KUBECONFIG="${OUTPUT_DIR}/kubeconfig"

  log_info "Checking Talos health..."
  if ! talosctl health --wait-timeout 5m; then
    log_warning "Talos health check reported issues"
  else
    log_success "Talos cluster is healthy"
  fi

  log_info "Checking Kubernetes nodes..."
  kubectl get nodes -o wide

  log_info "Checking system pods..."
  kubectl get pods -A

  log_success "Cluster verification complete"
}
|
||||
|
||||
# Print summary
#
# Human-readable wrap-up: where the generated configs live and how to point
# talosctl/kubectl at the new cluster. Text is emitted with printf batches
# instead of one echo per line; output is unchanged.
print_summary() {
  local node

  printf '%s\n' \
    "" \
    "=========================================="
  log_success "Talos Cluster Bootstrap Complete!"
  printf '%s\n' \
    "==========================================" \
    "" \
    "Cluster Name: ${CLUSTER_NAME}" \
    "Control Plane Nodes:"
  for node in "${CONTROL_PLANE_NODES[@]}"; do
    printf '%s\n' " - ${node}"
  done
  printf '%s\n' \
    "" \
    "Configuration Files:" \
    " - TALOSCONFIG: ${OUTPUT_DIR}/.talosconfig" \
    " - KUBECONFIG: ${OUTPUT_DIR}/kubeconfig" \
    "" \
    "To use the cluster, export these variables:" \
    " export TALOSCONFIG=\"\$(pwd)/${OUTPUT_DIR}/.talosconfig\"" \
    " export KUBECONFIG=\"\$(pwd)/${OUTPUT_DIR}/kubeconfig\"" \
    "" \
    "Or run: nix-shell (which sets these automatically)" \
    "" \
    "Useful commands:" \
    " talosctl health" \
    " kubectl get nodes" \
    " kubectl get pods -A" \
    "=========================================="
}
|
||||
|
||||
# Main execution
#
# Announces the run, then drives the bootstrap pipeline step by step in a
# fixed order; any step that exits non-zero (under set -e) stops the run.
main() {
  log_info "Starting Talos Cluster Bootstrap"
  log_info "Cluster: ${CLUSTER_NAME}"
  log_info "Nodes: ${CONTROL_PLANE_NODES[*]}"
  echo ""

  local step
  for step in \
    check_prerequisites \
    generate_configs \
    apply_configs \
    wait_for_nodes \
    bootstrap_cluster \
    get_kubeconfig \
    verify_cluster \
    print_summary; do
    "${step}"
  done
}
|
||||
|
||||
# Run main function, forwarding any CLI arguments for future use.
main "$@"
|
||||
148
check-cluster-status.sh
Executable file
148
check-cluster-status.sh
Executable file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env bash
#
# check-cluster-status.sh — diagnostics for a Talos cluster bootstrap.
# Probes each control-plane node (reachability, maintenance mode, services,
# etcd) and summarizes whether the cluster looks bootstrapped.
# Requires: talosctl, kubectl, and a talosconfig
# (default: testing1/.talosconfig; override via the TALOSCONFIG env var).

set -euo pipefail

# Configuration
# readonly: constants; nothing below reassigns them. TALOSCONFIG stays
# writable because it honors an environment override.
readonly CONTROL_PLANE_NODES=("10.0.1.3" "10.0.1.4" "10.0.1.5")
TALOSCONFIG="${TALOSCONFIG:-testing1/.talosconfig}"

# Colors for output (ANSI escapes consumed by the log_* helpers)
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly NC='\033[0m'
|
||||
|
||||
#######################################
# Logging helpers.
# %b expands the ANSI color escapes; the message goes through %s so it is
# printed verbatim (unlike echo -e, which also interprets backslash escapes
# inside the message and mishandles echo-option-like messages such as "-n").
# Warnings and errors go to stderr so piped stdout stays clean.
# Arguments: $1 - message text
#######################################
log_info() {
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"
}

log_success() {
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1"
}

log_warning() {
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1" >&2
}

log_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
|
||||
|
||||
# Check if talosconfig exists — nothing below can talk to the nodes without
# it, so bail out early with a hint about the bootstrap script.
if [[ ! -f "$TALOSCONFIG" ]]; then
  log_error "TALOSCONFIG not found at: $TALOSCONFIG"
  log_info "Have you run ./bootstrap-cluster.sh yet?"
  exit 1
fi

export TALOSCONFIG

printf '%s\n' \
  "==========================================" \
  "Talos Cluster Status Check" \
  "==========================================" \
  ""
|
||||
|
||||
# Check each node
# For every control-plane node: reachability, Talos version, maintenance-mode
# state, service list, etcd service state, and etcd membership. An
# unreachable node is skipped and diagnostics continue with the next node.
for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
  echo "==================== Node: $NODE_IP ===================="

  # Check if node is accessible
  log_info "Checking if node is accessible..."
  if talosctl --nodes "$NODE_IP" version &> /dev/null; then
    log_success "Node is accessible"
  else
    log_error "Node is NOT accessible"
    echo ""
    continue
  fi

  # Check version
  echo ""
  log_info "Talos version:"
  talosctl --nodes "$NODE_IP" version --short 2>&1 || log_error "Could not get version"

  # Check if in maintenance mode
  # A node still in maintenance mode rejects authenticated resource reads
  # like 'get services', so a failure here implies no config was applied.
  echo ""
  log_info "Checking if node is in maintenance mode..."
  if talosctl --nodes "$NODE_IP" get services &> /dev/null; then
    log_success "Node is OUT of maintenance mode (configured)"
  else
    log_error "Node is IN MAINTENANCE MODE - configuration not applied!"
    log_info "To apply config, run:"
    log_info " talosctl apply-config --insecure --nodes $NODE_IP --file testing1/controlplane-${NODE_IP}.yaml"
  fi

  # Check services (first 20 lines only, to keep the report short)
  echo ""
  log_info "Service status:"
  talosctl --nodes "$NODE_IP" services 2>&1 | head -20 || log_error "Could not get services"

  # Check etcd status
  echo ""
  log_info "etcd status:"
  if talosctl --nodes "$NODE_IP" service etcd status 2>&1 | grep -q "STATE.*Running"; then
    log_success "etcd is RUNNING"
    talosctl --nodes "$NODE_IP" service etcd status 2>&1 | grep "STATE"
  else
    log_warning "etcd is NOT running"
    talosctl --nodes "$NODE_IP" service etcd status 2>&1 | grep "STATE" || log_info "etcd not initialized yet"
  fi

  # Check if etcd members exist
  # grep without -q on purpose: matching member lines are printed as part of
  # the report while also acting as the condition.
  echo ""
  log_info "etcd members:"
  if talosctl --nodes "$NODE_IP" get members 2>&1 | grep -v "^NODE" | grep -v "not found"; then
    log_success "etcd members found"
  else
    log_warning "No etcd members - cluster needs bootstrap"
  fi

  echo ""
done
|
||||
|
||||
# Overall cluster status
echo "==================== Overall Cluster Status ===================="

# Check if any node has etcd running
# (a single running etcd is enough to call the cluster bootstrapped)
ETCD_RUNNING=false
for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
  if talosctl --nodes "$NODE_IP" service etcd status 2>&1 | grep -q "STATE.*Running"; then
    ETCD_RUNNING=true
    break
  fi
done

echo ""
# Fix: compare the flag as a string instead of executing it as a command —
# the original 'if $ETCD_RUNNING' ran the value as a program, which breaks
# if the variable is ever empty or garbled.
if [[ "$ETCD_RUNNING" == "true" ]]; then
  log_success "Cluster appears to be bootstrapped (etcd running)"

  # Try to get kubeconfig (throwaway copy, used only for this probe)
  echo ""
  log_info "Attempting to retrieve kubeconfig..."
  if talosctl kubeconfig --nodes "${CONTROL_PLANE_NODES[0]}" ./kubeconfig-test --force 2>&1; then
    log_success "Kubeconfig retrieved successfully"

    log_info "Kubernetes node status:"
    KUBECONFIG=./kubeconfig-test kubectl get nodes 2>&1 || log_error "Could not connect to Kubernetes"

    rm -f ./kubeconfig-test
  else
    log_warning "Could not retrieve kubeconfig"
  fi
else
  log_warning "Cluster is NOT bootstrapped yet"
  log_info ""
  log_info "Next steps:"
  log_info "1. Ensure all nodes are out of maintenance mode (see checks above)"
  log_info "2. If nodes are in maintenance mode, apply configs:"
  for NODE_IP in "${CONTROL_PLANE_NODES[@]}"; do
    log_info " talosctl apply-config --insecure --nodes $NODE_IP --file testing1/controlplane-${NODE_IP}.yaml"
  done
  log_info "3. Wait for nodes to reboot and become ready (~2-5 minutes)"
  log_info "4. Bootstrap the cluster:"
  log_info " talosctl bootstrap --nodes ${CONTROL_PLANE_NODES[0]}"
fi

echo ""
echo "=========================================="
Loading…
Reference in New Issue
Block a user