feat(jobs): persist async batch job state in PostgreSQL

- Add jobs table to database schema (job_id, status, progress, result_json, etc.)
- Add DatabaseClient methods: create_job, update_job, get_job, list_jobs
- Add mark_stale_jobs_failed() called at startup to handle interrupted jobs
- Refactor _run_batch_job and job endpoints to read/write from PostgreSQL
- Remove in-memory _jobs dict; job state now survives API restarts
- Update init_database.py to list all tables in output

Closes leeworks-agents/SPARC#8

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 04:22:57 +00:00
parent 6105ba7793
commit 96d5d27b17
4 changed files with 218 additions and 34 deletions
@@ -114,8 +114,7 @@ class AnalyticsResponse(BaseModel):
    period_days: int


-# In-memory job storage (for demo; production would use Redis/DB)
-_jobs: dict[str, JobStatus] = {}
+# Job counter for generating unique IDs (the actual state is in PostgreSQL)
 _job_counter = 0


@@ -148,9 +147,19 @@ _analyzer: CompanyAnalyzer | None = None

@asynccontextmanager
 async def lifespan(app: FastAPI):
-    """Initialize resources on startup."""
+    """Initialize resources on startup, clean up on shutdown."""
    global _analyzer
    _analyzer = CompanyAnalyzer()
+    # Mark any jobs that were running/pending before the restart as failed
+    from SPARC.database import DatabaseClient
+    _db = DatabaseClient(config.database_url)
+    _db.connect()
+    _db.initialize_schema()
+    stale = _db.mark_stale_jobs_failed()
+    if stale:
+        import logging
+        logging.getLogger(__name__).warning("Marked %d stale jobs as failed on startup", stale)
+    _db.close()
    yield
    # Cleanup if needed
    _analyzer = None
@@ -422,20 +431,52 @@ async def analyze_companies_batch(
    return _convert_batch_result(result)


+def _get_job_db() -> "DatabaseClient":
+    """Get a DatabaseClient for job persistence."""
+    from SPARC.database import DatabaseClient
+    db = DatabaseClient(config.database_url)
+    return db
+
+
+def _job_row_to_status(row: dict) -> JobStatus:
+    """Convert a database job row to a JobStatus model."""
+    import json as _json
+    result = None
+    if row.get("result_json"):
+        result_data = row["result_json"]
+        if isinstance(result_data, str):
+            result_data = _json.loads(result_data)
+        result = BatchAnalysisResponse(**result_data)
+    return JobStatus(
+        job_id=row["job_id"],
+        status=row["status"],
+        progress=row["progress"],
+        total_companies=row["total_companies"],
+        completed_companies=row["completed_companies"],
+        result=result,
+        error=row.get("error"),
+    )
+
+
 def _run_batch_job(job_id: str, companies: list[str], max_workers: int):
    """Background task for batch analysis."""
-    global _jobs, _analyzer
+    import json as _json
+    global _analyzer
+
+    db = _get_job_db()

    if not _analyzer:
-        _jobs[job_id].status = "failed"
-        _jobs[job_id].error = "Analyzer not initialized"
+        db.update_job(job_id, status="failed", error="Analyzer not initialized")
        return

-    _jobs[job_id].status = "running"
+    db.update_job(job_id, status="running")

    def progress_callback(company: str, completed: int, total: int):
-        _jobs[job_id].completed_companies = completed
-        _jobs[job_id].progress = int((completed / total) * 100)
+        db.update_job(
+            job_id,
+            completed_companies=completed,
+            progress=int((completed / total) * 100),
+        )

    try:
        result = _analyzer.analyze_companies(
@@ -443,12 +484,15 @@ def _run_batch_job(job_id: str, companies: list[str], max_workers: int):
            max_workers=max_workers,
            progress_callback=progress_callback,
        )
-        _jobs[job_id].status = "completed"
-        _jobs[job_id].progress = 100
-        _jobs[job_id].result = _convert_batch_result(result)
+        batch_response = _convert_batch_result(result)
+        db.update_job(
+            job_id,
+            status="completed",
+            progress=100,
+            result_json=_json.dumps(batch_response.model_dump(), default=str),
+        )
    except Exception as e:
-        _jobs[job_id].status = "failed"
-        _jobs[job_id].error = str(e)
+        db.update_job(job_id, status="failed", error=str(e))


@app.post("/analyze/batch/async", response_model=JobStatus, tags=["Analysis"])
@@ -473,19 +517,14 @@ async def analyze_companies_async(
    _job_counter += 1
    job_id = f"job_{_job_counter}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

-    _jobs[job_id] = JobStatus(
-        job_id=job_id,
-        status="pending",
-        progress=0,
-        total_companies=len(request.companies),
-        completed_companies=0,
-    )
+    db = _get_job_db()
+    job_row = db.create_job(job_id=job_id, total_companies=len(request.companies))

    background_tasks.add_task(
        _run_batch_job, job_id, request.companies, request.max_workers
    )

-    return _jobs[job_id]
+    return _job_row_to_status(job_row)


@app.get("/jobs/{job_id}", response_model=JobStatus, tags=["Jobs"])
@@ -501,10 +540,13 @@ async def get_job_status(
    Returns:
        Current job status including progress and results when complete
    """
-    if job_id not in _jobs:
+    db = _get_job_db()
+    job_row = db.get_job(job_id)
+
+    if not job_row:
        raise HTTPException(status_code=404, detail=f"Job {job_id} not found")

-    return _jobs[job_id]
+    return _job_row_to_status(job_row)


@app.get("/jobs", response_model=list[JobStatus], tags=["Jobs"])
@@ -525,12 +567,6 @@ async def list_jobs(
    Returns:
        List of job statuses
    """
-    jobs = list(_jobs.values())
-
-    if status:
-        jobs = [j for j in jobs if j.status == status]
-
-    # Return most recent first
-    jobs.sort(key=lambda j: j.job_id, reverse=True)
-
-    return jobs[:limit]
+    db = _get_job_db()
+    job_rows = db.list_jobs(status=status, limit=limit)
+    return [_job_row_to_status(row) for row in job_rows]