Compare commits

..

1 Commits

Author SHA1 Message Date
agent-company c4341ca8dc feat: configurable LLM model, SERP cache TTL, structured logging, fix patent_id type
- Make LLM model configurable via MODEL env var, default anthropic/claude-3.5-sonnet (#12)
- Expose SERP cache TTL as SERP_CACHE_TTL_HOURS env var, default 24 hours (#13)
- Fix Patent.patent_id type annotation from int to str in types.py (#14)
- Replace all print() calls with structured logging in analyzer.py and llm.py (#11)
- Add LOG_LEVEL config with basicConfig setup in config.py
- Add model and serp_cache_ttl_hours to config.py

Closes leeworks-agents/SPARC#11
Closes leeworks-agents/SPARC#12
Closes leeworks-agents/SPARC#13
Closes leeworks-agents/SPARC#14

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 04:12:00 +00:00
8 changed files with 81 additions and 245 deletions
+21 -17
View File
@@ -5,10 +5,13 @@ to provide company performance estimation based on patent portfolios.
""" """
import hashlib import hashlib
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable from typing import Callable
from SPARC import config from SPARC import config
logger = logging.getLogger(__name__)
from SPARC.database import DatabaseClient from SPARC.database import DatabaseClient
from SPARC.serp_api import SERP from SPARC.serp_api import SERP
from SPARC.llm import LLMAnalyzer from SPARC.llm import LLMAnalyzer
@@ -52,13 +55,13 @@ class CompanyAnalyzer:
query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest() query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest()
cached_ids = self.db.get_cached_serp_query(query_hash) cached_ids = self.db.get_cached_serp_query(query_hash)
if cached_ids is not None: if cached_ids is not None:
print(f"Using cached SERP results for {company_name} ({len(cached_ids)} patents)") logger.info("Using cached SERP results for %s (%d patents)", company_name, len(cached_ids))
patents = Patents(patents=[ patents = Patents(patents=[
Patent(patent_id=pid, pdf_link="") Patent(patent_id=pid, pdf_link="")
for pid in cached_ids for pid in cached_ids
]) ])
else: else:
print(f"Retrieving patents for {company_name}...") logger.info("Retrieving patents for %s...", company_name)
patents = SERP.query(company_name) patents = SERP.query(company_name)
# Cache the SERP results # Cache the SERP results
if patents.patents: if patents.patents:
@@ -66,12 +69,13 @@ class CompanyAnalyzer:
company_name=company_name, company_name=company_name,
query_hash=query_hash, query_hash=query_hash,
patent_ids=[p.patent_id for p in patents.patents], patent_ids=[p.patent_id for p in patents.patents],
ttl_hours=config.serp_cache_ttl_hours,
) )
if not patents.patents: if not patents.patents:
return f"No patents found for {company_name}" return f"No patents found for {company_name}"
print(f"Found {len(patents.patents)} patents. Processing...") logger.info("Found %d patents. Processing...", len(patents.patents))
# Download, parse, and minimize patents in parallel # Download, parse, and minimize patents in parallel
processed_patents = [] processed_patents = []
@@ -87,12 +91,12 @@ class CompanyAnalyzer:
if result: if result:
processed_patents.append(result) processed_patents.append(result)
except Exception as e: except Exception as e:
print(f"Warning: Failed to process {patent.patent_id}: {e}") logger.warning("Failed to process %s: %s", patent.patent_id, e)
if not processed_patents: if not processed_patents:
return f"Failed to process any patents for {company_name}" return f"Failed to process any patents for {company_name}"
print(f"Analyzing portfolio with LLM...") logger.info("Analyzing portfolio with LLM...")
# Analyze the full portfolio with LLM # Analyze the full portfolio with LLM
analysis = self.llm_analyzer.analyze_patent_portfolio( analysis = self.llm_analyzer.analyze_patent_portfolio(
@@ -115,7 +119,7 @@ class CompanyAnalyzer:
""" """
# Note: This simplified version assumes the patent PDF is already downloaded # Note: This simplified version assumes the patent PDF is already downloaded
# A more complete implementation would support direct patent ID lookup # A more complete implementation would support direct patent ID lookup
print(f"Analyzing patent {patent_id} for {company_name}...") logger.info("Analyzing patent %s for %s...", patent_id, company_name)
patent_path = f"patents/{patent_id}.pdf" patent_path = f"patents/{patent_id}.pdf"
@@ -169,7 +173,7 @@ class CompanyAnalyzer:
return {"patent_id": patent.patent_id, "content": minimized_content} return {"patent_id": patent.patent_id, "content": minimized_content}
except Exception as e: except Exception as e:
print(f"Warning: Failed to process {patent.patent_id}: {e}") logger.warning("Failed to process %s: %s", patent.patent_id, e)
return None return None
def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult: def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult:
@@ -240,7 +244,7 @@ class CompanyAnalyzer:
results: list[CompanyAnalysisResult] = [] results: list[CompanyAnalysisResult] = []
total = len(companies) total = len(companies)
print(f"Starting batch analysis of {total} companies...") logger.info("Starting batch analysis of %d companies...", total)
with ThreadPoolExecutor(max_workers=max_workers) as executor: with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_company = { future_to_company = {
@@ -257,8 +261,8 @@ class CompanyAnalyzer:
result = future.result() result = future.result()
results.append(result) results.append(result)
status = "" if result.success else "" status = "OK" if result.success else "FAIL"
print(f"[{completed}/{total}] {status} {company}") logger.info("[%d/%d] %s %s", completed, total, status, company)
if progress_callback: if progress_callback:
progress_callback(company, completed, total) progress_callback(company, completed, total)
@@ -273,12 +277,12 @@ class CompanyAnalyzer:
error=str(e), error=str(e),
) )
) )
print(f"[{completed}/{total}] ✗ {company}: {e}") logger.error("[%d/%d] FAIL %s: %s", completed, total, company, e)
successful = sum(1 for r in results if r.success) successful = sum(1 for r in results if r.success)
failed = total - successful failed = total - successful
print(f"\nBatch complete: {successful} succeeded, {failed} failed") logger.info("Batch complete: %d succeeded, %d failed", successful, failed)
return BatchAnalysisResult( return BatchAnalysisResult(
results=results, results=results,
@@ -304,20 +308,20 @@ class CompanyAnalyzer:
results: list[CompanyAnalysisResult] = [] results: list[CompanyAnalysisResult] = []
total = len(companies) total = len(companies)
print(f"Starting sequential analysis of {total} companies...") logger.info("Starting sequential analysis of %d companies...", total)
for idx, company in enumerate(companies, 1): for idx, company in enumerate(companies, 1):
print(f"\n[{idx}/{total}] Analyzing {company}...") logger.info("[%d/%d] Analyzing %s...", idx, total, company)
result = self._analyze_company_safe(company) result = self._analyze_company_safe(company)
results.append(result) results.append(result)
status = "" if result.success else "" status = "OK" if result.success else "FAIL"
print(f"[{idx}/{total}] {status} {company}") logger.info("[%d/%d] %s %s", idx, total, status, company)
successful = sum(1 for r in results if r.success) successful = sum(1 for r in results if r.success)
failed = total - successful failed = total - successful
print(f"\nBatch complete: {successful} succeeded, {failed} failed") logger.info("Batch complete: %d succeeded, %d failed", successful, failed)
return BatchAnalysisResult( return BatchAnalysisResult(
results=results, results=results,
+33 -69
View File
@@ -114,7 +114,8 @@ class AnalyticsResponse(BaseModel):
period_days: int period_days: int
# Job counter for generating unique IDs (the actual state is in PostgreSQL) # In-memory job storage (for demo; production would use Redis/DB)
_jobs: dict[str, JobStatus] = {}
_job_counter = 0 _job_counter = 0
@@ -147,19 +148,9 @@ _analyzer: CompanyAnalyzer | None = None
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
"""Initialize resources on startup, clean up on shutdown.""" """Initialize resources on startup."""
global _analyzer global _analyzer
_analyzer = CompanyAnalyzer() _analyzer = CompanyAnalyzer()
# Mark any jobs that were running/pending before the restart as failed
from SPARC.database import DatabaseClient
_db = DatabaseClient(config.database_url)
_db.connect()
_db.initialize_schema()
stale = _db.mark_stale_jobs_failed()
if stale:
import logging
logging.getLogger(__name__).warning("Marked %d stale jobs as failed on startup", stale)
_db.close()
yield yield
# Cleanup if needed # Cleanup if needed
_analyzer = None _analyzer = None
@@ -431,52 +422,20 @@ async def analyze_companies_batch(
return _convert_batch_result(result) return _convert_batch_result(result)
def _get_job_db() -> "DatabaseClient":
"""Get a DatabaseClient for job persistence."""
from SPARC.database import DatabaseClient
db = DatabaseClient(config.database_url)
return db
def _job_row_to_status(row: dict) -> JobStatus:
"""Convert a database job row to a JobStatus model."""
import json as _json
result = None
if row.get("result_json"):
result_data = row["result_json"]
if isinstance(result_data, str):
result_data = _json.loads(result_data)
result = BatchAnalysisResponse(**result_data)
return JobStatus(
job_id=row["job_id"],
status=row["status"],
progress=row["progress"],
total_companies=row["total_companies"],
completed_companies=row["completed_companies"],
result=result,
error=row.get("error"),
)
def _run_batch_job(job_id: str, companies: list[str], max_workers: int): def _run_batch_job(job_id: str, companies: list[str], max_workers: int):
"""Background task for batch analysis.""" """Background task for batch analysis."""
import json as _json global _jobs, _analyzer
global _analyzer
db = _get_job_db()
if not _analyzer: if not _analyzer:
db.update_job(job_id, status="failed", error="Analyzer not initialized") _jobs[job_id].status = "failed"
_jobs[job_id].error = "Analyzer not initialized"
return return
db.update_job(job_id, status="running") _jobs[job_id].status = "running"
def progress_callback(company: str, completed: int, total: int): def progress_callback(company: str, completed: int, total: int):
db.update_job( _jobs[job_id].completed_companies = completed
job_id, _jobs[job_id].progress = int((completed / total) * 100)
completed_companies=completed,
progress=int((completed / total) * 100),
)
try: try:
result = _analyzer.analyze_companies( result = _analyzer.analyze_companies(
@@ -484,15 +443,12 @@ def _run_batch_job(job_id: str, companies: list[str], max_workers: int):
max_workers=max_workers, max_workers=max_workers,
progress_callback=progress_callback, progress_callback=progress_callback,
) )
batch_response = _convert_batch_result(result) _jobs[job_id].status = "completed"
db.update_job( _jobs[job_id].progress = 100
job_id, _jobs[job_id].result = _convert_batch_result(result)
status="completed",
progress=100,
result_json=_json.dumps(batch_response.model_dump(), default=str),
)
except Exception as e: except Exception as e:
db.update_job(job_id, status="failed", error=str(e)) _jobs[job_id].status = "failed"
_jobs[job_id].error = str(e)
@app.post("/analyze/batch/async", response_model=JobStatus, tags=["Analysis"]) @app.post("/analyze/batch/async", response_model=JobStatus, tags=["Analysis"])
@@ -517,14 +473,19 @@ async def analyze_companies_async(
_job_counter += 1 _job_counter += 1
job_id = f"job_{_job_counter}_{datetime.now().strftime('%Y%m%d%H%M%S')}" job_id = f"job_{_job_counter}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
db = _get_job_db() _jobs[job_id] = JobStatus(
job_row = db.create_job(job_id=job_id, total_companies=len(request.companies)) job_id=job_id,
status="pending",
progress=0,
total_companies=len(request.companies),
completed_companies=0,
)
background_tasks.add_task( background_tasks.add_task(
_run_batch_job, job_id, request.companies, request.max_workers _run_batch_job, job_id, request.companies, request.max_workers
) )
return _job_row_to_status(job_row) return _jobs[job_id]
@app.get("/jobs/{job_id}", response_model=JobStatus, tags=["Jobs"]) @app.get("/jobs/{job_id}", response_model=JobStatus, tags=["Jobs"])
@@ -540,13 +501,10 @@ async def get_job_status(
Returns: Returns:
Current job status including progress and results when complete Current job status including progress and results when complete
""" """
db = _get_job_db() if job_id not in _jobs:
job_row = db.get_job(job_id)
if not job_row:
raise HTTPException(status_code=404, detail=f"Job {job_id} not found") raise HTTPException(status_code=404, detail=f"Job {job_id} not found")
return _job_row_to_status(job_row) return _jobs[job_id]
@app.get("/jobs", response_model=list[JobStatus], tags=["Jobs"]) @app.get("/jobs", response_model=list[JobStatus], tags=["Jobs"])
@@ -567,6 +525,12 @@ async def list_jobs(
Returns: Returns:
List of job statuses List of job statuses
""" """
db = _get_job_db() jobs = list(_jobs.values())
job_rows = db.list_jobs(status=status, limit=limit)
return [_job_row_to_status(row) for row in job_rows] if status:
jobs = [j for j in jobs if j.status == status]
# Return most recent first
jobs.sort(key=lambda j: j.job_id, reverse=True)
return jobs[:limit]
+16 -1
View File
@@ -2,11 +2,20 @@
Loads environment variables from .env file for API keys and other secrets. Loads environment variables from .env file for API keys and other secrets.
""" """
from dotenv import load_dotenv import logging
import os import os
from dotenv import load_dotenv
load_dotenv() load_dotenv()
# Logging configuration
log_level = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
level=getattr(logging, log_level, logging.INFO),
format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
# SerpAPI key for patent search # SerpAPI key for patent search
api_key = os.getenv("API_KEY") api_key = os.getenv("API_KEY")
@@ -30,6 +39,12 @@ use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes"
patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90")) patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90"))
patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5")) patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5"))
# LLM model to use via OpenRouter (e.g. "anthropic/claude-3.5-sonnet", "openai/gpt-4o")
model = os.getenv("MODEL", "anthropic/claude-3.5-sonnet")
# SERP cache TTL in hours (how long cached search results are considered fresh)
serp_cache_ttl_hours = int(os.getenv("SERP_CACHE_TTL_HOURS", "24"))
# Root path for running behind a reverse proxy (e.g., "/api" when served at /api/) # Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
# This ensures OpenAPI docs work correctly when accessed via the proxy # This ensures OpenAPI docs work correctly when accessed via the proxy
root_path = os.getenv("ROOT_PATH", "") root_path = os.getenv("ROOT_PATH", "")
-145
View File
@@ -171,26 +171,6 @@ class DatabaseClient:
ON serp_queries(query_hash) ON serp_queries(query_hash)
""") """)
# Create jobs table for persisting async batch job state
cursor.execute("""
CREATE TABLE IF NOT EXISTS jobs (
job_id VARCHAR(128) PRIMARY KEY,
status VARCHAR(20) NOT NULL DEFAULT 'pending',
progress INTEGER NOT NULL DEFAULT 0,
total_companies INTEGER NOT NULL DEFAULT 0,
completed_companies INTEGER NOT NULL DEFAULT 0,
result_json JSONB,
error TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status
ON jobs(status)
""")
self.conn.commit() self.conn.commit()
@staticmethod @staticmethod
@@ -482,131 +462,6 @@ class DatabaseClient:
) )
conn.commit() conn.commit()
# Job Persistence Methods
def create_job(
self,
job_id: str,
total_companies: int,
) -> Dict:
"""Create a new job record.
Args:
job_id: Unique job identifier
total_companies: Number of companies in the batch
Returns:
Job dict
"""
with self.get_conn() as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
cursor.execute(
"""
INSERT INTO jobs (job_id, status, progress, total_companies, completed_companies)
VALUES (%s, 'pending', 0, %s, 0)
RETURNING *
""",
(job_id, total_companies),
)
job = cursor.fetchone()
conn.commit()
return dict(job)
def update_job(
self,
job_id: str,
status: Optional[str] = None,
progress: Optional[int] = None,
completed_companies: Optional[int] = None,
result_json: Optional[str] = None,
error: Optional[str] = None,
) -> Optional[Dict]:
"""Update a job's state.
Only non-None fields are updated.
"""
updates = []
params = []
if status is not None:
updates.append("status = %s")
params.append(status)
if progress is not None:
updates.append("progress = %s")
params.append(progress)
if completed_companies is not None:
updates.append("completed_companies = %s")
params.append(completed_companies)
if result_json is not None:
updates.append("result_json = %s")
params.append(result_json)
if error is not None:
updates.append("error = %s")
params.append(error)
if not updates:
return self.get_job(job_id)
updates.append("updated_at = CURRENT_TIMESTAMP")
params.append(job_id)
with self.get_conn() as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
cursor.execute(
f"UPDATE jobs SET {', '.join(updates)} WHERE job_id = %s RETURNING *",
params,
)
job = cursor.fetchone()
conn.commit()
return dict(job) if job else None
def get_job(self, job_id: str) -> Optional[Dict]:
"""Get a job by ID."""
with self.get_conn() as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
cursor.execute("SELECT * FROM jobs WHERE job_id = %s", (job_id,))
job = cursor.fetchone()
return dict(job) if job else None
def list_jobs(
self,
status: Optional[str] = None,
limit: int = 10,
) -> List[Dict]:
"""List jobs, optionally filtered by status."""
query = "SELECT * FROM jobs"
params: list = []
if status:
query += " WHERE status = %s"
params.append(status)
query += " ORDER BY created_at DESC LIMIT %s"
params.append(limit)
with self.get_conn() as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
cursor.execute(query, params)
return [dict(row) for row in cursor.fetchall()]
def mark_stale_jobs_failed(self) -> int:
"""Mark any jobs in 'running' or 'pending' state as 'failed'.
Called at startup to clean up jobs that were interrupted by a restart.
Returns:
Number of jobs marked as failed.
"""
with self.get_conn() as conn:
with conn.cursor() as cursor:
cursor.execute(
"""
UPDATE jobs SET status = 'failed', error = 'Interrupted by server restart',
updated_at = CURRENT_TIMESTAMP
WHERE status IN ('running', 'pending')
"""
)
count = cursor.rowcount
conn.commit()
return count
# User Authentication Methods # User Authentication Methods
@staticmethod @staticmethod
+9 -8
View File
@@ -1,9 +1,14 @@
"""LLM integration for patent analysis using OpenRouter.""" """LLM integration for patent analysis using OpenRouter."""
import logging
from typing import Dict
from openai import OpenAI from openai import OpenAI
from SPARC import config from SPARC import config
from SPARC.database import DatabaseClient from SPARC.database import DatabaseClient
from typing import Dict
logger = logging.getLogger(__name__)
class LLMAnalyzer: class LLMAnalyzer:
@@ -20,7 +25,7 @@ class LLMAnalyzer:
""" """
self.test_mode = test_mode self.test_mode = test_mode
self.use_cache = use_cache if use_cache is not None else config.use_cache self.use_cache = use_cache if use_cache is not None else config.use_cache
self.model = "anthropic/claude-3.5-sonnet" self.model = config.model
# Always initialize database client for storage and caching # Always initialize database client for storage and caching
self.db_client = DatabaseClient(config.database_url) self.db_client = DatabaseClient(config.database_url)
@@ -59,11 +64,7 @@ Patent Content:
Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage.""" Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""
if self.test_mode: if self.test_mode:
print("=" * 80) logger.debug("TEST MODE - Prompt that would be sent to LLM:\n%s", prompt)
print("TEST MODE - Prompt that would be sent to LLM:")
print("=" * 80)
print(prompt)
print("=" * 80)
return "[TEST MODE - No API call made]" return "[TEST MODE - No API call made]"
# Check cache first # Check cache first
@@ -165,7 +166,7 @@ Patent Portfolio:
Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook.""" Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""
if self.test_mode: if self.test_mode:
print(prompt) logger.debug("TEST MODE - Portfolio prompt:\n%s", prompt)
return "[TEST MODE]" return "[TEST MODE]"
metadata = { metadata = {
+1 -1
View File
@@ -4,7 +4,7 @@ from datetime import datetime
@dataclass @dataclass
class Patent: class Patent:
patent_id: int patent_id: str
pdf_link: str pdf_link: str
pdf_path: str | None = None pdf_path: str | None = None
summary: dict | None = None summary: dict | None = None
-3
View File
@@ -40,9 +40,6 @@ def main():
print("\nTables created:") print("\nTables created:")
print(" - llm_messages: Stores all LLM prompts and responses") print(" - llm_messages: Stores all LLM prompts and responses")
print(" - users: Stores user accounts") print(" - users: Stores user accounts")
print(" - jobs: Stores async batch job state")
print(" - patents: Patent PDF cache")
print(" - serp_queries: SERP query result cache")
print("\nIndexes created:") print("\nIndexes created:")
print(" - idx_messages_timestamp: For time-based queries") print(" - idx_messages_timestamp: For time-based queries")
print(" - idx_messages_company: For company-specific queries") print(" - idx_messages_company: For company-specific queries")
+1 -1
View File
@@ -5,7 +5,7 @@ from datetime import datetime
from unittest.mock import Mock, patch from unittest.mock import Mock, patch
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
from SPARC.api import app from SPARC.api import app, _analyzer, _jobs
from SPARC.types import CompanyAnalysisResult, BatchAnalysisResult from SPARC.types import CompanyAnalysisResult, BatchAnalysisResult