Compare commits

..

1 Commits

Author SHA1 Message Date
agent-company 3dac88ec90 docs: document patent PDF storage, add FileNotFoundError, commit lockfile
- Add docstring to analyze_single_patent explaining the PDF prerequisite
- Raise FileNotFoundError with helpful message when PDF is missing
- Add patent PDF storage section to README with Docker volume mount example
- Commit frontend/package-lock.json for reproducible builds

Closes leeworks-agents/SPARC#15
Closes leeworks-agents/SPARC#17

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 04:17:09 +00:00
6 changed files with 4788 additions and 51 deletions
+15
View File
@@ -54,6 +54,21 @@ docker-compose up -d
# - API Docs: http://localhost:8000/docs # - API Docs: http://localhost:8000/docs
``` ```
#### Patent PDF Storage
The API stores downloaded patent PDFs in a `patents/` directory. With Docker
Compose, this directory is bind-mounted from the host (`./patents:/app/patents`)
so that PDFs persist when the container is restarted or re-created.
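If you deploy to a different environment, make sure the `patents/` directory is
backed by persistent storage. Without persistent storage, previously downloaded
PDFs are lost whenever the container is replaced and must be downloaded again on
the next analysis.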
```yaml
# docker-compose.yml excerpt
volumes:
- ./patents:/app/patents
```
### NixOS ### NixOS
```bash ```bash
+35 -25
View File
@@ -5,13 +5,10 @@ to provide company performance estimation based on patent portfolios.
""" """
import hashlib import hashlib
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable from typing import Callable
from SPARC import config from SPARC import config
logger = logging.getLogger(__name__)
from SPARC.database import DatabaseClient from SPARC.database import DatabaseClient
from SPARC.serp_api import SERP from SPARC.serp_api import SERP
from SPARC.llm import LLMAnalyzer from SPARC.llm import LLMAnalyzer
@@ -55,13 +52,13 @@ class CompanyAnalyzer:
query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest() query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest()
cached_ids = self.db.get_cached_serp_query(query_hash) cached_ids = self.db.get_cached_serp_query(query_hash)
if cached_ids is not None: if cached_ids is not None:
logger.info("Using cached SERP results for %s (%d patents)", company_name, len(cached_ids)) print(f"Using cached SERP results for {company_name} ({len(cached_ids)} patents)")
patents = Patents(patents=[ patents = Patents(patents=[
Patent(patent_id=pid, pdf_link="") Patent(patent_id=pid, pdf_link="")
for pid in cached_ids for pid in cached_ids
]) ])
else: else:
logger.info("Retrieving patents for %s...", company_name) print(f"Retrieving patents for {company_name}...")
patents = SERP.query(company_name) patents = SERP.query(company_name)
# Cache the SERP results # Cache the SERP results
if patents.patents: if patents.patents:
@@ -69,13 +66,12 @@ class CompanyAnalyzer:
company_name=company_name, company_name=company_name,
query_hash=query_hash, query_hash=query_hash,
patent_ids=[p.patent_id for p in patents.patents], patent_ids=[p.patent_id for p in patents.patents],
ttl_hours=config.serp_cache_ttl_hours,
) )
if not patents.patents: if not patents.patents:
return f"No patents found for {company_name}" return f"No patents found for {company_name}"
logger.info("Found %d patents. Processing...", len(patents.patents)) print(f"Found {len(patents.patents)} patents. Processing...")
# Download, parse, and minimize patents in parallel # Download, parse, and minimize patents in parallel
processed_patents = [] processed_patents = []
@@ -91,12 +87,12 @@ class CompanyAnalyzer:
if result: if result:
processed_patents.append(result) processed_patents.append(result)
except Exception as e: except Exception as e:
logger.warning("Failed to process %s: %s", patent.patent_id, e) print(f"Warning: Failed to process {patent.patent_id}: {e}")
if not processed_patents: if not processed_patents:
return f"Failed to process any patents for {company_name}" return f"Failed to process any patents for {company_name}"
logger.info("Analyzing portfolio with LLM...") print(f"Analyzing portfolio with LLM...")
# Analyze the full portfolio with LLM # Analyze the full portfolio with LLM
analysis = self.llm_analyzer.analyze_patent_portfolio( analysis = self.llm_analyzer.analyze_patent_portfolio(
@@ -108,21 +104,33 @@ class CompanyAnalyzer:
def analyze_single_patent(self, patent_id: str, company_name: str) -> str: def analyze_single_patent(self, patent_id: str, company_name: str) -> str:
"""Analyze a single patent by ID. """Analyze a single patent by ID.
Useful for focused analysis of specific innovations. Prerequisite:
The patent PDF must already exist at ``patents/{patent_id}.pdf``
before calling this method. PDFs are downloaded automatically when
using the batch analysis pipeline (``analyze_company`` or the
``/analyze/batch`` API endpoint). For standalone usage, download
the PDF manually or call ``SERP.save_patents()`` first.
Args: Args:
patent_id: Publication ID of the patent patent_id: Publication ID of the patent (e.g. "US-11234567-B2")
company_name: Name of the company (for context) company_name: Name of the company (for context)
Returns: Returns:
Analysis of the specific patent's innovation quality Analysis of the specific patent's innovation quality
Raises:
FileNotFoundError: If the patent PDF is not found at the expected path.
""" """
# Note: This simplified version assumes the patent PDF is already downloaded import os
# A more complete implementation would support direct patent ID lookup
logger.info("Analyzing patent %s for %s...", patent_id, company_name)
patent_path = f"patents/{patent_id}.pdf" patent_path = f"patents/{patent_id}.pdf"
if not os.path.exists(patent_path):
raise FileNotFoundError(
f"Patent PDF not found at '{patent_path}'. "
f"Download the PDF first using SERP.save_patents() or the batch analysis pipeline."
)
try: try:
sections = SERP.parse_patent_pdf(patent_path) sections = SERP.parse_patent_pdf(patent_path)
minimized_content = SERP.minimize_patent_for_llm(sections) minimized_content = SERP.minimize_patent_for_llm(sections)
@@ -133,6 +141,8 @@ class CompanyAnalyzer:
return analysis return analysis
except FileNotFoundError:
raise
except Exception as e: except Exception as e:
return f"Failed to analyze patent {patent_id}: {e}" return f"Failed to analyze patent {patent_id}: {e}"
@@ -173,7 +183,7 @@ class CompanyAnalyzer:
return {"patent_id": patent.patent_id, "content": minimized_content} return {"patent_id": patent.patent_id, "content": minimized_content}
except Exception as e: except Exception as e:
logger.warning("Failed to process %s: %s", patent.patent_id, e) print(f"Warning: Failed to process {patent.patent_id}: {e}")
return None return None
def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult: def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult:
@@ -244,7 +254,7 @@ class CompanyAnalyzer:
results: list[CompanyAnalysisResult] = [] results: list[CompanyAnalysisResult] = []
total = len(companies) total = len(companies)
logger.info("Starting batch analysis of %d companies...", total) print(f"Starting batch analysis of {total} companies...")
with ThreadPoolExecutor(max_workers=max_workers) as executor: with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_company = { future_to_company = {
@@ -261,8 +271,8 @@ class CompanyAnalyzer:
result = future.result() result = future.result()
results.append(result) results.append(result)
status = "OK" if result.success else "FAIL" status = "" if result.success else ""
logger.info("[%d/%d] %s %s", completed, total, status, company) print(f"[{completed}/{total}] {status} {company}")
if progress_callback: if progress_callback:
progress_callback(company, completed, total) progress_callback(company, completed, total)
@@ -277,12 +287,12 @@ class CompanyAnalyzer:
error=str(e), error=str(e),
) )
) )
logger.error("[%d/%d] FAIL %s: %s", completed, total, company, e) print(f"[{completed}/{total}] ✗ {company}: {e}")
successful = sum(1 for r in results if r.success) successful = sum(1 for r in results if r.success)
failed = total - successful failed = total - successful
logger.info("Batch complete: %d succeeded, %d failed", successful, failed) print(f"\nBatch complete: {successful} succeeded, {failed} failed")
return BatchAnalysisResult( return BatchAnalysisResult(
results=results, results=results,
@@ -308,20 +318,20 @@ class CompanyAnalyzer:
results: list[CompanyAnalysisResult] = [] results: list[CompanyAnalysisResult] = []
total = len(companies) total = len(companies)
logger.info("Starting sequential analysis of %d companies...", total) print(f"Starting sequential analysis of {total} companies...")
for idx, company in enumerate(companies, 1): for idx, company in enumerate(companies, 1):
logger.info("[%d/%d] Analyzing %s...", idx, total, company) print(f"\n[{idx}/{total}] Analyzing {company}...")
result = self._analyze_company_safe(company) result = self._analyze_company_safe(company)
results.append(result) results.append(result)
status = "OK" if result.success else "FAIL" status = "" if result.success else ""
logger.info("[%d/%d] %s %s", idx, total, status, company) print(f"[{idx}/{total}] {status} {company}")
successful = sum(1 for r in results if r.success) successful = sum(1 for r in results if r.success)
failed = total - successful failed = total - successful
logger.info("Batch complete: %d succeeded, %d failed", successful, failed) print(f"\nBatch complete: {successful} succeeded, {failed} failed")
return BatchAnalysisResult( return BatchAnalysisResult(
results=results, results=results,
+1 -16
View File
@@ -2,20 +2,11 @@
Loads environment variables from .env file for API keys and other secrets. Loads environment variables from .env file for API keys and other secrets.
""" """
import logging from dotenv import load_dotenv
import os import os
from dotenv import load_dotenv
load_dotenv() load_dotenv()
# Logging configuration
log_level = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
level=getattr(logging, log_level, logging.INFO),
format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
# SerpAPI key for patent search # SerpAPI key for patent search
api_key = os.getenv("API_KEY") api_key = os.getenv("API_KEY")
@@ -39,12 +30,6 @@ use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes"
patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90")) patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90"))
patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5")) patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5"))
# LLM model to use via OpenRouter (e.g. "anthropic/claude-3.5-sonnet", "openai/gpt-4o")
model = os.getenv("MODEL", "anthropic/claude-3.5-sonnet")
# SERP cache TTL in hours (how long cached search results are considered fresh)
serp_cache_ttl_hours = int(os.getenv("SERP_CACHE_TTL_HOURS", "24"))
# Root path for running behind a reverse proxy (e.g., "/api" when served at /api/) # Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
# This ensures OpenAPI docs work correctly when accessed via the proxy # This ensures OpenAPI docs work correctly when accessed via the proxy
root_path = os.getenv("ROOT_PATH", "") root_path = os.getenv("ROOT_PATH", "")
+8 -9
View File
@@ -1,14 +1,9 @@
"""LLM integration for patent analysis using OpenRouter.""" """LLM integration for patent analysis using OpenRouter."""
import logging
from typing import Dict
from openai import OpenAI from openai import OpenAI
from SPARC import config from SPARC import config
from SPARC.database import DatabaseClient from SPARC.database import DatabaseClient
from typing import Dict
logger = logging.getLogger(__name__)
class LLMAnalyzer: class LLMAnalyzer:
@@ -25,7 +20,7 @@ class LLMAnalyzer:
""" """
self.test_mode = test_mode self.test_mode = test_mode
self.use_cache = use_cache if use_cache is not None else config.use_cache self.use_cache = use_cache if use_cache is not None else config.use_cache
self.model = config.model self.model = "anthropic/claude-3.5-sonnet"
# Always initialize database client for storage and caching # Always initialize database client for storage and caching
self.db_client = DatabaseClient(config.database_url) self.db_client = DatabaseClient(config.database_url)
@@ -64,7 +59,11 @@ Patent Content:
Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage.""" Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""
if self.test_mode: if self.test_mode:
logger.debug("TEST MODE - Prompt that would be sent to LLM:\n%s", prompt) print("=" * 80)
print("TEST MODE - Prompt that would be sent to LLM:")
print("=" * 80)
print(prompt)
print("=" * 80)
return "[TEST MODE - No API call made]" return "[TEST MODE - No API call made]"
# Check cache first # Check cache first
@@ -166,7 +165,7 @@ Patent Portfolio:
Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook.""" Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""
if self.test_mode: if self.test_mode:
logger.debug("TEST MODE - Portfolio prompt:\n%s", prompt) print(prompt)
return "[TEST MODE]" return "[TEST MODE]"
metadata = { metadata = {
+1 -1
View File
@@ -4,7 +4,7 @@ from datetime import datetime
@dataclass @dataclass
class Patent: class Patent:
patent_id: str patent_id: int
pdf_link: str pdf_link: str
pdf_path: str | None = None pdf_path: str | None = None
summary: dict | None = None summary: dict | None = None
+4728
View File
File diff suppressed because it is too large Load Diff