From ecc2c37bcd586437b67938642709f3569e3e0da2 Mon Sep 17 00:00:00 2001
From: agent-company
Date: Thu, 26 Mar 2026 10:08:34 +0000
Subject: [PATCH] fix: auto-download patent PDF in analyze_single_patent before reading

When the PDF is not on disk, analyze_single_patent now looks up the
cached PDF link from the database and downloads it automatically.
If no link is cached, a clear FileNotFoundError is raised.

Also adds a GET /analyze/patent/{patent_id} API endpoint that exposes
this functionality and returns 404 when the PDF cannot be obtained.

Closes leeworks-agents/SPARC#36

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 SPARC/analyzer.py | 32 +++++++++++++++++++++-----------
 SPARC/api.py      | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/SPARC/analyzer.py b/SPARC/analyzer.py
index 996558a..c55803b 100644
--- a/SPARC/analyzer.py
+++ b/SPARC/analyzer.py
@@ -108,12 +108,10 @@ class CompanyAnalyzer:
     def analyze_single_patent(self, patent_id: str, company_name: str) -> str:
         """Analyze a single patent by ID.
 
-        Prerequisite:
-            The patent PDF must already exist at ``patents/{patent_id}.pdf``
-            before calling this method. PDFs are downloaded automatically when
-            using the batch analysis pipeline (``analyze_company`` or the
-            ``/analyze/batch`` API endpoint). For standalone usage, download
-            the PDF manually or call ``SERP.save_patents()`` first.
+        If the patent PDF is not already on disk, this method attempts to
+        download it automatically by looking up the PDF link in the database
+        cache. If the link is not cached either, a ``FileNotFoundError`` is
+        raised with instructions on how to obtain the PDF.
 
         Args:
             patent_id: Publication ID of the patent (e.g. "US-11234567-B2")
@@ -123,7 +121,7 @@ class CompanyAnalyzer:
             Analysis of the specific patent's innovation quality
 
         Raises:
-            FileNotFoundError: If the patent PDF is not found at the expected path.
+            FileNotFoundError: If the patent PDF cannot be found or downloaded.
         """
         import os
         logger.info("Analyzing patent %s for %s...", patent_id, company_name)
@@ -131,10 +129,22 @@ class CompanyAnalyzer:
         patent_path = f"patents/{patent_id}.pdf"
 
         if not os.path.exists(patent_path):
-            raise FileNotFoundError(
-                f"Patent PDF not found at '{patent_path}'. "
-                f"Download the PDF first using SERP.save_patents() or the batch analysis pipeline."
-            )
+            # Attempt to download the PDF automatically from cached metadata
+            cached = self.db.get_cached_patent(patent_id)
+            pdf_link = cached.get("pdf_link") if cached else None
+
+            if pdf_link:
+                logger.info("PDF not on disk; downloading %s from cached link", patent_id)
+                patent = SERP.save_patents(
+                    Patent(patent_id=patent_id, pdf_link=pdf_link)
+                )
+                patent_path = patent.pdf_path
+            else:
+                raise FileNotFoundError(
+                    f"Patent PDF not found at '{patent_path}' and no download link is "
+                    f"cached for '{patent_id}'. Run a company analysis first to populate "
+                    f"the cache, or call SERP.save_patents() with the patent's PDF link."
+                )
 
         try:
             sections = SERP.parse_patent_pdf(patent_path)
diff --git a/SPARC/api.py b/SPARC/api.py
index a78c132..e4b7d42 100644
--- a/SPARC/api.py
+++ b/SPARC/api.py
@@ -429,6 +429,38 @@ async def analyze_company(
     return _convert_result(result)
 
 
+@app.get(
+    "/analyze/patent/{patent_id}",
+    tags=["Analysis"],
+)
+async def analyze_single_patent(
+    patent_id: str,
+    company_name: str = Query(description="Company name for analysis context"),
+    _: UserResponse = Depends(get_current_user),
+):
+    """Analyze a single patent by its publication ID.
+
+    If the patent PDF is not already cached locally, the system will attempt
+    to download it automatically from a previously cached link. If no link
+    is available, a 404 error is returned.
+
+    Args:
+        patent_id: Patent publication ID (e.g. "US-11234567-B2")
+        company_name: Company name for analysis context
+
+    Returns:
+        Analysis text for the patent
+    """
+    if not _analyzer:
+        raise HTTPException(status_code=503, detail="Analyzer not initialized")
+
+    try:
+        analysis = _analyzer.analyze_single_patent(patent_id, company_name)
+        return {"patent_id": patent_id, "company_name": company_name, "analysis": analysis}
+    except FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+
+
 @app.post(
     "/analyze/batch",
     response_model=BatchAnalysisResponse,