From 1a297eb60b85a1119765254ecdb48f6c55afc704 Mon Sep 17 00:00:00 2001 From: 0xWheatyz Date: Tue, 24 Mar 2026 14:35:24 -0400 Subject: [PATCH] feat(analyzer): integrate DB patent and SERP query caching Before querying SERP API, check serp_queries cache (24h TTL). Before downloading/parsing each patent, check patents table for cached minimized_content. Store results after processing so repeated analyses skip all network I/O and PDF parsing entirely. --- SPARC/analyzer.py | 59 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/SPARC/analyzer.py b/SPARC/analyzer.py index ffbd68e..0215b26 100644 --- a/SPARC/analyzer.py +++ b/SPARC/analyzer.py @@ -4,25 +4,31 @@ This module ties together patent retrieval, parsing, and LLM analysis to provide company performance estimation based on patent portfolios. """ +import hashlib from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Callable from SPARC import config +from SPARC.database import DatabaseClient from SPARC.serp_api import SERP from SPARC.llm import LLMAnalyzer -from SPARC.types import Patent, CompanyAnalysisResult, BatchAnalysisResult +from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult class CompanyAnalyzer: """Orchestrates end-to-end company performance analysis via patents.""" - def __init__(self, openrouter_api_key: str | None = None): + def __init__(self, openrouter_api_key: str | None = None, db_client: DatabaseClient | None = None): """Initialize the company analyzer. Args: openrouter_api_key: Optional OpenRouter API key. If None, loads from config. + db_client: Optional DatabaseClient for patent caching. Created automatically if None. """ self.llm_analyzer = LLMAnalyzer(api_key=openrouter_api_key) + self.db = db_client or DatabaseClient(config.database_url) + self.db.connect() + self.db.initialize_schema() def analyze_company(self, company_name: str, patents: "Patents | None" = None) -> str: """Analyze a company's performance based on their patent portfolio. @@ -42,8 +48,25 @@ class CompanyAnalyzer: Comprehensive analysis of company's innovation and performance outlook """ if patents is None: - print(f"Retrieving patents for {company_name}...") - patents = SERP.query(company_name) + # Check SERP query cache first + query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest() + cached_ids = self.db.get_cached_serp_query(query_hash) + if cached_ids is not None: + print(f"Using cached SERP results for {company_name} ({len(cached_ids)} patents)") + patents = Patents(patents=[ + Patent(patent_id=pid, pdf_link="") + for pid in cached_ids + ]) + else: + print(f"Retrieving patents for {company_name}...") + patents = SERP.query(company_name) + # Cache the SERP results + if patents.patents: + self.db.store_serp_query( + company_name=company_name, + query_hash=query_hash, + patent_ids=[p.patent_id for p in patents.patents], + ) if not patents.patents: return f"No patents found for {company_name}" @@ -54,7 +77,7 @@ class CompanyAnalyzer: processed_patents = [] with ThreadPoolExecutor(max_workers=config.patent_thread_workers) as executor: future_to_patent = { - executor.submit(self._process_single_patent, patent): patent + executor.submit(self._process_single_patent, patent, company_name, self.db): patent for patent in patents.patents } for future in as_completed(future_to_patent): @@ -110,16 +133,40 @@ class CompanyAnalyzer: return f"Failed to analyze patent {patent_id}: {e}" @staticmethod - def _process_single_patent(patent: Patent) -> dict | None: + def _process_single_patent( + patent: Patent, + company_name: str = "", + db: DatabaseClient | None = None, + ) -> dict | None: """Download, parse, and minimize a single patent. Thread-safe. + Checks DB cache before downloading. Stores results after processing. + Returns: Dict with patent_id and minimized content, or None on failure. """ try: + # Check DB cache first + if db: + cached = db.get_cached_patent(patent.patent_id) + if cached and cached.get("minimized_content"): + return {"patent_id": patent.patent_id, "content": cached["minimized_content"]} + + # Full processing: download, parse, minimize patent = SERP.save_patents(patent) sections = SERP.parse_patent_pdf(patent.pdf_path) minimized_content = SERP.minimize_patent_for_llm(sections) + + # Store in DB cache + if db: + db.store_patent( + patent_id=patent.patent_id, + company_name=company_name, + pdf_link=patent.pdf_link, + raw_sections=sections, + minimized_content=minimized_content, + ) + return {"patent_id": patent.patent_id, "content": minimized_content} except Exception as e: print(f"Warning: Failed to process {patent.patent_id}: {e}")