diff --git a/SPARC/analyzer.py b/SPARC/analyzer.py index 47ae7e2..ffbd68e 100644 --- a/SPARC/analyzer.py +++ b/SPARC/analyzer.py @@ -7,6 +7,7 @@ to provide company performance estimation based on patent portfolios. from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Callable +from SPARC import config from SPARC.serp_api import SERP from SPARC.llm import LLMAnalyzer from SPARC.types import Patent, CompanyAnalysisResult, BatchAnalysisResult @@ -49,28 +50,21 @@ class CompanyAnalyzer: print(f"Found {len(patents.patents)} patents. Processing...") - # Download and parse each patent + # Download, parse, and minimize patents in parallel processed_patents = [] - for idx, patent in enumerate(patents.patents, 1): - print(f"Processing patent {idx}/{len(patents.patents)}: {patent.patent_id}") - - try: - # Download PDF - patent = SERP.save_patents(patent) - - # Parse sections from PDF - sections = SERP.parse_patent_pdf(patent.pdf_path) - - # Minimize for LLM (remove bloat) - minimized_content = SERP.minimize_patent_for_llm(sections) - - processed_patents.append( - {"patent_id": patent.patent_id, "content": minimized_content} - ) - - except Exception as e: - print(f"Warning: Failed to process {patent.patent_id}: {e}") - continue + with ThreadPoolExecutor(max_workers=config.patent_thread_workers) as executor: + future_to_patent = { + executor.submit(self._process_single_patent, patent): patent + for patent in patents.patents + } + for future in as_completed(future_to_patent): + patent = future_to_patent[future] + try: + result = future.result() + if result: + processed_patents.append(result) + except Exception as e: + print(f"Warning: Failed to process {patent.patent_id}: {e}") if not processed_patents: return f"Failed to process any patents for {company_name}" @@ -115,6 +109,22 @@ class CompanyAnalyzer: except Exception as e: return f"Failed to analyze patent {patent_id}: {e}" + @staticmethod + def _process_single_patent(patent: Patent) -> dict | None: + """Download, parse, and minimize a single patent. Thread-safe. + + Returns: + Dict with patent_id and minimized content, or None on failure. + """ + try: + patent = SERP.save_patents(patent) + sections = SERP.parse_patent_pdf(patent.pdf_path) + minimized_content = SERP.minimize_patent_for_llm(sections) + return {"patent_id": patent.patent_id, "content": minimized_content} + except Exception as e: + print(f"Warning: Failed to process {patent.patent_id}: {e}") + return None + def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult: """Internal wrapper that catches exceptions and returns structured result.