perf(analyzer): parallelize patent download/parse/minimize with threads
Replace the sequential per-patent loop with a ThreadPoolExecutor (workers controlled by PATENT_THREAD_WORKERS config). Each patent is processed independently in _process_single_patent, which is thread-safe since SERP methods are stateless and operate on separate files.
This commit is contained in:
+28
-18
@@ -7,6 +7,7 @@ to provide company performance estimation based on patent portfolios.
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Callable
|
||||
|
||||
from SPARC import config
|
||||
from SPARC.serp_api import SERP
|
||||
from SPARC.llm import LLMAnalyzer
|
||||
from SPARC.types import Patent, CompanyAnalysisResult, BatchAnalysisResult
|
||||
@@ -49,28 +50,21 @@ class CompanyAnalyzer:
|
||||
|
||||
print(f"Found {len(patents.patents)} patents. Processing...")
|
||||
|
||||
# Download and parse each patent
|
||||
# Download, parse, and minimize patents in parallel
|
||||
processed_patents = []
|
||||
for idx, patent in enumerate(patents.patents, 1):
|
||||
print(f"Processing patent {idx}/{len(patents.patents)}: {patent.patent_id}")
|
||||
|
||||
with ThreadPoolExecutor(max_workers=config.patent_thread_workers) as executor:
|
||||
future_to_patent = {
|
||||
executor.submit(self._process_single_patent, patent): patent
|
||||
for patent in patents.patents
|
||||
}
|
||||
for future in as_completed(future_to_patent):
|
||||
patent = future_to_patent[future]
|
||||
try:
|
||||
# Download PDF
|
||||
patent = SERP.save_patents(patent)
|
||||
|
||||
# Parse sections from PDF
|
||||
sections = SERP.parse_patent_pdf(patent.pdf_path)
|
||||
|
||||
# Minimize for LLM (remove bloat)
|
||||
minimized_content = SERP.minimize_patent_for_llm(sections)
|
||||
|
||||
processed_patents.append(
|
||||
{"patent_id": patent.patent_id, "content": minimized_content}
|
||||
)
|
||||
|
||||
result = future.result()
|
||||
if result:
|
||||
processed_patents.append(result)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to process {patent.patent_id}: {e}")
|
||||
continue
|
||||
|
||||
if not processed_patents:
|
||||
return f"Failed to process any patents for {company_name}"
|
||||
@@ -115,6 +109,22 @@ class CompanyAnalyzer:
|
||||
except Exception as e:
|
||||
return f"Failed to analyze patent {patent_id}: {e}"
|
||||
|
||||
@staticmethod
|
||||
def _process_single_patent(patent: Patent) -> dict | None:
|
||||
"""Download, parse, and minimize a single patent. Thread-safe.
|
||||
|
||||
Returns:
|
||||
Dict with patent_id and minimized content, or None on failure.
|
||||
"""
|
||||
try:
|
||||
patent = SERP.save_patents(patent)
|
||||
sections = SERP.parse_patent_pdf(patent.pdf_path)
|
||||
minimized_content = SERP.minimize_patent_for_llm(sections)
|
||||
return {"patent_id": patent.patent_id, "content": minimized_content}
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to process {patent.patent_id}: {e}")
|
||||
return None
|
||||
|
||||
def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult:
|
||||
"""Internal wrapper that catches exceptions and returns structured result.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user