feat: implement company performance estimation orchestration

Created CompanyAnalyzer class that orchestrates the complete pipeline:
1. Retrieves patents via SERP API
2. Downloads and parses PDFs
3. Minimizes content (removes bloat)
4. Analyzes portfolio with LLM
5. Returns performance estimation

Features:
- Full company portfolio analysis
- Single patent analysis support
- Robust error handling (continues on partial failures)
- Progress logging for user visibility

Updated main.py with clean example usage demonstrating the high-level API.

Added comprehensive test suite (7 tests) covering:
- Full pipeline integration
- Error handling at each stage
- Single patent analysis
- Edge cases (no patents, all failures)

All 26 tests passing.

This completes the core functionality for patent-based company
performance estimation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2026-02-19 18:57:10 -05:00
parent d7cf80f02f
commit a91c3badab
3 changed files with 330 additions and 7 deletions
+112
View File
@@ -0,0 +1,112 @@
"""High-level patent analysis orchestration.
This module ties together patent retrieval, parsing, and LLM analysis
to provide company performance estimation based on patent portfolios.
"""
from SPARC.serp_api import SERP
from SPARC.llm import LLMAnalyzer
from SPARC.types import Patent
from typing import List
class CompanyAnalyzer:
"""Orchestrates end-to-end company performance analysis via patents."""
def __init__(self, anthropic_api_key: str | None = None):
"""Initialize the company analyzer.
Args:
anthropic_api_key: Optional Anthropic API key. If None, loads from config.
"""
self.llm_analyzer = LLMAnalyzer(api_key=anthropic_api_key)
def analyze_company(self, company_name: str) -> str:
"""Analyze a company's performance based on their patent portfolio.
This is the main entry point that orchestrates the full pipeline:
1. Retrieve patents from SERP API
2. Download and parse each patent PDF
3. Minimize patent content (remove bloat)
4. Analyze portfolio with LLM
5. Return performance estimation
Args:
company_name: Name of the company to analyze
Returns:
Comprehensive analysis of company's innovation and performance outlook
"""
print(f"Retrieving patents for {company_name}...")
patents = SERP.query(company_name)
if not patents.patents:
return f"No patents found for {company_name}"
print(f"Found {len(patents.patents)} patents. Processing...")
# Download and parse each patent
processed_patents = []
for idx, patent in enumerate(patents.patents, 1):
print(f"Processing patent {idx}/{len(patents.patents)}: {patent.patent_id}")
try:
# Download PDF
patent = SERP.save_patents(patent)
# Parse sections from PDF
sections = SERP.parse_patent_pdf(patent.pdf_path)
# Minimize for LLM (remove bloat)
minimized_content = SERP.minimize_patent_for_llm(sections)
processed_patents.append(
{"patent_id": patent.patent_id, "content": minimized_content}
)
except Exception as e:
print(f"Warning: Failed to process {patent.patent_id}: {e}")
continue
if not processed_patents:
return f"Failed to process any patents for {company_name}"
print(f"Analyzing portfolio with LLM...")
# Analyze the full portfolio with LLM
analysis = self.llm_analyzer.analyze_patent_portfolio(
patents_data=processed_patents, company_name=company_name
)
return analysis
def analyze_single_patent(self, patent_id: str, company_name: str) -> str:
"""Analyze a single patent by ID.
Useful for focused analysis of specific innovations.
Args:
patent_id: Publication ID of the patent
company_name: Name of the company (for context)
Returns:
Analysis of the specific patent's innovation quality
"""
# Note: This simplified version assumes the patent PDF is already downloaded
# A more complete implementation would support direct patent ID lookup
print(f"Analyzing patent {patent_id} for {company_name}...")
patent_path = f"patents/{patent_id}.pdf"
try:
sections = SERP.parse_patent_pdf(patent_path)
minimized_content = SERP.minimize_patent_for_llm(sections)
analysis = self.llm_analyzer.analyze_patent_content(
patent_content=minimized_content, company_name=company_name
)
return analysis
except Exception as e:
return f"Failed to analyze patent {patent_id}: {e}"