feat: implement company performance estimation orchestration

Created CompanyAnalyzer class that orchestrates the complete pipeline:
1. Retrieves patents via SERP API
2. Downloads and parses PDFs
3. Minimizes content (removes bloat)
4. Analyzes portfolio with LLM
5. Returns performance estimation

Features:
- Full company portfolio analysis
- Single patent analysis support
- Robust error handling (continues on partial failures)
- Progress logging for user visibility

Updated main.py with clean example usage demonstrating the high-level API.

Added comprehensive test suite (7 tests) covering:
- Full pipeline integration
- Error handling at each stage
- Single patent analysis
- Edge cases (no patents, all failures)

All 26 tests passing. This completes the core functionality for patent-based company performance estimation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-19 18:57:10 -05:00
parent d7cf80f02f
commit a91c3badab
3 changed files with 330 additions and 7 deletions
@@ -0,0 +1,112 @@
+"""High-level patent analysis orchestration.
+
+This module ties together patent retrieval, parsing, and LLM analysis
+to provide company performance estimation based on patent portfolios.
+"""
+
+from SPARC.serp_api import SERP
+from SPARC.llm import LLMAnalyzer
+from SPARC.types import Patent
+from typing import List
+
+
+class CompanyAnalyzer:
+    """Orchestrates end-to-end company performance analysis via patents."""
+
+    def __init__(self, anthropic_api_key: str | None = None):
+        """Initialize the company analyzer.
+
+        Args:
+          anthropic_api_key: Optional Anthropic API key. If None, loads from config.
+        """
+        self.llm_analyzer = LLMAnalyzer(api_key=anthropic_api_key)
+
+    def analyze_company(self, company_name: str) -> str:
+        """Analyze a company's performance based on their patent portfolio.
+
+        This is the main entry point that orchestrates the full pipeline:
+        1. Retrieve patents from SERP API
+        2. Download and parse each patent PDF
+        3. Minimize patent content (remove bloat)
+        4. Analyze portfolio with LLM
+        5. Return performance estimation
+
+        Args:
+          company_name: Name of the company to analyze
+
+        Returns:
+          Comprehensive analysis of company's innovation and performance outlook
+        """
+        print(f"Retrieving patents for {company_name}...")
+        patents = SERP.query(company_name)
+
+        if not patents.patents:
+            return f"No patents found for {company_name}"
+
+        print(f"Found {len(patents.patents)} patents. Processing...")
+
+        # Download and parse each patent
+        processed_patents = []
+        for idx, patent in enumerate(patents.patents, 1):
+            print(f"Processing patent {idx}/{len(patents.patents)}: {patent.patent_id}")
+
+            try:
+                # Download PDF
+                patent = SERP.save_patents(patent)
+
+                # Parse sections from PDF
+                sections = SERP.parse_patent_pdf(patent.pdf_path)
+
+                # Minimize for LLM (remove bloat)
+                minimized_content = SERP.minimize_patent_for_llm(sections)
+
+                processed_patents.append(
+                    {"patent_id": patent.patent_id, "content": minimized_content}
+                )
+
+            except Exception as e:
+                print(f"Warning: Failed to process {patent.patent_id}: {e}")
+                continue
+
+        if not processed_patents:
+            return f"Failed to process any patents for {company_name}"
+
+        print(f"Analyzing portfolio with LLM...")
+
+        # Analyze the full portfolio with LLM
+        analysis = self.llm_analyzer.analyze_patent_portfolio(
+            patents_data=processed_patents, company_name=company_name
+        )
+
+        return analysis
+
+    def analyze_single_patent(self, patent_id: str, company_name: str) -> str:
+        """Analyze a single patent by ID.
+
+        Useful for focused analysis of specific innovations.
+
+        Args:
+          patent_id: Publication ID of the patent
+          company_name: Name of the company (for context)
+
+        Returns:
+          Analysis of the specific patent's innovation quality
+        """
+        # Note: This simplified version assumes the patent PDF is already downloaded
+        # A more complete implementation would support direct patent ID lookup
+        print(f"Analyzing patent {patent_id} for {company_name}...")
+
+        patent_path = f"patents/{patent_id}.pdf"
+
+        try:
+            sections = SERP.parse_patent_pdf(patent_path)
+            minimized_content = SERP.minimize_patent_for_llm(sections)
+
+            analysis = self.llm_analyzer.analyze_patent_content(
+                patent_content=minimized_content, company_name=company_name
+            )
+
+            return analysis
+
+        except Exception as e:
+            return f"Failed to analyze patent {patent_id}: {e}"