# SPARC/SPARC/analyzer.py

"""High-level patent analysis orchestration.

This module ties together patent retrieval, parsing, and LLM analysis
to provide company performance estimation based on patent portfolios.
"""

from SPARC.serp_api import SERP
from SPARC.llm import LLMAnalyzer
from SPARC.types import Patent
from typing import List


class CompanyAnalyzer:
    """Orchestrates end-to-end company performance analysis via patents."""

    def __init__(self, openrouter_api_key: str | None = None):
        """Initialize the company analyzer.

        Args:
          openrouter_api_key: Optional OpenRouter API key. If None, loads from config.
        """
        self.llm_analyzer = LLMAnalyzer(api_key=openrouter_api_key)

    def analyze_company(self, company_name: str) -> str:
        """Analyze a company's performance based on their patent portfolio.

        This is the main entry point that orchestrates the full pipeline:
        1. Retrieve patents from SERP API
        2. Download and parse each patent PDF
        3. Minimize patent content (remove bloat)
        4. Analyze portfolio with LLM
        5. Return performance estimation

        Args:
          company_name: Name of the company to analyze

        Returns:
          Comprehensive analysis of company's innovation and performance outlook
        """
        print(f"Retrieving patents for {company_name}...")
        patents = SERP.query(company_name)

        if not patents.patents:
            return f"No patents found for {company_name}"

        print(f"Found {len(patents.patents)} patents. Processing...")

        # Download and parse each patent
        processed_patents = []
        for idx, patent in enumerate(patents.patents, 1):
            print(f"Processing patent {idx}/{len(patents.patents)}: {patent.patent_id}")

            try:
                # Download PDF
                patent = SERP.save_patents(patent)

                # Parse sections from PDF
                sections = SERP.parse_patent_pdf(patent.pdf_path)

                # Minimize for LLM (remove bloat)
                minimized_content = SERP.minimize_patent_for_llm(sections)

                processed_patents.append(
                    {"patent_id": patent.patent_id, "content": minimized_content}
                )

            except Exception as e:
                print(f"Warning: Failed to process {patent.patent_id}: {e}")
                continue

        if not processed_patents:
            return f"Failed to process any patents for {company_name}"

        print(f"Analyzing portfolio with LLM...")

        # Analyze the full portfolio with LLM
        analysis = self.llm_analyzer.analyze_patent_portfolio(
            patents_data=processed_patents, company_name=company_name
        )

        return analysis

    def analyze_single_patent(self, patent_id: str, company_name: str) -> str:
        """Analyze a single patent by ID.

        Useful for focused analysis of specific innovations.

        Args:
          patent_id: Publication ID of the patent
          company_name: Name of the company (for context)

        Returns:
          Analysis of the specific patent's innovation quality
        """
        # Note: This simplified version assumes the patent PDF is already downloaded
        # A more complete implementation would support direct patent ID lookup
        print(f"Analyzing patent {patent_id} for {company_name}...")

        patent_path = f"patents/{patent_id}.pdf"

        try:
            sections = SERP.parse_patent_pdf(patent_path)
            minimized_content = SERP.minimize_patent_for_llm(sections)

            analysis = self.llm_analyzer.analyze_patent_content(
                patent_content=minimized_content, company_name=company_name
            )

            return analysis

        except Exception as e:
            return f"Failed to analyze patent {patent_id}: {e}"