feat: implement company performance estimation orchestration
Created CompanyAnalyzer class that orchestrates the complete pipeline: 1. Retrieves patents via SERP API 2. Downloads and parses PDFs 3. Minimizes content (removes bloat) 4. Analyzes portfolio with LLM 5. Returns performance estimation Features: - Full company portfolio analysis - Single patent analysis support - Robust error handling (continues on partial failures) - Progress logging for user visibility Updated main.py with clean example usage demonstrating the high-level API. Added comprehensive test suite (7 tests) covering: - Full pipeline integration - Error handling at each stage - Single patent analysis - Edge cases (no patents, all failures) All 26 tests passing. This completes the core functionality for patent-based company performance estimation. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
d7cf80f02f
commit
a91c3badab
112
SPARC/analyzer.py
Normal file
112
SPARC/analyzer.py
Normal file
@ -0,0 +1,112 @@
|
||||
"""High-level patent analysis orchestration.
|
||||
|
||||
This module ties together patent retrieval, parsing, and LLM analysis
|
||||
to provide company performance estimation based on patent portfolios.
|
||||
"""
|
||||
|
||||
from SPARC.serp_api import SERP
|
||||
from SPARC.llm import LLMAnalyzer
|
||||
from SPARC.types import Patent
|
||||
from typing import List
|
||||
|
||||
|
||||
class CompanyAnalyzer:
    """Orchestrates end-to-end company performance analysis via patents.

    Patent search, PDF download, PDF parsing and content minimization are
    delegated to ``SERP``; the analysis itself is delegated to an
    ``LLMAnalyzer`` instance. Progress and warnings are reported via ``print``.
    """

    def __init__(self, anthropic_api_key: str | None = None):
        """Initialize the company analyzer.

        Args:
            anthropic_api_key: Optional Anthropic API key. If None, loads from config.
        """
        self.llm_analyzer = LLMAnalyzer(api_key=anthropic_api_key)

    @staticmethod
    def _minimized_content_from_pdf(pdf_path: str):
        """Parse the patent PDF at ``pdf_path`` and return its minimized content.

        Shared by the portfolio and single-patent paths so both feed the LLM
        content produced by the same parse -> minimize sequence.
        """
        sections = SERP.parse_patent_pdf(pdf_path)
        return SERP.minimize_patent_for_llm(sections)

    def analyze_company(self, company_name: str) -> str:
        """Analyze a company's performance based on their patent portfolio.

        This is the main entry point that orchestrates the full pipeline:
        1. Retrieve patents from SERP API
        2. Download and parse each patent PDF
        3. Minimize patent content (remove bloat)
        4. Analyze portfolio with LLM
        5. Return performance estimation

        Patents that fail at any processing step are skipped with a printed
        warning so that one bad document does not abort the whole run.

        Args:
            company_name: Name of the company to analyze

        Returns:
            The result of ``LLMAnalyzer.analyze_patent_portfolio`` for the
            successfully processed patents, or a plain message string when no
            patents were found or none could be processed.
        """
        print(f"Retrieving patents for {company_name}...")
        patents = SERP.query(company_name)

        if not patents.patents:
            return f"No patents found for {company_name}"

        total = len(patents.patents)
        print(f"Found {total} patents. Processing...")

        # Download, parse and minimize each patent.
        processed_patents = []
        for idx, patent in enumerate(patents.patents, 1):
            print(f"Processing patent {idx}/{total}: {patent.patent_id}")

            try:
                # Keep the loop variable intact; the download step returns the
                # patent object that carries the local PDF path.
                saved = SERP.save_patents(patent)
                content = self._minimized_content_from_pdf(saved.pdf_path)
                processed_patents.append(
                    {"patent_id": saved.patent_id, "content": content}
                )
            except Exception as e:
                # Deliberate best-effort: report the failure and move on to
                # the next patent instead of failing the whole portfolio.
                print(f"Warning: Failed to process {patent.patent_id}: {e}")
                continue

        if not processed_patents:
            return f"Failed to process any patents for {company_name}"

        print("Analyzing portfolio with LLM...")

        # Analyze the full portfolio with LLM
        return self.llm_analyzer.analyze_patent_portfolio(
            patents_data=processed_patents, company_name=company_name
        )

    def analyze_single_patent(self, patent_id: str, company_name: str) -> str:
        """Analyze a single patent by ID.

        Useful for focused analysis of specific innovations.

        Args:
            patent_id: Publication ID of the patent
            company_name: Name of the company (for context)

        Returns:
            The result of ``LLMAnalyzer.analyze_patent_content`` for the
            patent, or a "Failed to analyze patent ..." message string if any
            step raises.
        """
        # Note: This simplified version assumes the patent PDF is already downloaded
        # to patents/<patent_id>.pdf. A more complete implementation would support
        # direct patent ID lookup.
        print(f"Analyzing patent {patent_id} for {company_name}...")

        patent_path = f"patents/{patent_id}.pdf"

        try:
            minimized_content = self._minimized_content_from_pdf(patent_path)
            return self.llm_analyzer.analyze_patent_content(
                patent_content=minimized_content, company_name=company_name
            )
        except Exception as e:
            # Errors are reported through the return value rather than raised.
            return f"Failed to analyze patent {patent_id}: {e}"
|
||||
47
main.py
47
main.py
@ -1,10 +1,43 @@
|
||||
from SPARC.serp_api import SERP
|
||||
"""SPARC - Semiconductor Patent & Analytics Report Core
|
||||
|
||||
patents = SERP.query("nvidia")
|
||||
Example usage of the company performance analyzer.
|
||||
|
||||
for patent in patents.patents:
|
||||
patent = SERP.save_patents(patent)
|
||||
patent.summary = SERP.parse_patent_pdf(patent.pdf_path)
|
||||
print(patent.summary)
|
||||
Before running:
|
||||
1. Create a .env file with:
|
||||
API_KEY=your_serpapi_key
|
||||
ANTHROPIC_API_KEY=your_anthropic_key
|
||||
|
||||
print(patents)
|
||||
2. Run: python main.py
|
||||
"""
|
||||
|
||||
from SPARC.analyzer import CompanyAnalyzer
|
||||
|
||||
|
||||
def main():
    """Run the example: analyze one company's patent portfolio and print the result."""
    # Build the analyzer first (no key passed, so LLMAnalyzer receives api_key=None).
    analyzer = CompanyAnalyzer()

    # analyze_company will:
    # 1. Retrieve patents from SERP API
    # 2. Download and parse patent PDFs
    # 3. Minimize content (remove bloat)
    # 4. Analyze with Claude to estimate performance
    company_name = "nvidia"

    rule = "=" * 70

    def show_banner(title):
        """Print *title* framed by separator rules with surrounding blank lines."""
        print(f"\n{rule}")
        print(title)
        print(f"{rule}\n")

    show_banner(f"SPARC Patent Analysis - {company_name.upper()}")

    analysis = analyzer.analyze_company(company_name)

    show_banner("ANALYSIS RESULTS")
    print(analysis)
    print(f"\n{rule}\n")


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
|
||||
178
tests/test_analyzer.py
Normal file
178
tests/test_analyzer.py
Normal file
@ -0,0 +1,178 @@
|
||||
"""Tests for the high-level company analyzer orchestration."""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
from SPARC.analyzer import CompanyAnalyzer
|
||||
from SPARC.types import Patent, Patents
|
||||
|
||||
|
||||
class TestCompanyAnalyzer:
    """Exercise CompanyAnalyzer with SERP and LLMAnalyzer patched out."""

    def test_analyzer_initialization(self, mocker):
        """The supplied API key is forwarded to LLMAnalyzer exactly once."""
        llm_cls = mocker.patch("SPARC.analyzer.LLMAnalyzer")

        analyzer = CompanyAnalyzer(anthropic_api_key="test-key")

        llm_cls.assert_called_once_with(api_key="test-key")

    def test_analyze_company_full_pipeline(self, mocker):
        """A single patent flows through query, save, parse, minimize and LLM."""
        # Patch every collaborator used by the pipeline.
        query = mocker.patch("SPARC.analyzer.SERP.query")
        save = mocker.patch("SPARC.analyzer.SERP.save_patents")
        parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        llm_cls = mocker.patch("SPARC.analyzer.LLMAnalyzer")

        # Canned results for each stage.
        sample = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        query.return_value = Patents(patents=[sample])

        sample.pdf_path = "patents/US123.pdf"
        save.return_value = sample

        parse.return_value = {
            "abstract": "Test abstract",
            "claims": "Test claims",
        }

        minimize.return_value = "Minimized content"

        llm = Mock()
        llm.analyze_patent_portfolio.return_value = "Strong innovation portfolio"
        llm_cls.return_value = llm

        # Exercise the pipeline.
        outcome = CompanyAnalyzer().analyze_company("TestCorp")

        # Each stage ran once and the LLM result is returned as-is.
        assert outcome == "Strong innovation portfolio"
        query.assert_called_once_with("TestCorp")
        save.assert_called_once()
        parse.assert_called_once_with("patents/US123.pdf")
        minimize.assert_called_once()
        llm.analyze_patent_portfolio.assert_called_once()

        # Inspect the payload handed to the LLM.
        payload = llm.analyze_patent_portfolio.call_args.kwargs["patents_data"]
        assert len(payload) == 1
        first = payload[0]
        assert first["patent_id"] == "US123"
        assert first["content"] == "Minimized content"

    def test_analyze_company_no_patents_found(self, mocker):
        """An empty search result yields the 'No patents found' message."""
        query = mocker.patch("SPARC.analyzer.SERP.query")
        query.return_value = Patents(patents=[])
        mocker.patch("SPARC.analyzer.LLMAnalyzer")

        outcome = CompanyAnalyzer().analyze_company("UnknownCorp")

        assert outcome == "No patents found for UnknownCorp"

    def test_analyze_company_handles_processing_errors(self, mocker):
        """A failing patent is skipped while the remaining one is analyzed."""
        query = mocker.patch("SPARC.analyzer.SERP.query")
        save = mocker.patch("SPARC.analyzer.SERP.save_patents")
        parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        llm_cls = mocker.patch("SPARC.analyzer.LLMAnalyzer")

        # Two patents come back from the search.
        good = Patent(patent_id="US123", pdf_link="http://example.com/1.pdf")
        bad = Patent(patent_id="US456", pdf_link="http://example.com/2.pdf")
        query.return_value = Patents(patents=[good, bad])

        # The first patent is the one that downloads successfully.
        good.pdf_path = "patents/US123.pdf"

        def fake_save(p):
            """Succeed for US123; raise for any other patent."""
            if p.patent_id != "US123":
                raise Exception("Download failed")
            p.pdf_path = "patents/US123.pdf"
            return p

        save.side_effect = fake_save

        parse.return_value = {"abstract": "Test"}
        minimize.return_value = "Content"

        llm = Mock()
        llm.analyze_patent_portfolio.return_value = "Analysis result"
        llm_cls.return_value = llm

        outcome = CompanyAnalyzer().analyze_company("TestCorp")

        # The run still succeeds on the strength of the one good patent.
        assert outcome == "Analysis result"

        # Only the successfully processed patent reached the LLM.
        payload = llm.analyze_patent_portfolio.call_args.kwargs["patents_data"]
        assert len(payload) == 1
        assert payload[0]["patent_id"] == "US123"

    def test_analyze_company_all_patents_fail(self, mocker):
        """When every patent fails, the 'Failed to process any' message is returned."""
        query = mocker.patch("SPARC.analyzer.SERP.query")
        save = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mocker.patch("SPARC.analyzer.LLMAnalyzer")

        only_patent = Patent(patent_id="US123", pdf_link="http://example.com/1.pdf")
        query.return_value = Patents(patents=[only_patent])

        # The download step raises for every patent.
        save.side_effect = Exception("Processing error")

        outcome = CompanyAnalyzer().analyze_company("TestCorp")

        assert outcome == "Failed to process any patents for TestCorp"

    def test_analyze_single_patent(self, mocker):
        """A single patent is parsed from patents/<id>.pdf and sent to the LLM."""
        parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        llm_cls = mocker.patch("SPARC.analyzer.LLMAnalyzer")

        parse.return_value = {"abstract": "Test abstract"}
        minimize.return_value = "Minimized content"

        llm = Mock()
        llm.analyze_patent_content.return_value = "Innovative patent analysis"
        llm_cls.return_value = llm

        outcome = CompanyAnalyzer().analyze_single_patent("US123", "TestCorp")

        assert outcome == "Innovative patent analysis"
        parse.assert_called_once_with("patents/US123.pdf")
        llm.analyze_patent_content.assert_called_once_with(
            patent_content="Minimized content", company_name="TestCorp"
        )

    def test_analyze_single_patent_error_handling(self, mocker):
        """A parsing error is reported in the returned message, not raised."""
        parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        mocker.patch("SPARC.analyzer.LLMAnalyzer")

        parse.side_effect = FileNotFoundError("PDF not found")

        outcome = CompanyAnalyzer().analyze_single_patent("US999", "TestCorp")

        assert "Failed to analyze patent US999" in outcome
        assert "PDF not found" in outcome
|
||||
Loading…
Reference in New Issue
Block a user