feat: add multi-company batch processing

- Add CompanyAnalysisResult and BatchAnalysisResult dataclasses
- Implement analyze_companies() for concurrent batch analysis
- Implement analyze_companies_sequential() for rate-limited scenarios
- Add progress callback support for monitoring batch jobs
- Include 5 new tests for batch processing functionality
- Fix pre-existing test mock issue in test_llm.py

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2026-03-12 23:23:07 -04:00
parent 84fd0bef32
commit 1c6d903301
5 changed files with 388 additions and 7 deletions
+30 -2
View File
@@ -15,7 +15,8 @@ SPARC automatically collects, parses, and analyzes patents from companies to pro
- **Content Minimization**: Removes verbose descriptions to reduce LLM token usage - **Content Minimization**: Removes verbose descriptions to reduce LLM token usage
- **AI Analysis**: Uses Claude 3.5 Sonnet via OpenRouter to analyze innovation quality and market potential - **AI Analysis**: Uses Claude 3.5 Sonnet via OpenRouter to analyze innovation quality and market potential
- **Portfolio Analysis**: Evaluates multiple patents holistically for comprehensive insights - **Portfolio Analysis**: Evaluates multiple patents holistically for comprehensive insights
- **Robust Testing**: 26 tests covering all major functionality - **Batch Processing**: Analyze multiple companies concurrently with progress tracking
- **Robust Testing**: 31 tests covering all major functionality
## Architecture ## Architecture
@@ -99,6 +100,33 @@ result = analyzer.analyze_single_patent(
) )
``` ```
### Multi-Company Batch Analysis
```python
from SPARC.analyzer import CompanyAnalyzer
analyzer = CompanyAnalyzer()
# Analyze multiple companies concurrently (default 3 workers)
batch_result = analyzer.analyze_companies(
["nvidia", "amd", "intel", "qualcomm"],
max_workers=3
)
# Access results
print(f"Analyzed: {batch_result.total_companies}")
print(f"Successful: {batch_result.successful}")
print(f"Failed: {batch_result.failed}")
for result in batch_result.results:
if result.success:
print(f"{result.company_name}: {result.patent_count} patents")
print(result.analysis)
# Or use sequential processing (safer for rate limits)
batch_result = analyzer.analyze_companies_sequential(["nvidia", "amd"])
```
## Running Tests ## Running Tests
```bash ```bash
@@ -130,7 +158,7 @@ pytest tests/ --cov=SPARC --cov-report=term-missing
- [X] Extract and minimize patent content - [X] Extract and minimize patent content
- [X] LLM integration for analysis - [X] LLM integration for analysis
- [X] Company performance estimation - [X] Company performance estimation
- [ ] Multi-company batch processing - [X] Multi-company batch processing
- [ ] FastAPI web service wrapper - [ ] FastAPI web service wrapper
- [X] Docker containerization - [X] Docker containerization
- [X] Results persistence (database) - [X] Results persistence (database)
+155 -2
View File
@@ -4,10 +4,12 @@ This module ties together patent retrieval, parsing, and LLM analysis
to provide company performance estimation based on patent portfolios. to provide company performance estimation based on patent portfolios.
""" """
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable
from SPARC.serp_api import SERP from SPARC.serp_api import SERP
from SPARC.llm import LLMAnalyzer from SPARC.llm import LLMAnalyzer
from SPARC.types import Patent from SPARC.types import Patent, CompanyAnalysisResult, BatchAnalysisResult
from typing import List
class CompanyAnalyzer: class CompanyAnalyzer:
@@ -110,3 +112,154 @@ class CompanyAnalyzer:
except Exception as e: except Exception as e:
return f"Failed to analyze patent {patent_id}: {e}" return f"Failed to analyze patent {patent_id}: {e}"
def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult:
"""Internal wrapper that catches exceptions and returns structured result.
Args:
company_name: Name of the company to analyze
Returns:
CompanyAnalysisResult with success/failure status
"""
try:
patents = SERP.query(company_name)
patent_count = len(patents.patents) if patents.patents else 0
analysis = self.analyze_company(company_name)
# Check if analysis indicates failure
if analysis.startswith("No patents found") or analysis.startswith(
"Failed to process"
):
return CompanyAnalysisResult(
company_name=company_name,
analysis=analysis,
patent_count=patent_count,
success=False,
error=analysis,
)
return CompanyAnalysisResult(
company_name=company_name,
analysis=analysis,
patent_count=patent_count,
success=True,
)
except Exception as e:
return CompanyAnalysisResult(
company_name=company_name,
analysis="",
patent_count=0,
success=False,
error=str(e),
)
def analyze_companies(
self,
companies: list[str],
max_workers: int = 3,
progress_callback: Callable[[str, int, int], None] | None = None,
) -> BatchAnalysisResult:
"""Analyze multiple companies' patent portfolios in batch.
Processes companies concurrently for improved performance while
respecting API rate limits.
Args:
companies: List of company names to analyze
max_workers: Maximum concurrent analyses (default 3 to avoid rate limits)
progress_callback: Optional callback(company_name, completed, total)
Returns:
BatchAnalysisResult containing all individual results and summary stats
"""
results: list[CompanyAnalysisResult] = []
total = len(companies)
print(f"Starting batch analysis of {total} companies...")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_company = {
executor.submit(self._analyze_company_safe, company): company
for company in companies
}
completed = 0
for future in as_completed(future_to_company):
company = future_to_company[future]
completed += 1
try:
result = future.result()
results.append(result)
status = "" if result.success else ""
print(f"[{completed}/{total}] {status} {company}")
if progress_callback:
progress_callback(company, completed, total)
except Exception as e:
results.append(
CompanyAnalysisResult(
company_name=company,
analysis="",
patent_count=0,
success=False,
error=str(e),
)
)
print(f"[{completed}/{total}] ✗ {company}: {e}")
successful = sum(1 for r in results if r.success)
failed = total - successful
print(f"\nBatch complete: {successful} succeeded, {failed} failed")
return BatchAnalysisResult(
results=results,
total_companies=total,
successful=successful,
failed=failed,
)
def analyze_companies_sequential(
self, companies: list[str]
) -> BatchAnalysisResult:
"""Analyze multiple companies sequentially (safer for rate limits).
Use this when you want more control over API rate limiting or
when debugging issues.
Args:
companies: List of company names to analyze
Returns:
BatchAnalysisResult containing all individual results
"""
results: list[CompanyAnalysisResult] = []
total = len(companies)
print(f"Starting sequential analysis of {total} companies...")
for idx, company in enumerate(companies, 1):
print(f"\n[{idx}/{total}] Analyzing {company}...")
result = self._analyze_company_safe(company)
results.append(result)
status = "" if result.success else ""
print(f"[{idx}/{total}] {status} {company}")
successful = sum(1 for r in results if r.success)
failed = total - successful
print(f"\nBatch complete: {successful} succeeded, {failed} failed")
return BatchAnalysisResult(
results=results,
total_companies=total,
successful=successful,
failed=failed,
)
+25 -1
View File
@@ -1,4 +1,5 @@
from dataclasses import dataclass from dataclasses import dataclass, field
from datetime import datetime
@dataclass @dataclass
@@ -12,3 +13,26 @@ class Patent:
@dataclass @dataclass
class Patents: class Patents:
patents: list[Patent] patents: list[Patent]
@dataclass
class CompanyAnalysisResult:
"""Result of analyzing a single company's patent portfolio."""
company_name: str
analysis: str
patent_count: int
success: bool
error: str | None = None
timestamp: datetime = field(default_factory=datetime.now)
@dataclass
class BatchAnalysisResult:
"""Result of batch analyzing multiple companies."""
results: list[CompanyAnalysisResult]
total_companies: int
successful: int
failed: int
timestamp: datetime = field(default_factory=datetime.now)
+176 -2
View File
@@ -1,9 +1,9 @@
"""Tests for the high-level company analyzer orchestration.""" """Tests for the high-level company analyzer orchestration."""
import pytest import pytest
from unittest.mock import Mock, patch from unittest.mock import Mock, patch, call
from SPARC.analyzer import CompanyAnalyzer from SPARC.analyzer import CompanyAnalyzer
from SPARC.types import Patent, Patents from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
class TestCompanyAnalyzer: class TestCompanyAnalyzer:
@@ -176,3 +176,177 @@ class TestCompanyAnalyzer:
assert "Failed to analyze patent US999" in result assert "Failed to analyze patent US999" in result
assert "PDF not found" in result assert "PDF not found" in result
class TestBatchProcessing:
"""Test multi-company batch processing functionality."""
def test_analyze_companies_success(self, mocker):
"""Test batch analysis of multiple companies."""
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
mock_llm = mocker.patch("SPARC.analyzer.LLMAnalyzer")
# Setup mock returns
def query_side_effect(company):
patent = Patent(
patent_id=f"US-{company}",
pdf_link=f"http://example.com/{company}.pdf",
)
return Patents(patents=[patent])
mock_query.side_effect = query_side_effect
def save_side_effect(patent):
patent.pdf_path = f"patents/{patent.patent_id}.pdf"
return patent
mock_save.side_effect = save_side_effect
mock_parse.return_value = {"abstract": "Test"}
mock_minimize.return_value = "Content"
mock_llm_instance = Mock()
mock_llm_instance.analyze_patent_portfolio.return_value = "Analysis result"
mock_llm.return_value = mock_llm_instance
analyzer = CompanyAnalyzer()
result = analyzer.analyze_companies(["CompanyA", "CompanyB"], max_workers=2)
assert isinstance(result, BatchAnalysisResult)
assert result.total_companies == 2
assert result.successful == 2
assert result.failed == 0
assert len(result.results) == 2
def test_analyze_companies_with_failures(self, mocker):
"""Test batch analysis handles partial failures."""
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
mocker.patch("SPARC.analyzer.LLMAnalyzer")
def query_side_effect(company):
if company == "FailCorp":
return Patents(patents=[])
patent = Patent(
patent_id=f"US-{company}",
pdf_link=f"http://example.com/{company}.pdf",
)
return Patents(patents=[patent])
mock_query.side_effect = query_side_effect
analyzer = CompanyAnalyzer()
result = analyzer.analyze_companies(["GoodCorp", "FailCorp"], max_workers=1)
assert result.total_companies == 2
assert result.failed >= 1 # At least FailCorp should fail
# Find the failed result
fail_result = next(r for r in result.results if r.company_name == "FailCorp")
assert fail_result.success is False
def test_analyze_companies_sequential(self, mocker):
"""Test sequential batch analysis."""
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
mock_llm = mocker.patch("SPARC.analyzer.LLMAnalyzer")
def query_side_effect(company):
patent = Patent(
patent_id=f"US-{company}",
pdf_link=f"http://example.com/{company}.pdf",
)
return Patents(patents=[patent])
mock_query.side_effect = query_side_effect
def save_side_effect(patent):
patent.pdf_path = f"patents/{patent.patent_id}.pdf"
return patent
mock_save.side_effect = save_side_effect
mock_parse.return_value = {"abstract": "Test"}
mock_minimize.return_value = "Content"
mock_llm_instance = Mock()
mock_llm_instance.analyze_patent_portfolio.return_value = "Analysis"
mock_llm.return_value = mock_llm_instance
analyzer = CompanyAnalyzer()
result = analyzer.analyze_companies_sequential(["Corp1", "Corp2", "Corp3"])
assert result.total_companies == 3
assert len(result.results) == 3
def test_analyze_companies_progress_callback(self, mocker):
"""Test that progress callback is invoked correctly."""
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
mock_llm = mocker.patch("SPARC.analyzer.LLMAnalyzer")
def query_side_effect(company):
patent = Patent(
patent_id=f"US-{company}",
pdf_link=f"http://example.com/{company}.pdf",
)
return Patents(patents=[patent])
mock_query.side_effect = query_side_effect
def save_side_effect(patent):
patent.pdf_path = f"patents/{patent.patent_id}.pdf"
return patent
mock_save.side_effect = save_side_effect
mock_parse.return_value = {"abstract": "Test"}
mock_minimize.return_value = "Content"
mock_llm_instance = Mock()
mock_llm_instance.analyze_patent_portfolio.return_value = "Analysis"
mock_llm.return_value = mock_llm_instance
callback = Mock()
analyzer = CompanyAnalyzer()
analyzer.analyze_companies(["A", "B"], max_workers=1, progress_callback=callback)
assert callback.call_count == 2
def test_company_analysis_result_structure(self, mocker):
"""Test CompanyAnalysisResult has correct structure."""
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
mock_llm = mocker.patch("SPARC.analyzer.LLMAnalyzer")
patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
mock_query.return_value = Patents(patents=[patent])
def save_side_effect(p):
p.pdf_path = "patents/US123.pdf"
return p
mock_save.side_effect = save_side_effect
mock_parse.return_value = {"abstract": "Test"}
mock_minimize.return_value = "Content"
mock_llm_instance = Mock()
mock_llm_instance.analyze_patent_portfolio.return_value = "Strong innovation"
mock_llm.return_value = mock_llm_instance
analyzer = CompanyAnalyzer()
result = analyzer.analyze_companies(["TestCorp"], max_workers=1)
assert len(result.results) == 1
company_result = result.results[0]
assert company_result.company_name == "TestCorp"
assert company_result.analysis == "Strong innovation"
assert company_result.patent_count == 1
assert company_result.success is True
assert company_result.error is None
assert company_result.timestamp is not None
+2
View File
@@ -25,6 +25,8 @@ class TestLLMAnalyzer:
mock_openai = mocker.patch("SPARC.llm.OpenAI") mock_openai = mocker.patch("SPARC.llm.OpenAI")
mock_config = mocker.patch("SPARC.llm.config") mock_config = mocker.patch("SPARC.llm.config")
mock_config.openrouter_api_key = "config-key-456" mock_config.openrouter_api_key = "config-key-456"
mock_config.use_database = False
mock_config.database_url = "postgresql://localhost/test"
analyzer = LLMAnalyzer() analyzer = LLMAnalyzer()