test(analyzer,serp): add tests for caching, single query, and parallel processing

- Add TestSingleQueryBugFix: verify SERP.query called once per analysis
- Add TestPatentCaching: DB cache hit/miss, SERP query cache hit/miss
- Add TestDynamicDateRange: rolling window, days_back param
- Add TestFilesystemPDFCaching: skip download, redownload empty files
- Add autouse mock_db fixture to prevent real DB connections in all tests
This commit is contained in:
2026-03-24 14:39:09 -04:00
parent 1a297eb60b
commit 6f0b448044
2 changed files with 277 additions and 2 deletions
+187 -2
View File
@@ -1,11 +1,22 @@
"""Tests for the high-level company analyzer orchestration.""" """Tests for the high-level company analyzer orchestration."""
import pytest import pytest
from unittest.mock import Mock, patch, call from unittest.mock import Mock, patch, call, MagicMock
from SPARC.analyzer import CompanyAnalyzer from SPARC.analyzer import CompanyAnalyzer
from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
@pytest.fixture(autouse=True)
def mock_db(mocker):
    """Replace ``SPARC.analyzer.DatabaseClient`` with a stub for every test.

    The stub reports a cache miss (``None``) for both patent and SERP-query
    lookups by default; individual tests override those return values to
    simulate cache hits.

    Returns:
        The ``MagicMock`` instance produced whenever the patched
        ``DatabaseClient`` class is called.
    """
    db_stub = MagicMock()
    db_stub.get_cached_patent.return_value = None
    db_stub.get_cached_serp_query.return_value = None
    # Calling the patched class yields the pre-configured stub.
    mocker.patch("SPARC.analyzer.DatabaseClient", return_value=db_stub)
    return db_stub
class TestCompanyAnalyzer: class TestCompanyAnalyzer:
"""Test the CompanyAnalyzer orchestration logic.""" """Test the CompanyAnalyzer orchestration logic."""
@@ -17,7 +28,7 @@ class TestCompanyAnalyzer:
mock_llm.assert_called_once_with(api_key="test-key") mock_llm.assert_called_once_with(api_key="test-key")
def test_analyze_company_full_pipeline(self, mocker): def test_analyze_company_full_pipeline(self, mocker, mock_db):
"""Test complete company analysis pipeline.""" """Test complete company analysis pipeline."""
# Mock all the dependencies # Mock all the dependencies
mock_query = mocker.patch("SPARC.analyzer.SERP.query") mock_query = mocker.patch("SPARC.analyzer.SERP.query")
@@ -178,6 +189,180 @@ class TestCompanyAnalyzer:
assert "PDF not found" in result assert "PDF not found" in result
class TestSingleQueryBugFix:
    """Regression tests: one company analysis issues at most one SERP.query call."""

    @staticmethod
    def _stub_pipeline(mocker):
        """Patch the SERP and LLM stages referenced by ``SPARC.analyzer``.

        ``save_patents`` is stubbed to set ``pdf_path`` on the patent it
        receives and hand it back; parsing, minimizing and the LLM return
        fixed canned values.

        Returns:
            The mock that replaces ``SERP.query``.
        """
        query_stub = mocker.patch("SPARC.analyzer.SERP.query")

        def fake_save(p):
            p.pdf_path = "patents/US123.pdf"
            return p

        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=fake_save)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Content",
        )
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)
        return query_stub

    def test_analyze_company_safe_calls_query_once(self, mocker, mock_db):
        """_analyze_company_safe should call SERP.query exactly once."""
        query_stub = self._stub_pipeline(mocker)
        query_stub.return_value = Patents(
            patents=[
                Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            ]
        )

        CompanyAnalyzer()._analyze_company_safe("TestCorp")

        # Key assertion: one query per analysis, not two.
        query_stub.assert_called_once_with("TestCorp")

    def test_analyze_company_with_prefetched_patents_skips_query(self, mocker):
        """analyze_company should not call SERP.query when patents are provided."""
        query_stub = self._stub_pipeline(mocker)
        supplied = Patents(
            patents=[
                Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            ]
        )

        CompanyAnalyzer().analyze_company("TestCorp", patents=supplied)

        # Patents were handed in, so no search may be issued.
        query_stub.assert_not_called()
class TestPatentCaching:
    """Exercise the DB-backed patent cache and SERP-query cache in the pipeline."""

    def test_process_single_patent_uses_db_cache(self, mocker, mock_db):
        """_process_single_patent returns cached content when available."""
        download = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mock_db.get_cached_patent.return_value = {
            "patent_id": "US123",
            "minimized_content": "Cached minimized content",
        }

        outcome = CompanyAnalyzer._process_single_patent(
            Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            "TestCorp",
            mock_db,
        )

        assert outcome == {
            "patent_id": "US123",
            "content": "Cached minimized content",
        }
        # A cache hit must not trigger a download.
        download.assert_not_called()

    def test_process_single_patent_stores_to_db_cache(self, mocker, mock_db):
        """_process_single_patent stores result in DB after processing."""

        def fake_save(p):
            p.pdf_path = "patents/US123.pdf"
            return p

        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=fake_save)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test abstract"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Minimized content",
        )
        # Explicit cache miss so the full processing path runs.
        mock_db.get_cached_patent.return_value = None

        outcome = CompanyAnalyzer._process_single_patent(
            Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            "TestCorp",
            mock_db,
        )

        assert outcome == {"patent_id": "US123", "content": "Minimized content"}
        mock_db.store_patent.assert_called_once_with(
            patent_id="US123",
            company_name="TestCorp",
            pdf_link="http://example.com/test.pdf",
            raw_sections={"abstract": "Test abstract"},
            minimized_content="Minimized content",
        )

    def test_serp_query_cache_hit_skips_api(self, mocker, mock_db):
        """When SERP query is cached, API call is skipped."""
        search = mocker.patch("SPARC.analyzer.SERP.query")
        download = mocker.patch("SPARC.analyzer.SERP.save_patents")
        # Patched so the real implementations cannot run in this test.
        mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)

        # Both cache layers report a hit.
        mock_db.get_cached_serp_query.return_value = ["US123"]
        mock_db.get_cached_patent.return_value = {
            "patent_id": "US123",
            "minimized_content": "Cached content",
        }

        outcome = CompanyAnalyzer().analyze_company("TestCorp")

        assert outcome == "Analysis"
        # Neither the search API nor the downloader may be touched.
        search.assert_not_called()
        download.assert_not_called()

    def test_serp_query_cache_miss_stores_result(self, mocker, mock_db):
        """When SERP query cache misses, result is stored after API call."""

        def fake_save(p):
            p.pdf_path = "patents/US123.pdf"
            return p

        search = mocker.patch("SPARC.analyzer.SERP.query")
        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=fake_save)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Content",
        )
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)

        mock_db.get_cached_serp_query.return_value = None
        search.return_value = Patents(
            patents=[
                Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            ]
        )

        CompanyAnalyzer().analyze_company("TestCorp")

        mock_db.store_serp_query.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _, stored_kwargs = mock_db.store_serp_query.call_args
        assert stored_kwargs["company_name"] == "TestCorp"
        assert stored_kwargs["patent_ids"] == ["US123"]
class TestBatchProcessing: class TestBatchProcessing:
"""Test multi-company batch processing functionality.""" """Test multi-company batch processing functionality."""
+90
View File
@@ -1,7 +1,11 @@
"""Tests for SERP API patent retrieval and parsing functionality.""" """Tests for SERP API patent retrieval and parsing functionality."""
import os
import pytest import pytest
from unittest.mock import patch, Mock
from datetime import datetime, timedelta
from SPARC.serp_api import SERP from SPARC.serp_api import SERP
from SPARC.types import Patent
class TestTextCleaning: class TestTextCleaning:
@@ -176,3 +180,89 @@ class TestPatentMinimization:
# Sections should be separated by double newlines # Sections should be separated by double newlines
assert "\n\n" in result assert "\n\n" in result
class TestDynamicDateRange:
    """Test dynamic date range computation in SERP.query."""

    @staticmethod
    def _query_tbs(mocker, **query_kwargs):
        """Run ``SERP.query("TestCorp", **query_kwargs)`` against stubs.

        The search backend and the config values read by ``SPARC.serp_api``
        are patched so no network call is made.

        Returns:
            The ``tbs`` entry of the parameter dict passed to the search call.
        """
        search = mocker.patch(
            "SPARC.serp_api.serpapi.search",
            return_value={"organic_results": []},
        )
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        SERP.query("TestCorp", **query_kwargs)

        return search.call_args[0][0]["tbs"]

    @staticmethod
    def _unpadded_date(moment):
        """Format *moment* as M/D/YYYY without zero padding.

        Built from attributes instead of ``strftime("%-m/%-d/%Y")`` because
        the ``%-`` flag is a platform extension that is not available
        everywhere (for example on Windows).
        """
        return f"{moment.month}/{moment.day}/{moment.year}"

    def test_query_uses_rolling_date_window(self, mocker):
        """Verify the date filter uses a rolling window, not hardcoded dates."""
        tbs = self._query_tbs(mocker)

        # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one
        assert "cdr:1,cd_min:" in tbs
        assert "10/28/2025" not in tbs  # old hardcoded date gone

    def test_query_respects_days_back_param(self, mocker):
        """Verify days_back parameter controls the date window."""
        # Sample the clock on both sides of the call: if the date rolls over
        # while the query runs, either start date is a legitimate result.
        before = datetime.now()
        tbs = self._query_tbs(mocker, days_back=30)
        after = datetime.now()

        window = timedelta(days=30)
        candidates = {
            self._unpadded_date(moment - window) for moment in (before, after)
        }
        assert any(start in tbs for start in candidates), (
            f"expected one of {sorted(candidates)} in tbs={tbs!r}"
        )
class TestFilesystemPDFCaching:
    """Test that save_patents skips download for existing files.

    Filesystem probes (``os.path.exists`` / ``os.path.getsize``), directory
    creation and ``open`` are all stubbed, so these tests never touch the
    real filesystem or the network.
    """

    def test_save_patents_skips_download_when_cached(self, mocker):
        """Already-downloaded PDFs should not be re-downloaded."""
        http_get = mocker.patch("SPARC.serp_api.requests.get")
        mocker.patch("SPARC.serp_api.os.makedirs")
        # Simulate an existing, non-empty file. No real file is needed
        # because both probes are mocked.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)

        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        http_get.assert_not_called()
        assert result.pdf_path == "patents/US123.pdf"

    def test_save_patents_downloads_when_not_cached(self, mocker):
        """Missing PDFs should be downloaded."""
        response = Mock()
        response.content = b"%PDF-1.4 content"
        http_get = mocker.patch(
            "SPARC.serp_api.requests.get", return_value=response
        )
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
        # Intercept the write so nothing lands on disk.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US456", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        http_get.assert_called_once_with("http://example.com/test.pdf")
        assert result.pdf_path == "patents/US456.pdf"

    def test_save_patents_redownloads_empty_files(self, mocker):
        """Empty/corrupt PDFs (0 bytes) should be re-downloaded."""
        response = Mock()
        response.content = b"%PDF-1.4 content"
        http_get = mocker.patch(
            "SPARC.serp_api.requests.get", return_value=response
        )
        mocker.patch("SPARC.serp_api.os.makedirs")
        # The file exists but is zero bytes long.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
        # Intercept the write so nothing lands on disk.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US789", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        http_get.assert_called_once()
        assert result.pdf_path == "patents/US789.pdf"