test(analyzer,serp): add tests for caching, single query, and parallel processing
- Add TestSingleQueryBugFix: verify SERP.query is called once per analysis
- Add TestPatentCaching: DB cache hit/miss, SERP query cache hit/miss
- Add TestDynamicDateRange: rolling window, days_back param
- Add TestFilesystemPDFCaching: skip download, redownload empty files
- Add autouse mock_db fixture to prevent real DB connections in all tests
This commit is contained in:
+187
-2
@@ -1,11 +1,22 @@
|
|||||||
"""Tests for the high-level company analyzer orchestration."""
|
"""Tests for the high-level company analyzer orchestration."""
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from unittest.mock import Mock, patch, call
|
from unittest.mock import Mock, patch, call, MagicMock
|
||||||
from SPARC.analyzer import CompanyAnalyzer
|
from SPARC.analyzer import CompanyAnalyzer
|
||||||
from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
|
from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def mock_db(mocker):
    """Swap ``SPARC.analyzer.DatabaseClient`` for a stub in every test.

    The stub answers both cache lookups (``get_cached_patent`` and
    ``get_cached_serp_query``) with ``None``; tests that need a cache hit
    override the relevant ``return_value`` themselves.

    Returns:
        The ``MagicMock`` that is handed back whenever the patched
        ``DatabaseClient`` class is called.
    """
    db_stub = MagicMock()
    # Default every lookup to a miss.
    db_stub.get_cached_serp_query.return_value = None
    db_stub.get_cached_patent.return_value = None

    # Calling the patched class now yields the stub above.
    mocker.patch("SPARC.analyzer.DatabaseClient").return_value = db_stub
    return db_stub
|
||||||
|
|
||||||
|
|
||||||
class TestCompanyAnalyzer:
|
class TestCompanyAnalyzer:
|
||||||
"""Test the CompanyAnalyzer orchestration logic."""
|
"""Test the CompanyAnalyzer orchestration logic."""
|
||||||
|
|
||||||
@@ -17,7 +28,7 @@ class TestCompanyAnalyzer:
|
|||||||
|
|
||||||
mock_llm.assert_called_once_with(api_key="test-key")
|
mock_llm.assert_called_once_with(api_key="test-key")
|
||||||
|
|
||||||
def test_analyze_company_full_pipeline(self, mocker):
|
def test_analyze_company_full_pipeline(self, mocker, mock_db):
|
||||||
"""Test complete company analysis pipeline."""
|
"""Test complete company analysis pipeline."""
|
||||||
# Mock all the dependencies
|
# Mock all the dependencies
|
||||||
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
|
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
|
||||||
@@ -178,6 +189,180 @@ class TestCompanyAnalyzer:
|
|||||||
assert "PDF not found" in result
|
assert "PDF not found" in result
|
||||||
|
|
||||||
|
|
||||||
|
class TestSingleQueryBugFix:
    """Check how often SERP.query is invoked during a company analysis."""

    @staticmethod
    def _stub_pipeline(mocker):
        """Patch the SERP steps and LLMAnalyzer as seen from SPARC.analyzer.

        ``save_patents`` assigns a ``pdf_path`` to whatever it receives,
        ``parse_patent_pdf`` and ``minimize_patent_for_llm`` return canned
        values, and the ``LLMAnalyzer`` instance reports "Analysis".

        Returns:
            The mock that replaces ``SPARC.analyzer.SERP.query``; its
            ``return_value`` is left for the caller to configure.
        """
        query_stub = mocker.patch("SPARC.analyzer.SERP.query")

        def attach_pdf_path(downloaded):
            # Stand-in for a download: record a path and hand the patent back.
            downloaded.pdf_path = "patents/US123.pdf"
            return downloaded

        mocker.patch(
            "SPARC.analyzer.SERP.save_patents", side_effect=attach_pdf_path
        )
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Content",
        )

        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)

        return query_stub

    def test_analyze_company_safe_calls_query_once(self, mocker, mock_db):
        """_analyze_company_safe should hit SERP.query exactly one time."""
        query_stub = self._stub_pipeline(mocker)
        query_stub.return_value = Patents(
            patents=[
                Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
            ]
        )

        CompanyAnalyzer()._analyze_company_safe("TestCorp")

        # The point of the test: one call, not two.
        query_stub.assert_called_once_with("TestCorp")

    def test_analyze_company_with_prefetched_patents_skips_query(self, mocker):
        """analyze_company must leave SERP.query alone when given patents."""
        query_stub = self._stub_pipeline(mocker)
        prefetched = Patents(
            patents=[
                Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
            ]
        )

        CompanyAnalyzer().analyze_company("TestCorp", patents=prefetched)

        # Patents were supplied up front, so no search should be issued.
        query_stub.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
class TestPatentCaching:
    """Cover the DB cache paths for single patents and for SERP queries."""

    @staticmethod
    def _pretend_download(target):
        """Stand-in for SERP.save_patents: set ``pdf_path``, return the patent."""
        target.pdf_path = "patents/US123.pdf"
        return target

    @staticmethod
    def _patch_llm(mocker):
        """Patch LLMAnalyzer so its instance's portfolio analysis is "Analysis"."""
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)

    def test_process_single_patent_uses_db_cache(self, mocker, mock_db):
        """A cached patent is answered from the DB without any download."""
        save_stub = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mock_db.get_cached_patent.return_value = {
            "patent_id": "US123",
            "minimized_content": "Cached minimized content",
        }
        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")

        outcome = CompanyAnalyzer._process_single_patent(
            patent, "TestCorp", mock_db
        )

        assert outcome == {
            "patent_id": "US123",
            "content": "Cached minimized content",
        }
        # Cache hit, so the download step must stay untouched.
        save_stub.assert_not_called()

    def test_process_single_patent_stores_to_db_cache(self, mocker, mock_db):
        """An uncached patent is processed and then written to the DB."""
        mocker.patch(
            "SPARC.analyzer.SERP.save_patents",
            side_effect=self._pretend_download,
        )
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test abstract"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Minimized content",
        )
        # Force the miss explicitly.
        mock_db.get_cached_patent.return_value = None
        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")

        outcome = CompanyAnalyzer._process_single_patent(
            patent, "TestCorp", mock_db
        )

        assert outcome == {"patent_id": "US123", "content": "Minimized content"}
        mock_db.store_patent.assert_called_once_with(
            patent_id="US123",
            company_name="TestCorp",
            pdf_link="http://example.com/test.pdf",
            raw_sections={"abstract": "Test abstract"},
            minimized_content="Minimized content",
        )

    def test_serp_query_cache_hit_skips_api(self, mocker, mock_db):
        """With the SERP query cached, neither the API nor a download runs."""
        query_stub = mocker.patch("SPARC.analyzer.SERP.query")
        save_stub = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        self._patch_llm(mocker)

        # Both cache layers report a hit.
        mock_db.get_cached_serp_query.return_value = ["US123"]
        mock_db.get_cached_patent.return_value = {
            "patent_id": "US123",
            "minimized_content": "Cached content",
        }

        outcome = CompanyAnalyzer().analyze_company("TestCorp")

        assert outcome == "Analysis"
        # No search request ...
        query_stub.assert_not_called()
        # ... and no PDF download either.
        save_stub.assert_not_called()

    def test_serp_query_cache_miss_stores_result(self, mocker, mock_db):
        """After a SERP cache miss, the fetched patent ids are stored."""
        query_stub = mocker.patch("SPARC.analyzer.SERP.query")
        mocker.patch(
            "SPARC.analyzer.SERP.save_patents",
            side_effect=self._pretend_download,
        )
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Content",
        )
        self._patch_llm(mocker)

        mock_db.get_cached_serp_query.return_value = None
        query_stub.return_value = Patents(
            patents=[
                Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
            ]
        )

        CompanyAnalyzer().analyze_company("TestCorp")

        mock_db.store_serp_query.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _, stored_kwargs = mock_db.store_serp_query.call_args
        assert stored_kwargs["company_name"] == "TestCorp"
        assert stored_kwargs["patent_ids"] == ["US123"]
|
||||||
|
|
||||||
|
|
||||||
class TestBatchProcessing:
|
class TestBatchProcessing:
|
||||||
"""Test multi-company batch processing functionality."""
|
"""Test multi-company batch processing functionality."""
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,11 @@
|
|||||||
"""Tests for SERP API patent retrieval and parsing functionality."""
|
"""Tests for SERP API patent retrieval and parsing functionality."""
|
||||||
|
|
||||||
|
import os
|
||||||
import pytest
|
import pytest
|
||||||
|
from unittest.mock import patch, Mock
|
||||||
|
from datetime import datetime, timedelta
|
||||||
from SPARC.serp_api import SERP
|
from SPARC.serp_api import SERP
|
||||||
|
from SPARC.types import Patent
|
||||||
|
|
||||||
|
|
||||||
class TestTextCleaning:
|
class TestTextCleaning:
|
||||||
@@ -176,3 +180,89 @@ class TestPatentMinimization:
|
|||||||
|
|
||||||
# Sections should be separated by double newlines
|
# Sections should be separated by double newlines
|
||||||
assert "\n\n" in result
|
assert "\n\n" in result
|
||||||
|
|
||||||
|
|
||||||
|
class TestDynamicDateRange:
    """Test dynamic date range computation in SERP.query."""

    @staticmethod
    def _unpadded_mdy(moment):
        """Render *moment* as ``M/D/YYYY`` with no zero padding.

        Built from the date fields directly because the ``%-m`` / ``%-d``
        strftime directives are a platform extension and are rejected on
        Windows.
        """
        return f"{moment.month}/{moment.day}/{moment.year}"

    @staticmethod
    def _patch_search(mocker):
        """Stub the serpapi search call and the config values; return the stub."""
        search_stub = mocker.patch("SPARC.serp_api.serpapi.search")
        search_stub.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)
        return search_stub

    def test_query_uses_rolling_date_window(self, mocker):
        """Verify the date filter uses a rolling window, not hardcoded dates."""
        search_stub = self._patch_search(mocker)

        SERP.query("TestCorp")

        # The request parameters are the first positional argument.
        tbs = search_stub.call_args[0][0]["tbs"]
        # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one
        assert "cdr:1,cd_min:" in tbs
        # NOTE(review): this would also trip if the computed window happened
        # to include 10/28/2025 on the day the test runs - confirm whether
        # that coincidence needs guarding.
        assert "10/28/2025" not in tbs  # old hardcoded date gone

    def test_query_respects_days_back_param(self, mocker):
        """Verify days_back parameter controls the date window."""
        search_stub = self._patch_search(mocker)

        # Sample the clock on both sides of the call so a midnight rollover
        # between the test and SERP.query cannot make the test flake.
        before = datetime.now()
        SERP.query("TestCorp", days_back=30)
        after = datetime.now()

        tbs = search_stub.call_args[0][0]["tbs"]
        candidates = {
            self._unpadded_mdy(stamp - timedelta(days=30))
            for stamp in (before, after)
        }
        assert any(start in tbs for start in candidates)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFilesystemPDFCaching:
    """Check when SERP.save_patents downloads a PDF and when it does not."""

    def test_save_patents_skips_download_when_cached(self, mocker):
        """Already-downloaded PDFs should not be re-downloaded."""
        http_get = mocker.patch("SPARC.serp_api.requests.get")
        mocker.patch("SPARC.serp_api.os.makedirs")
        # Both filesystem probes are stubbed, so no real file has to exist:
        # the PDF is reported as present and non-empty.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)

        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        saved = SERP.save_patents(patent)

        http_get.assert_not_called()
        assert saved.pdf_path == "patents/US123.pdf"

    def test_save_patents_downloads_when_not_cached(self, mocker):
        """Missing PDFs should be downloaded."""
        response = Mock()
        response.content = b"%PDF-1.4 content"
        http_get = mocker.patch(
            "SPARC.serp_api.requests.get", return_value=response
        )
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
        # Intercept open() so the test never writes to the real filesystem.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US456", pdf_link="http://example.com/test.pdf")
        saved = SERP.save_patents(patent)

        http_get.assert_called_once_with("http://example.com/test.pdf")
        assert saved.pdf_path == "patents/US456.pdf"

    def test_save_patents_redownloads_empty_files(self, mocker):
        """Empty/corrupt PDFs (0 bytes) should be re-downloaded."""
        response = Mock()
        response.content = b"%PDF-1.4 content"
        http_get = mocker.patch(
            "SPARC.serp_api.requests.get", return_value=response
        )
        mocker.patch("SPARC.serp_api.os.makedirs")
        # The file is reported as present but zero bytes long.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
        # Intercept open() so the test never writes to the real filesystem.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US789", pdf_link="http://example.com/test.pdf")
        saved = SERP.save_patents(patent)

        http_get.assert_called_once()
        assert saved.pdf_path == "patents/US789.pdf"
|
||||||
|
|||||||
Reference in New Issue
Block a user