diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
index d81be6d..06dde64 100644
--- a/tests/test_analyzer.py
+++ b/tests/test_analyzer.py
@@ -1,11 +1,22 @@
 """Tests for the high-level company analyzer orchestration."""
 
 import pytest
-from unittest.mock import Mock, patch, call
+from unittest.mock import Mock, patch, call, MagicMock
 from SPARC.analyzer import CompanyAnalyzer
 from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
 
 
+@pytest.fixture(autouse=True)
+def mock_db(mocker):
+    """Mock DatabaseClient for all tests so no real DB connection is needed."""
+    mock_db_cls = mocker.patch("SPARC.analyzer.DatabaseClient")
+    mock_db_instance = MagicMock()
+    mock_db_instance.get_cached_patent.return_value = None
+    mock_db_instance.get_cached_serp_query.return_value = None
+    mock_db_cls.return_value = mock_db_instance
+    return mock_db_instance
+
+
 class TestCompanyAnalyzer:
     """Test the CompanyAnalyzer orchestration logic."""
 
@@ -17,7 +28,7 @@ class TestCompanyAnalyzer:
 
         mock_llm.assert_called_once_with(api_key="test-key")
 
-    def test_analyze_company_full_pipeline(self, mocker):
+    def test_analyze_company_full_pipeline(self, mocker, mock_db):
         """Test complete company analysis pipeline."""
         # Mock all the dependencies
         mock_query = mocker.patch("SPARC.analyzer.SERP.query")
@@ -178,6 +189,180 @@ class TestCompanyAnalyzer:
 
         assert "PDF not found" in result
 
+
+class TestSingleQueryBugFix:
+    """Test that SERP.query is only called once per company analysis."""
+
+    def test_analyze_company_safe_calls_query_once(self, mocker, mock_db):
+        """_analyze_company_safe should call SERP.query exactly once."""
+        mock_query = mocker.patch("SPARC.analyzer.SERP.query")
+        mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
+        mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
+        mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
+        mock_llm = mocker.patch("SPARC.analyzer.LLMAnalyzer")
+
+        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
+        mock_query.return_value = Patents(patents=[patent])
+
+        def save_side_effect(p):
+            p.pdf_path = "patents/US123.pdf"
+            return p
+
+        mock_save.side_effect = save_side_effect
+        mock_parse.return_value = {"abstract": "Test"}
+        mock_minimize.return_value = "Content"
+
+        mock_llm_instance = Mock()
+        mock_llm_instance.analyze_patent_portfolio.return_value = "Analysis"
+        mock_llm.return_value = mock_llm_instance
+
+        analyzer = CompanyAnalyzer()
+        analyzer._analyze_company_safe("TestCorp")
+
+        # The key assertion: SERP.query called exactly once, not twice
+        mock_query.assert_called_once_with("TestCorp")
+
+    def test_analyze_company_with_prefetched_patents_skips_query(self, mocker):
+        """analyze_company should not call SERP.query when patents are provided."""
+        mock_query = mocker.patch("SPARC.analyzer.SERP.query")
+        mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
+        mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
+        mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
+        mock_llm = mocker.patch("SPARC.analyzer.LLMAnalyzer")
+
+        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
+        prefetched = Patents(patents=[patent])
+
+        def save_side_effect(p):
+            p.pdf_path = "patents/US123.pdf"
+            return p
+
+        mock_save.side_effect = save_side_effect
+        mock_parse.return_value = {"abstract": "Test"}
+        mock_minimize.return_value = "Content"
+
+        mock_llm_instance = Mock()
+        mock_llm_instance.analyze_patent_portfolio.return_value = "Analysis"
+        mock_llm.return_value = mock_llm_instance
+
+        analyzer = CompanyAnalyzer()
+        analyzer.analyze_company("TestCorp", patents=prefetched)
+
+        # SERP.query should never be called
+        mock_query.assert_not_called()
+
+
+class TestPatentCaching:
+    """Test patent-level DB caching in the pipeline."""
+
+    def test_process_single_patent_uses_db_cache(self, mocker, mock_db):
+        """_process_single_patent returns cached content when available."""
+        mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
+
+        mock_db.get_cached_patent.return_value = {
+            "patent_id": "US123",
+            "minimized_content": "Cached minimized content",
+        }
+
+        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
+        result = CompanyAnalyzer._process_single_patent(patent, "TestCorp", mock_db)
+
+        assert result == {"patent_id": "US123", "content": "Cached minimized content"}
+        # Should NOT download since cache hit
+        mock_save.assert_not_called()
+
+    def test_process_single_patent_stores_to_db_cache(self, mocker, mock_db):
+        """_process_single_patent stores result in DB after processing."""
+        mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
+        mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
+        mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
+
+        # No cache hit
+        mock_db.get_cached_patent.return_value = None
+
+        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
+
+        def save_side_effect(p):
+            p.pdf_path = "patents/US123.pdf"
+            return p
+
+        mock_save.side_effect = save_side_effect
+        mock_parse.return_value = {"abstract": "Test abstract"}
+        mock_minimize.return_value = "Minimized content"
+
+        result = CompanyAnalyzer._process_single_patent(patent, "TestCorp", mock_db)
+
+        assert result == {"patent_id": "US123", "content": "Minimized content"}
+        mock_db.store_patent.assert_called_once_with(
+            patent_id="US123",
+            company_name="TestCorp",
+            pdf_link="http://example.com/test.pdf",
+            raw_sections={"abstract": "Test abstract"},
+            minimized_content="Minimized content",
+        )
+
+    def test_serp_query_cache_hit_skips_api(self, mocker, mock_db):
+        """When SERP query is cached, API call is skipped."""
+        mock_query = mocker.patch("SPARC.analyzer.SERP.query")
+        mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
+        mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
+        mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
+        mock_llm = mocker.patch("SPARC.analyzer.LLMAnalyzer")
+
+        # Simulate SERP cache hit
+        mock_db.get_cached_serp_query.return_value = ["US123"]
+        # Simulate patent cache hit too
+        mock_db.get_cached_patent.return_value = {
+            "patent_id": "US123",
+            "minimized_content": "Cached content",
+        }
+
+        mock_llm_instance = Mock()
+        mock_llm_instance.analyze_patent_portfolio.return_value = "Analysis"
+        mock_llm.return_value = mock_llm_instance
+
+        analyzer = CompanyAnalyzer()
+        result = analyzer.analyze_company("TestCorp")
+
+        assert result == "Analysis"
+        # SERP.query should NOT be called
+        mock_query.assert_not_called()
+        # No downloads should happen
+        mock_save.assert_not_called()
+
+    def test_serp_query_cache_miss_stores_result(self, mocker, mock_db):
+        """When SERP query cache misses, result is stored after API call."""
+        mock_query = mocker.patch("SPARC.analyzer.SERP.query")
+        mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
+        mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
+        mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
+        mock_llm = mocker.patch("SPARC.analyzer.LLMAnalyzer")
+
+        mock_db.get_cached_serp_query.return_value = None
+
+        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
+        mock_query.return_value = Patents(patents=[patent])
+
+        def save_side_effect(p):
+            p.pdf_path = "patents/US123.pdf"
+            return p
+
+        mock_save.side_effect = save_side_effect
+        mock_parse.return_value = {"abstract": "Test"}
+        mock_minimize.return_value = "Content"
+
+        mock_llm_instance = Mock()
+        mock_llm_instance.analyze_patent_portfolio.return_value = "Analysis"
+        mock_llm.return_value = mock_llm_instance
+
+        analyzer = CompanyAnalyzer()
+        analyzer.analyze_company("TestCorp")
+
+        mock_db.store_serp_query.assert_called_once()
+        call_kwargs = mock_db.store_serp_query.call_args[1]
+        assert call_kwargs["company_name"] == "TestCorp"
+        assert call_kwargs["patent_ids"] == ["US123"]
+
+
 class TestBatchProcessing:
     """Test multi-company batch processing functionality."""
 
diff --git a/tests/test_serp_api.py b/tests/test_serp_api.py
index 0454d58..e6d123d 100644
--- a/tests/test_serp_api.py
+++ b/tests/test_serp_api.py
@@ -1,7 +1,11 @@
 """Tests for SERP API patent retrieval and parsing functionality."""
 
+import os
 import pytest
+from unittest.mock import patch, Mock
+from datetime import datetime, timedelta
 from SPARC.serp_api import SERP
+from SPARC.types import Patent
 
 
 class TestTextCleaning:
@@ -176,3 +180,89 @@ class TestPatentMinimization:
 
         # Sections should be separated by double newlines
         assert "\n\n" in result
+
+
+class TestDynamicDateRange:
+    """Test dynamic date range computation in SERP.query."""
+
+    def test_query_uses_rolling_date_window(self, mocker):
+        """Verify the date filter uses a rolling window, not hardcoded dates."""
+        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
+        mock_search.return_value = {"organic_results": []}
+        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
+        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)
+
+        SERP.query("TestCorp")
+
+        call_params = mock_search.call_args[0][0]
+        tbs = call_params["tbs"]
+        # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one
+        assert "cdr:1,cd_min:" in tbs
+        assert "10/28/2025" not in tbs  # old hardcoded date gone
+
+    def test_query_respects_days_back_param(self, mocker):
+        """Verify days_back parameter controls the date window."""
+        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
+        mock_search.return_value = {"organic_results": []}
+        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
+        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)
+
+        start = datetime.now() - timedelta(days=30)
+        SERP.query("TestCorp", days_back=30)
+
+        tbs = mock_search.call_args[0][0]["tbs"]
+        # Format by hand: strftime "%-m/%-d" is a glibc extension (breaks on Windows).
+        expected_start = f"{start.month}/{start.day}/{start.year}"
+        assert expected_start in tbs
+
+
+class TestFilesystemPDFCaching:
+    """Test that save_patents skips download for existing files."""
+
+    def test_save_patents_skips_download_when_cached(self, mocker, tmp_path):
+        """Already-downloaded PDFs should not be re-downloaded."""
+        mock_get = mocker.patch("SPARC.serp_api.requests.get")
+        mocker.patch("SPARC.serp_api.os.makedirs")
+
+        pdf_path = tmp_path / "US123.pdf"
+        pdf_path.write_bytes(b"%PDF-1.4 fake content")
+
+        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
+        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)
+
+        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
+        result = SERP.save_patents(patent)
+
+        mock_get.assert_not_called()
+        assert result.pdf_path == "patents/US123.pdf"
+
+    def test_save_patents_downloads_when_not_cached(self, mocker):
+        """Missing PDFs should be downloaded."""
+        mock_response = Mock()
+        mock_response.content = b"%PDF-1.4 content"
+        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
+        mocker.patch("SPARC.serp_api.os.makedirs")
+        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
+        mocker.patch("builtins.open", mocker.mock_open())
+
+        patent = Patent(patent_id="US456", pdf_link="http://example.com/test.pdf")
+        result = SERP.save_patents(patent)
+
+        mock_get.assert_called_once_with("http://example.com/test.pdf")
+        assert result.pdf_path == "patents/US456.pdf"
+
+    def test_save_patents_redownloads_empty_files(self, mocker):
+        """Empty/corrupt PDFs (0 bytes) should be re-downloaded."""
+        mock_response = Mock()
+        mock_response.content = b"%PDF-1.4 content"
+        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
+        mocker.patch("SPARC.serp_api.os.makedirs")
+        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
+        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
+        mocker.patch("builtins.open", mocker.mock_open())
+
+        patent = Patent(patent_id="US789", pdf_link="http://example.com/test.pdf")
+        result = SERP.save_patents(patent)
+
+        mock_get.assert_called_once()
+        assert result.pdf_path == "patents/US789.pdf"