"""Tests for analyze_single_patent auto-download path.

Covers issue #1661:
- PDF exists on disk: direct analysis (happy path)
- PDF not on disk, cached link exists: auto-download and analyze
- PDF not on disk, no cached link: FileNotFoundError
- PDF not on disk, cached entry has no PDF link: FileNotFoundError
- Analysis failure after PDF found: graceful error message
- Model override parameter passthrough
- FileNotFoundError raised while parsing: propagated to the caller
"""

import contextlib
import os
from unittest.mock import MagicMock, patch

import pytest

from SPARC.analyzer import CompanyAnalyzer
from SPARC.types import Patent


@pytest.fixture(autouse=True)
def mock_db(mocker):
    """Mock DatabaseClient so no real DB is needed.

    The replacement instance reports a cache miss for both
    ``get_cached_patent`` and ``get_cached_serp_query``; individual tests
    override ``get_cached_patent.return_value`` when they need a hit.
    """
    mock_db_cls = mocker.patch("SPARC.analyzer.DatabaseClient")
    mock_db_instance = MagicMock()
    mock_db_instance.get_cached_patent.return_value = None
    mock_db_instance.get_cached_serp_query.return_value = None
    mock_db_cls.return_value = mock_db_instance
    return mock_db_instance


@pytest.fixture
def analyzer(mocker, mock_db):
    """Create a CompanyAnalyzer with mocked LLM and DB."""
    mocker.patch("SPARC.analyzer.LLMAnalyzer")
    return CompanyAnalyzer(openrouter_api_key="test-key")


@contextlib.contextmanager
def _working_directory(path):
    """Temporarily make *path* the process working directory.

    The tests below rely on the relative location ``patents/<id>.pdf``
    resolving inside ``tmp_path``. The previous working directory is
    restored on exit even when the body raises.
    """
    previous_cwd = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous_cwd)


def _write_fake_pdf(root, patent_id, payload=b"fake PDF"):
    """Create ``<root>/patents/<patent_id>.pdf`` holding *payload*.

    Returns the path of the file that was written. The bytes are never
    parsed by these tests (the PDF parser is mocked), so any content works.
    """
    patents_dir = root / "patents"
    patents_dir.mkdir()
    pdf_path = patents_dir / f"{patent_id}.pdf"
    pdf_path.write_bytes(payload)
    return pdf_path


class TestAnalyzeSinglePatentAutoDownload:
    """Test the auto-download logic in analyze_single_patent."""

    def test_pdf_on_disk_analyzed_directly(self, analyzer, mocker, tmp_path):
        """When PDF exists on disk, it is analyzed directly without download."""
        patent_id = "US-11234567-B2"
        _write_fake_pdf(tmp_path, patent_id, b"fake PDF content")

        mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        mock_parse.return_value = {"abstract": "test", "claims": "test claims"}
        mock_minimize.return_value = "minimized content"
        analyzer.llm_analyzer.analyze_patent_content.return_value = "Good patent."

        # Run from tmp_path so patents/{patent_id}.pdf resolves to the fake PDF.
        with _working_directory(tmp_path):
            result = analyzer.analyze_single_patent(patent_id, "TestCo")

        assert result == "Good patent."
        # DB cache should not have been queried since file existed
        analyzer.db.get_cached_patent.assert_not_called()

    def test_auto_download_from_cached_link(self, analyzer, mocker, tmp_path):
        """When PDF is not on disk but link is cached, auto-download occurs."""
        patent_id = "US-99887766-A1"
        pdf_link = "https://example.com/patent.pdf"

        # No patents dir is created, so the PDF is not on disk; the mocked
        # save_patents stands in for the real download.
        mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
        downloaded_patent = Patent(patent_id=patent_id, pdf_link=pdf_link)
        downloaded_patent.pdf_path = f"patents/{patent_id}.pdf"
        mock_save.return_value = downloaded_patent

        # Cached patent has a PDF link
        analyzer.db.get_cached_patent.return_value = {
            "patent_id": patent_id,
            "pdf_link": pdf_link,
        }

        # Mock the rest of the analysis pipeline
        mock_parse = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        mock_minimize = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        mock_parse.return_value = {"abstract": "test abstract"}
        mock_minimize.return_value = "minimized content"
        analyzer.llm_analyzer.analyze_patent_content.return_value = "Strong innovation."

        # Run from the empty tmp_path so patents/{patent_id}.pdf does NOT exist.
        with _working_directory(tmp_path):
            result = analyzer.analyze_single_patent(patent_id, "DownloadCo")

        assert result == "Strong innovation."
        analyzer.db.get_cached_patent.assert_called_once_with(patent_id)
        mock_save.assert_called_once()
        # Verify the Patent passed to save_patents has the correct ID and link
        saved_patent = mock_save.call_args[0][0]
        assert saved_patent.patent_id == patent_id
        assert saved_patent.pdf_link == pdf_link

    def test_no_cached_link_raises_file_not_found(self, analyzer, mocker, tmp_path):
        """When PDF is not on disk and no cached link, FileNotFoundError raised."""
        patent_id = "US-00000000-X1"

        analyzer.db.get_cached_patent.return_value = None

        with _working_directory(tmp_path):
            with pytest.raises(FileNotFoundError, match="no download link is cached"):
                analyzer.analyze_single_patent(patent_id, "MissingCo")

    def test_cached_patent_without_pdf_link_raises(self, analyzer, mocker, tmp_path):
        """When cached patent exists but has no pdf_link, FileNotFoundError raised."""
        patent_id = "US-11111111-B1"

        analyzer.db.get_cached_patent.return_value = {
            "patent_id": patent_id,
            "pdf_link": None,
        }

        with _working_directory(tmp_path):
            with pytest.raises(FileNotFoundError, match="no download link is cached"):
                analyzer.analyze_single_patent(patent_id, "NoPDFCo")

    def test_analysis_exception_returns_error_message(self, analyzer, mocker, tmp_path):
        """When analysis pipeline fails, returns error string instead of raising."""
        patent_id = "US-22222222-A2"

        # Create the PDF on disk so it skips download
        _write_fake_pdf(tmp_path, patent_id)

        # Parse fails
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            side_effect=ValueError("Corrupt PDF"),
        )

        with _working_directory(tmp_path):
            result = analyzer.analyze_single_patent(patent_id, "ErrorCo")

        assert "Failed to analyze patent" in result
        assert "Corrupt PDF" in result

    def test_model_override_passed_to_llm(self, analyzer, mocker, tmp_path):
        """The model parameter is forwarded to the LLM analyzer."""
        patent_id = "US-33333333-B2"

        _write_fake_pdf(tmp_path, patent_id)

        mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf", return_value={"abstract": "test"})
        mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm", return_value="content")
        analyzer.llm_analyzer.analyze_patent_content.return_value = "Analysis result."

        with _working_directory(tmp_path):
            result = analyzer.analyze_single_patent(
                patent_id, "ModelCo", model="openai/gpt-4o"
            )

        assert result == "Analysis result."
        analyzer.llm_analyzer.analyze_patent_content.assert_called_once_with(
            patent_content="content",
            company_name="ModelCo",
            model="openai/gpt-4o",
        )

    def test_file_not_found_during_parse_re_raised(self, analyzer, mocker, tmp_path):
        """FileNotFoundError during parsing is re-raised, not caught."""
        patent_id = "US-44444444-C1"

        _write_fake_pdf(tmp_path, patent_id)

        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            side_effect=FileNotFoundError("PDF file vanished"),
        )

        with _working_directory(tmp_path):
            with pytest.raises(FileNotFoundError, match="PDF file vanished"):
                analyzer.analyze_single_patent(patent_id, "VanishCo")