"""Tests for SERP API patent retrieval and parsing functionality.""" import os import pytest from unittest.mock import patch, Mock from datetime import datetime, timedelta from SPARC.serp_api import SERP from SPARC.types import Patent class TestTextCleaning: """Test patent text cleaning functionality.""" def test_clean_patent_text_removes_figure_references(self): """Test that figure references are removed from text.""" text = "This is a description (see FIG. 1) of the invention." cleaned = SERP.clean_patent_text(text) assert "(see FIG. 1)" not in cleaned assert "This is a description of the invention." in cleaned def test_clean_patent_text_removes_fig_labels(self): """Test that FIG labels are removed from text.""" text = "As shown in FIG. 2A the circuit operates." cleaned = SERP.clean_patent_text(text) assert "FIG. 2A" not in cleaned def test_clean_patent_text_removes_excessive_whitespace(self): """Test that excessive whitespace is normalized.""" text = "Line 1\n\n\n\n\nLine 2" cleaned = SERP.clean_patent_text(text) assert "\n\n\n\n\n" not in cleaned assert "Line 1\n\nLine 2" in cleaned def test_clean_patent_text_removes_line_numbers(self): """Test that line numbers are removed from text.""" text = "Some text\n42\nMore text" cleaned = SERP.clean_patent_text(text) # Line numbers on their own line should be removed assert cleaned.strip() != "Some text\n42\nMore text" class TestSectionExtraction: """Test patent section extraction functionality.""" def test_extract_section_finds_abstract(self): """Test extraction of abstract section.""" text = """ PATENT DOCUMENT ABSTRACT This is the abstract text describing the invention. BACKGROUND This is background information. """ result = SERP.extract_section( text, start_patterns=[r"ABSTRACT"], end_patterns=[r"BACKGROUND"], ) assert "This is the abstract text" in result assert "BACKGROUND" not in result def test_extract_section_finds_claims(self): """Test extraction of claims section.""" text = """ SUMMARY Summary text here. What is claimed is: 1. A method comprising steps A and B. 2. The method of claim 1, further comprising step C. ABSTRACT Abstract text. """ result = SERP.extract_section( text, start_patterns=[r"What is claimed is:"], end_patterns=[r"ABSTRACT"], ) assert "1. A method comprising" in result assert "2. The method of claim 1" in result assert "ABSTRACT" not in result def test_extract_section_returns_empty_when_not_found(self): """Test that empty string is returned when section not found.""" text = "This text has no matching patterns." result = SERP.extract_section( text, start_patterns=[r"ABSTRACT"], end_patterns=[r"BACKGROUND"] ) assert result == "" def test_extract_section_handles_case_insensitive(self): """Test that section extraction is case insensitive.""" text = """ abstract This is the abstract in lowercase. background Background text. """ result = SERP.extract_section( text, start_patterns=[r"ABSTRACT"], end_patterns=[r"BACKGROUND"], ) assert "This is the abstract in lowercase" in result class TestPatentMinimization: """Test patent content minimization for LLM consumption.""" def test_minimize_includes_all_essential_sections(self): """Test that all essential sections are included in minimized output.""" sections = { "abstract": "This is the abstract.", "claims": "1. A method for doing X.", "summary": "This invention relates to X.", "description": "Very long detailed description...", } result = SERP.minimize_patent_for_llm(sections) assert "ABSTRACT:" in result assert "This is the abstract." in result assert "CLAIMS:" in result assert "1. A method for doing X." in result assert "SUMMARY:" in result assert "This invention relates to X." in result def test_minimize_excludes_description(self): """Test that detailed description is excluded from minimized output.""" sections = { "abstract": "This is the abstract.", "claims": "1. A method for doing X.", "summary": "This invention relates to X.", "description": "Very long detailed description that should be excluded.", } result = SERP.minimize_patent_for_llm(sections) assert "Very long detailed description" not in result assert "DESCRIPTION:" not in result def test_minimize_handles_missing_sections(self): """Test that minimization handles missing sections gracefully.""" sections = { "abstract": "This is the abstract.", # claims missing # summary missing "description": "Description text.", } result = SERP.minimize_patent_for_llm(sections) assert "ABSTRACT:" in result assert "This is the abstract." in result # Should not error on missing sections assert isinstance(result, str) def test_minimize_with_empty_sections(self): """Test that empty sections are handled properly.""" sections = { "abstract": "", "claims": "1. A method.", "summary": "", } result = SERP.minimize_patent_for_llm(sections) # Empty sections should not appear assert result.count("CLAIMS:") == 1 assert "1. A method." in result def test_minimize_separates_sections_with_double_newline(self): """Test that sections are properly separated.""" sections = { "abstract": "Abstract text.", "claims": "Claims text.", "summary": "Summary text.", } result = SERP.minimize_patent_for_llm(sections) # Sections should be separated by double newlines assert "\n\n" in result class TestDynamicDateRange: """Test dynamic date range computation in SERP.query.""" def test_query_uses_rolling_date_window(self, mocker): """Verify the date filter uses a rolling window, not hardcoded dates.""" mock_search = mocker.patch("SPARC.serp_api.serpapi.search") mock_search.return_value = {"organic_results": []} mocker.patch("SPARC.serp_api.config.api_key", "fake-key") mocker.patch("SPARC.serp_api.config.patent_search_days", 90) SERP.query("TestCorp") call_params = mock_search.call_args[0][0] tbs = call_params["tbs"] # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one assert "cdr:1,cd_min:" in tbs assert "10/28/2025" not in tbs # old hardcoded date gone def test_query_respects_days_back_param(self, mocker): """Verify days_back parameter controls the date window.""" mock_search = mocker.patch("SPARC.serp_api.serpapi.search") mock_search.return_value = {"organic_results": []} mocker.patch("SPARC.serp_api.config.api_key", "fake-key") mocker.patch("SPARC.serp_api.config.patent_search_days", 90) now = datetime.now() SERP.query("TestCorp", days_back=30) call_params = mock_search.call_args[0][0] tbs = call_params["tbs"] expected_start = (now - timedelta(days=30)).strftime("%-m/%-d/%Y") assert expected_start in tbs class TestFilesystemPDFCaching: """Test that save_patents skips download for existing files.""" def test_save_patents_skips_download_when_cached(self, mocker, tmp_path): """Already-downloaded PDFs should not be re-downloaded.""" mock_get = mocker.patch("SPARC.serp_api.requests.get") mocker.patch("SPARC.serp_api.os.makedirs") pdf_path = tmp_path / "US123.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake content") mocker.patch("SPARC.serp_api.os.path.exists", return_value=True) mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100) patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf") result = SERP.save_patents(patent) mock_get.assert_not_called() assert result.pdf_path == "patents/US123.pdf" def test_save_patents_downloads_when_not_cached(self, mocker): """Missing PDFs should be downloaded.""" mock_response = Mock() mock_response.content = b"%PDF-1.4 content" mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response) mocker.patch("SPARC.serp_api.os.makedirs") mocker.patch("SPARC.serp_api.os.path.exists", return_value=False) mock_open = mocker.patch("builtins.open", mocker.mock_open()) patent = Patent(patent_id="US456", pdf_link="http://example.com/test.pdf") result = SERP.save_patents(patent) mock_get.assert_called_once_with("http://example.com/test.pdf") assert result.pdf_path == "patents/US456.pdf" def test_save_patents_redownloads_empty_files(self, mocker): """Empty/corrupt PDFs (0 bytes) should be re-downloaded.""" mock_response = Mock() mock_response.content = b"%PDF-1.4 content" mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response) mocker.patch("SPARC.serp_api.os.makedirs") mocker.patch("SPARC.serp_api.os.path.exists", return_value=True) mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0) mock_open = mocker.patch("builtins.open", mocker.mock_open()) patent = Patent(patent_id="US789", pdf_link="http://example.com/test.pdf") result = SERP.save_patents(patent) mock_get.assert_called_once() assert result.pdf_path == "patents/US789.pdf"