# SPARC/tests/test_serp_api.py

"""Tests for SERP API patent retrieval and parsing functionality."""

from datetime import datetime, timedelta
from unittest.mock import Mock

from SPARC.serp_api import SERP
from SPARC.types import Patent


class TestTextCleaning:
    """Exercise ``SERP.clean_patent_text`` on typical patent-text noise."""

    def test_clean_patent_text_removes_figure_references(self):
        """A parenthesised figure call-out must disappear from the output."""
        raw = "This is a description (see FIG. 1) of the invention."
        # The expected fragment keeps both spaces that surrounded the call-out.
        expected_fragment = "This is a description  of the invention."

        output = SERP.clean_patent_text(raw)

        assert "(see FIG. 1)" not in output
        assert expected_fragment in output

    def test_clean_patent_text_removes_fig_labels(self):
        """An inline ``FIG.`` label must not survive cleaning."""
        output = SERP.clean_patent_text("As shown in FIG. 2A the circuit operates.")
        assert "FIG. 2A" not in output

    def test_clean_patent_text_removes_excessive_whitespace(self):
        """A run of five newlines is expected to collapse to a blank line."""
        raw = "Line 1\n\n\n\n\nLine 2"

        output = SERP.clean_patent_text(raw)

        assert "\n\n\n\n\n" not in output
        assert "Line 1\n\nLine 2" in output

    def test_clean_patent_text_removes_line_numbers(self):
        """A number sitting alone on its own line must not pass through untouched."""
        raw = "Some text\n42\nMore text"

        output = SERP.clean_patent_text(raw)

        # Only checks that the text changed; the exact cleaned form is not pinned.
        assert output.strip() != raw


class TestSectionExtraction:
    """Exercise ``SERP.extract_section`` with start/end heading patterns."""

    def test_extract_section_finds_abstract(self):
        """The abstract body is returned and the following heading is excluded."""
        document = """
        PATENT DOCUMENT

        ABSTRACT
        This is the abstract text describing the invention.

        BACKGROUND
        This is background information.
        """
        boundaries = {
            "start_patterns": [r"ABSTRACT"],
            "end_patterns": [r"BACKGROUND"],
        }

        section = SERP.extract_section(document, **boundaries)

        assert "This is the abstract text" in section
        assert "BACKGROUND" not in section

    def test_extract_section_finds_claims(self):
        """Both numbered claims are returned and the trailing heading is excluded."""
        document = """
        SUMMARY
        Summary text here.

        What is claimed is:
        1. A method comprising steps A and B.
        2. The method of claim 1, further comprising step C.

        ABSTRACT
        Abstract text.
        """
        boundaries = {
            "start_patterns": [r"What is claimed is:"],
            "end_patterns": [r"ABSTRACT"],
        }

        section = SERP.extract_section(document, **boundaries)

        for claim_opening in ("1. A method comprising", "2. The method of claim 1"):
            assert claim_opening in section
        assert "ABSTRACT" not in section

    def test_extract_section_returns_empty_when_not_found(self):
        """Text without any matching heading yields an empty string."""
        section = SERP.extract_section(
            "This text has no matching patterns.",
            start_patterns=[r"ABSTRACT"],
            end_patterns=[r"BACKGROUND"],
        )
        assert section == ""

    def test_extract_section_handles_case_insensitive(self):
        """Upper-case patterns must still match lower-case headings."""
        document = """
        abstract
        This is the abstract in lowercase.

        background
        Background text.
        """
        boundaries = {
            "start_patterns": [r"ABSTRACT"],
            "end_patterns": [r"BACKGROUND"],
        }

        section = SERP.extract_section(document, **boundaries)

        assert "This is the abstract in lowercase" in section


class TestPatentMinimization:
    """Exercise ``SERP.minimize_patent_for_llm`` on dictionaries of patent sections."""

    def test_minimize_includes_all_essential_sections(self):
        """Abstract, claims and summary appear with their headings and bodies."""
        minimized = SERP.minimize_patent_for_llm(
            {
                "abstract": "This is the abstract.",
                "claims": "1. A method for doing X.",
                "summary": "This invention relates to X.",
                "description": "Very long detailed description...",
            }
        )

        expected_fragments = (
            "ABSTRACT:",
            "This is the abstract.",
            "CLAIMS:",
            "1. A method for doing X.",
            "SUMMARY:",
            "This invention relates to X.",
        )
        for fragment in expected_fragments:
            assert fragment in minimized

    def test_minimize_excludes_description(self):
        """Neither the description body nor its heading reaches the output."""
        minimized = SERP.minimize_patent_for_llm(
            {
                "abstract": "This is the abstract.",
                "claims": "1. A method for doing X.",
                "summary": "This invention relates to X.",
                "description": "Very long detailed description that should be excluded.",
            }
        )

        for unwanted in ("Very long detailed description", "DESCRIPTION:"):
            assert unwanted not in minimized

    def test_minimize_handles_missing_sections(self):
        """Input lacking the claims and summary keys still produces a string."""
        # Only the abstract and description keys are supplied on purpose.
        partial = {
            "abstract": "This is the abstract.",
            "description": "Description text.",
        }

        minimized = SERP.minimize_patent_for_llm(partial)

        assert "ABSTRACT:" in minimized
        assert "This is the abstract." in minimized
        # Reaching this point means the missing keys did not raise.
        assert isinstance(minimized, str)

    def test_minimize_with_empty_sections(self):
        """With empty abstract and summary, the claims section appears exactly once."""
        mostly_empty = {
            "abstract": "",
            "claims": "1. A method.",
            "summary": "",
        }

        minimized = SERP.minimize_patent_for_llm(mostly_empty)

        heading_occurrences = minimized.count("CLAIMS:")
        assert heading_occurrences == 1
        assert "1. A method." in minimized

    def test_minimize_separates_sections_with_double_newline(self):
        """The output contains a blank-line separator when several sections exist."""
        minimized = SERP.minimize_patent_for_llm(
            {
                "abstract": "Abstract text.",
                "claims": "Claims text.",
                "summary": "Summary text.",
            }
        )

        assert "\n\n" in minimized


class TestDynamicDateRange:
    """Test dynamic date range computation in SERP.query."""

    @staticmethod
    def _patch_serp(mocker):
        """Stub the SerpAPI search call and config values; return the search mock."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)
        return mock_search

    @staticmethod
    def _unpadded_date(moment):
        """Format *moment* as ``M/D/YYYY`` without zero padding.

        The string is built from the date fields directly because the
        ``%-m`` / ``%-d`` strftime directives are a platform-specific
        extension and are not available everywhere (e.g. on Windows).
        """
        return f"{moment.month}/{moment.day}/{moment.year}"

    def test_query_uses_rolling_date_window(self, mocker):
        """Verify the date filter uses a rolling window, not hardcoded dates."""
        mock_search = self._patch_serp(mocker)

        SERP.query("TestCorp")

        # The request parameters are the first positional argument of the call.
        tbs = mock_search.call_args[0][0]["tbs"]
        # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one
        assert "cdr:1,cd_min:" in tbs
        assert "10/28/2025" not in tbs  # old hardcoded date gone

    def test_query_respects_days_back_param(self, mocker):
        """Verify days_back parameter controls the date window."""
        mock_search = self._patch_serp(mocker)

        # Sample the clock on both sides of the call so the test stays
        # correct even if the calendar day rolls over while it runs.
        before_call = datetime.now()
        SERP.query("TestCorp", days_back=30)
        after_call = datetime.now()

        tbs = mock_search.call_args[0][0]["tbs"]
        window = timedelta(days=30)
        acceptable_starts = {
            self._unpadded_date(before_call - window),
            self._unpadded_date(after_call - window),
        }
        assert any(start in tbs for start in acceptable_starts)


class TestFilesystemPDFCaching:
    """Test that save_patents skips download for existing files."""

    def test_save_patents_skips_download_when_cached(self, mocker):
        """Already-downloaded PDFs should not be re-downloaded."""
        mock_get = mocker.patch("SPARC.serp_api.requests.get")
        mocker.patch("SPARC.serp_api.os.makedirs")

        # The cached state is simulated purely through these two patches;
        # no real file is needed because the filesystem checks are mocked.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)

        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_not_called()
        assert result.pdf_path == "patents/US123.pdf"

    def test_save_patents_downloads_when_not_cached(self, mocker):
        """Missing PDFs should be downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
        # Replace open() so the test never writes to the real filesystem.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US456", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_called_once_with("http://example.com/test.pdf")
        assert result.pdf_path == "patents/US456.pdf"

    def test_save_patents_redownloads_empty_files(self, mocker):
        """Empty/corrupt PDFs (0 bytes) should be re-downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
        mocker.patch("SPARC.serp_api.os.makedirs")
        # The file "exists" but reports a size of zero bytes.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
        # Replace open() so the test never writes to the real filesystem.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US789", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_called_once()
        assert result.pdf_path == "patents/US789.pdf"