# SPARC/tests/test_serp_api.py

"""Tests for SERP API patent retrieval and parsing functionality."""

import pytest
from SPARC.serp_api import SERP


class TestTextCleaning:
    """Test patent text cleaning functionality.

    Each test passes a short literal string through
    ``SERP.clean_patent_text`` and checks that one kind of noise (figure
    references, runs of blank lines, stand-alone numeric lines) is absent
    from the returned text.
    """

    def test_clean_patent_text_removes_figure_references(self):
        """Test that parenthesised figure references are removed from text."""
        text = "This is a description (see FIG. 1) of the invention."
        cleaned = SERP.clean_patent_text(text)
        assert "(see FIG. 1)" not in cleaned
        # The expected string contains a double space: the spaces on either
        # side of the removed reference are both expected to survive.
        assert "This is a description  of the invention." in cleaned

    def test_clean_patent_text_removes_fig_labels(self):
        """Test that bare FIG labels are removed from text."""
        text = "As shown in FIG. 2A the circuit operates."
        cleaned = SERP.clean_patent_text(text)
        assert "FIG. 2A" not in cleaned

    def test_clean_patent_text_removes_excessive_whitespace(self):
        """Test that a run of blank lines is collapsed to a single blank line."""
        text = "Line 1\n\n\n\n\nLine 2"
        cleaned = SERP.clean_patent_text(text)
        assert "\n\n\n\n\n" not in cleaned
        assert "Line 1\n\nLine 2" in cleaned

    def test_clean_patent_text_removes_line_numbers(self):
        """Test that a number standing alone on a line is removed."""
        text = "Some text\n42\nMore text"
        cleaned = SERP.clean_patent_text(text)
        # Check the number itself is gone rather than merely that the output
        # differs from the input; a bare inequality would also pass if the
        # cleaner changed something unrelated and left "42" in place.
        assert "42" not in cleaned
        # The surrounding content must survive the cleanup.
        assert "Some text" in cleaned
        assert "More text" in cleaned


class TestSectionExtraction:
    """Exercise ``SERP.extract_section`` on small hand-written documents."""

    def test_extract_section_finds_abstract(self):
        """The abstract body is extracted and the next heading is left out."""
        start = [r"ABSTRACT"]
        stop = [r"BACKGROUND"]
        document = """
        PATENT DOCUMENT

        ABSTRACT
        This is the abstract text describing the invention.

        BACKGROUND
        This is background information.
        """
        extracted = SERP.extract_section(
            document, start_patterns=start, end_patterns=stop
        )
        assert "This is the abstract text" in extracted
        assert "BACKGROUND" not in extracted

    def test_extract_section_finds_claims(self):
        """Both numbered claims are extracted; the trailing heading is not."""
        start = [r"What is claimed is:"]
        stop = [r"ABSTRACT"]
        document = """
        SUMMARY
        Summary text here.

        What is claimed is:
        1. A method comprising steps A and B.
        2. The method of claim 1, further comprising step C.

        ABSTRACT
        Abstract text.
        """
        extracted = SERP.extract_section(
            document, start_patterns=start, end_patterns=stop
        )
        assert "1. A method comprising" in extracted
        assert "2. The method of claim 1" in extracted
        assert "ABSTRACT" not in extracted

    def test_extract_section_returns_empty_when_not_found(self):
        """A document without the start heading produces an empty string."""
        start = [r"ABSTRACT"]
        stop = [r"BACKGROUND"]
        document = "This text has no matching patterns."
        extracted = SERP.extract_section(
            document, start_patterns=start, end_patterns=stop
        )
        assert extracted == ""

    def test_extract_section_handles_case_insensitive(self):
        """Upper-case patterns still match headings written in lower case."""
        start = [r"ABSTRACT"]
        stop = [r"BACKGROUND"]
        document = """
        abstract
        This is the abstract in lowercase.

        background
        Background text.
        """
        extracted = SERP.extract_section(
            document, start_patterns=start, end_patterns=stop
        )
        assert "This is the abstract in lowercase" in extracted


class TestPatentMinimization:
    """Test patent content minimization for LLM consumption.

    Each test builds a ``sections`` dict keyed by section name
    (``"abstract"``, ``"claims"``, ``"summary"``, ``"description"``), passes
    it to ``SERP.minimize_patent_for_llm`` and inspects the returned string
    for the expected headers and section text.
    """

    def test_minimize_includes_all_essential_sections(self):
        """Test that all essential sections are included in minimized output."""
        sections = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description...",
        }
        result = SERP.minimize_patent_for_llm(sections)

        assert "ABSTRACT:" in result
        assert "This is the abstract." in result
        assert "CLAIMS:" in result
        assert "1. A method for doing X." in result
        assert "SUMMARY:" in result
        assert "This invention relates to X." in result

    def test_minimize_excludes_description(self):
        """Test that detailed description is excluded from minimized output."""
        sections = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description that should be excluded.",
        }
        result = SERP.minimize_patent_for_llm(sections)

        # Neither the description body nor a header for it may appear.
        assert "Very long detailed description" not in result
        assert "DESCRIPTION:" not in result

    def test_minimize_handles_missing_sections(self):
        """Test that minimization handles missing sections gracefully."""
        sections = {
            "abstract": "This is the abstract.",
            # claims missing
            # summary missing
            "description": "Description text.",
        }
        # The call itself must not raise even though two keys are absent.
        result = SERP.minimize_patent_for_llm(sections)

        assert "ABSTRACT:" in result
        assert "This is the abstract." in result
        # Should not error on missing sections
        assert isinstance(result, str)

    def test_minimize_with_empty_sections(self):
        """Test that empty sections are omitted while non-empty ones remain."""
        sections = {
            "abstract": "",
            "claims": "1. A method.",
            "summary": "",
        }
        result = SERP.minimize_patent_for_llm(sections)

        # Empty sections should not appear: assert the headers are absent,
        # not only that the populated section is present.
        assert "ABSTRACT:" not in result
        assert "SUMMARY:" not in result
        # The one populated section appears exactly once, with its text.
        assert result.count("CLAIMS:") == 1
        assert "1. A method." in result

    def test_minimize_separates_sections_with_double_newline(self):
        """Test that sections are properly separated."""
        sections = {
            "abstract": "Abstract text.",
            "claims": "Claims text.",
            "summary": "Summary text.",
        }
        result = SERP.minimize_patent_for_llm(sections)

        # Sections should be separated by double newlines
        assert "\n\n" in result