# SPARC/tests/test_serp_api.py

"""Tests for SERP API patent retrieval and parsing functionality."""

import pytest
from SPARC.serp_api import SERP


class TestTextCleaning:
    """Test patent text cleaning functionality.

    Each test passes a short literal string to ``SERP.clean_patent_text`` and
    inspects the returned text for the expected removal or normalization.
    """

    def test_clean_patent_text_removes_figure_references(self):
        """Test that parenthesized figure references are removed from text."""
        text = "This is a description (see FIG. 1) of the invention."
        cleaned = SERP.clean_patent_text(text)
        assert "(see FIG. 1)" not in cleaned
        # Only the reference itself is expected to disappear; the spaces that
        # surrounded it stay, hence the double space in the expected text.
        assert "This is a description  of the invention." in cleaned

    def test_clean_patent_text_removes_fig_labels(self):
        """Test that bare FIG labels (outside parentheses) are removed."""
        text = "As shown in FIG. 2A the circuit operates."
        cleaned = SERP.clean_patent_text(text)
        assert "FIG. 2A" not in cleaned

    def test_clean_patent_text_removes_excessive_whitespace(self):
        """Test that a run of five newlines collapses to one blank line."""
        text = "Line 1\n\n\n\n\nLine 2"
        cleaned = SERP.clean_patent_text(text)
        assert "\n\n\n\n\n" not in cleaned
        assert "Line 1\n\nLine 2" in cleaned

    def test_clean_patent_text_removes_line_numbers(self):
        """Test that a number standing alone on a line is removed from text."""
        text = "Some text\n42\nMore text"
        cleaned = SERP.clean_patent_text(text)
        # Line numbers on their own line should be removed. Look for the
        # number-only line directly: merely comparing the output against the
        # input would also pass if cleaning changed something unrelated.
        remaining_lines = [line.strip() for line in cleaned.splitlines()]
        assert "42" not in remaining_lines
        # The prose around the line number must survive the cleanup.
        assert "Some text" in cleaned
        assert "More text" in cleaned


class TestSectionExtraction:
    """Exercise ``SERP.extract_section`` on small inline patent excerpts."""

    def test_extract_section_finds_abstract(self):
        """Test that the abstract body is returned without the end heading."""
        document = """
        PATENT DOCUMENT

        ABSTRACT
        This is the abstract text describing the invention.

        BACKGROUND
        This is background information.
        """
        opening = [r"ABSTRACT"]
        closing = [r"BACKGROUND"]

        section = SERP.extract_section(
            document, start_patterns=opening, end_patterns=closing
        )

        assert "This is the abstract text" in section
        assert "BACKGROUND" not in section

    def test_extract_section_finds_claims(self):
        """Test that both numbered claims are returned, minus the end heading."""
        document = """
        SUMMARY
        Summary text here.

        What is claimed is:
        1. A method comprising steps A and B.
        2. The method of claim 1, further comprising step C.

        ABSTRACT
        Abstract text.
        """
        opening = [r"What is claimed is:"]
        closing = [r"ABSTRACT"]

        section = SERP.extract_section(
            document, start_patterns=opening, end_patterns=closing
        )

        assert "1. A method comprising" in section
        assert "2. The method of claim 1" in section
        assert "ABSTRACT" not in section

    def test_extract_section_returns_empty_when_not_found(self):
        """Test that text matching no start pattern yields an empty string."""
        document = "This text has no matching patterns."
        opening = [r"ABSTRACT"]
        closing = [r"BACKGROUND"]

        section = SERP.extract_section(
            document, start_patterns=opening, end_patterns=closing
        )

        assert section == ""

    def test_extract_section_handles_case_insensitive(self):
        """Test that upper-case patterns locate lower-case headings."""
        document = """
        abstract
        This is the abstract in lowercase.

        background
        Background text.
        """
        opening = [r"ABSTRACT"]
        closing = [r"BACKGROUND"]

        section = SERP.extract_section(
            document, start_patterns=opening, end_patterns=closing
        )

        assert "This is the abstract in lowercase" in section