"""Tests for SERP API patent retrieval and parsing functionality.""" import pytest from SPARC.serp_api import SERP class TestTextCleaning: """Test patent text cleaning functionality.""" def test_clean_patent_text_removes_figure_references(self): """Test that figure references are removed from text.""" text = "This is a description (see FIG. 1) of the invention." cleaned = SERP.clean_patent_text(text) assert "(see FIG. 1)" not in cleaned assert "This is a description of the invention." in cleaned def test_clean_patent_text_removes_fig_labels(self): """Test that FIG labels are removed from text.""" text = "As shown in FIG. 2A the circuit operates." cleaned = SERP.clean_patent_text(text) assert "FIG. 2A" not in cleaned def test_clean_patent_text_removes_excessive_whitespace(self): """Test that excessive whitespace is normalized.""" text = "Line 1\n\n\n\n\nLine 2" cleaned = SERP.clean_patent_text(text) assert "\n\n\n\n\n" not in cleaned assert "Line 1\n\nLine 2" in cleaned def test_clean_patent_text_removes_line_numbers(self): """Test that line numbers are removed from text.""" text = "Some text\n42\nMore text" cleaned = SERP.clean_patent_text(text) # Line numbers on their own line should be removed assert cleaned.strip() != "Some text\n42\nMore text" class TestSectionExtraction: """Test patent section extraction functionality.""" def test_extract_section_finds_abstract(self): """Test extraction of abstract section.""" text = """ PATENT DOCUMENT ABSTRACT This is the abstract text describing the invention. BACKGROUND This is background information. """ result = SERP.extract_section( text, start_patterns=[r"ABSTRACT"], end_patterns=[r"BACKGROUND"], ) assert "This is the abstract text" in result assert "BACKGROUND" not in result def test_extract_section_finds_claims(self): """Test extraction of claims section.""" text = """ SUMMARY Summary text here. What is claimed is: 1. A method comprising steps A and B. 2. The method of claim 1, further comprising step C. ABSTRACT Abstract text. """ result = SERP.extract_section( text, start_patterns=[r"What is claimed is:"], end_patterns=[r"ABSTRACT"], ) assert "1. A method comprising" in result assert "2. The method of claim 1" in result assert "ABSTRACT" not in result def test_extract_section_returns_empty_when_not_found(self): """Test that empty string is returned when section not found.""" text = "This text has no matching patterns." result = SERP.extract_section( text, start_patterns=[r"ABSTRACT"], end_patterns=[r"BACKGROUND"] ) assert result == "" def test_extract_section_handles_case_insensitive(self): """Test that section extraction is case insensitive.""" text = """ abstract This is the abstract in lowercase. background Background text. """ result = SERP.extract_section( text, start_patterns=[r"ABSTRACT"], end_patterns=[r"BACKGROUND"], ) assert "This is the abstract in lowercase" in result class TestPatentMinimization: """Test patent content minimization for LLM consumption.""" def test_minimize_includes_all_essential_sections(self): """Test that all essential sections are included in minimized output.""" sections = { "abstract": "This is the abstract.", "claims": "1. A method for doing X.", "summary": "This invention relates to X.", "description": "Very long detailed description...", } result = SERP.minimize_patent_for_llm(sections) assert "ABSTRACT:" in result assert "This is the abstract." in result assert "CLAIMS:" in result assert "1. A method for doing X." in result assert "SUMMARY:" in result assert "This invention relates to X." in result def test_minimize_excludes_description(self): """Test that detailed description is excluded from minimized output.""" sections = { "abstract": "This is the abstract.", "claims": "1. A method for doing X.", "summary": "This invention relates to X.", "description": "Very long detailed description that should be excluded.", } result = SERP.minimize_patent_for_llm(sections) assert "Very long detailed description" not in result assert "DESCRIPTION:" not in result def test_minimize_handles_missing_sections(self): """Test that minimization handles missing sections gracefully.""" sections = { "abstract": "This is the abstract.", # claims missing # summary missing "description": "Description text.", } result = SERP.minimize_patent_for_llm(sections) assert "ABSTRACT:" in result assert "This is the abstract." in result # Should not error on missing sections assert isinstance(result, str) def test_minimize_with_empty_sections(self): """Test that empty sections are handled properly.""" sections = { "abstract": "", "claims": "1. A method.", "summary": "", } result = SERP.minimize_patent_for_llm(sections) # Empty sections should not appear assert result.count("CLAIMS:") == 1 assert "1. A method." in result def test_minimize_separates_sections_with_double_newline(self): """Test that sections are properly separated.""" sections = { "abstract": "Abstract text.", "claims": "Claims text.", "summary": "Summary text.", } result = SERP.minimize_patent_for_llm(sections) # Sections should be separated by double newlines assert "\n\n" in result