SPARC/tests/test_serp_api.py
0xWheatyz 26a23c02ae feat: add patent content minimization for LLM consumption
Implemented minimize_patent_for_llm() function that reduces patent
content by keeping only essential sections (abstract, claims, summary)
and explicitly excludes the verbose detailed description section.

This reduces token usage while preserving core innovation details
needed for company performance estimation.

Added comprehensive test coverage (5 new tests) for:
- Essential section inclusion
- Description section exclusion
- Missing section handling
- Empty section handling
- Section separator formatting

All 13 tests passing.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-19 18:54:07 -05:00

179 lines
6.2 KiB
Python

"""Tests for SERP API patent retrieval and parsing functionality."""
import pytest
from SPARC.serp_api import SERP
class TestTextCleaning:
    """Tests for ``SERP.clean_patent_text``.

    Each test passes a short literal string through the cleaner and checks
    the returned text for the fragment that is expected to be removed or
    normalized.
    """

    def test_clean_patent_text_removes_figure_references(self):
        """Test that parenthesised figure references are removed from text."""
        text = "This is a description (see FIG. 1) of the invention."
        cleaned = SERP.clean_patent_text(text)
        assert "(see FIG. 1)" not in cleaned
        # Matching the whole sentence also checks that no double space is
        # left behind where the reference used to be.
        assert "This is a description of the invention." in cleaned

    def test_clean_patent_text_removes_fig_labels(self):
        """Test that bare FIG labels are removed from text."""
        text = "As shown in FIG. 2A the circuit operates."
        cleaned = SERP.clean_patent_text(text)
        assert "FIG. 2A" not in cleaned

    def test_clean_patent_text_removes_excessive_whitespace(self):
        """Test that a run of blank lines is collapsed to a single blank line."""
        text = "Line 1\n\n\n\n\nLine 2"
        cleaned = SERP.clean_patent_text(text)
        assert "\n\n\n\n\n" not in cleaned
        assert "Line 1\n\nLine 2" in cleaned

    def test_clean_patent_text_removes_line_numbers(self):
        """Test that a number standing alone on a line is removed from text."""
        text = "Some text\n42\nMore text"
        cleaned = SERP.clean_patent_text(text)
        # Check for the number itself: merely comparing the output with the
        # input would pass for any modification, even one that kept "42".
        assert "42" not in cleaned
        # The lines around the removed number must be preserved.
        assert "Some text" in cleaned
        assert "More text" in cleaned
class TestSectionExtraction:
    """Exercise ``SERP.extract_section`` on small synthetic patent texts."""

    def test_extract_section_finds_abstract(self):
        """The abstract body is returned and the end heading is left out."""
        document = (
            "\n"
            "PATENT DOCUMENT\n"
            "ABSTRACT\n"
            "This is the abstract text describing the invention.\n"
            "BACKGROUND\n"
            "This is background information.\n"
        )
        start, end = [r"ABSTRACT"], [r"BACKGROUND"]

        extracted = SERP.extract_section(
            document, start_patterns=start, end_patterns=end
        )

        assert "This is the abstract text" in extracted
        assert "BACKGROUND" not in extracted

    def test_extract_section_finds_claims(self):
        """Both numbered claims are returned and the end heading is left out."""
        document = (
            "\n"
            "SUMMARY\n"
            "Summary text here.\n"
            "What is claimed is:\n"
            "1. A method comprising steps A and B.\n"
            "2. The method of claim 1, further comprising step C.\n"
            "ABSTRACT\n"
            "Abstract text.\n"
        )
        start, end = [r"What is claimed is:"], [r"ABSTRACT"]

        extracted = SERP.extract_section(
            document, start_patterns=start, end_patterns=end
        )

        assert "1. A method comprising" in extracted
        assert "2. The method of claim 1" in extracted
        assert "ABSTRACT" not in extracted

    def test_extract_section_returns_empty_when_not_found(self):
        """An empty string comes back when no start pattern matches."""
        document = "This text has no matching patterns."
        start, end = [r"ABSTRACT"], [r"BACKGROUND"]

        extracted = SERP.extract_section(
            document, start_patterns=start, end_patterns=end
        )

        assert extracted == ""

    def test_extract_section_handles_case_insensitive(self):
        """Upper-case patterns still locate lower-case headings."""
        document = (
            "\n"
            "abstract\n"
            "This is the abstract in lowercase.\n"
            "background\n"
            "Background text.\n"
        )
        start, end = [r"ABSTRACT"], [r"BACKGROUND"]

        extracted = SERP.extract_section(
            document, start_patterns=start, end_patterns=end
        )

        assert "This is the abstract in lowercase" in extracted
class TestPatentMinimization:
    """Tests for ``SERP.minimize_patent_for_llm``.

    Every test builds a ``sections`` dict keyed by lower-case section name
    ("abstract", "claims", "summary", "description") and inspects the single
    string returned by the minimizer.
    """

    def test_minimize_includes_all_essential_sections(self):
        """Test that all essential sections are included in minimized output."""
        sections = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description...",
        }
        result = SERP.minimize_patent_for_llm(sections)
        # Each essential section must contribute both its header and its body.
        assert "ABSTRACT:" in result
        assert "This is the abstract." in result
        assert "CLAIMS:" in result
        assert "1. A method for doing X." in result
        assert "SUMMARY:" in result
        assert "This invention relates to X." in result

    def test_minimize_excludes_description(self):
        """Test that detailed description is excluded from minimized output."""
        sections = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description that should be excluded.",
        }
        result = SERP.minimize_patent_for_llm(sections)
        # Neither the description body nor a header for it may leak through.
        assert "Very long detailed description" not in result
        assert "DESCRIPTION:" not in result

    def test_minimize_handles_missing_sections(self):
        """Test that minimization handles missing sections gracefully."""
        sections = {
            "abstract": "This is the abstract.",
            # claims missing
            # summary missing
            "description": "Description text.",
        }
        result = SERP.minimize_patent_for_llm(sections)
        assert "ABSTRACT:" in result
        assert "This is the abstract." in result
        # Should not error on missing sections
        assert isinstance(result, str)

    def test_minimize_with_empty_sections(self):
        """Test that empty sections are omitted while non-empty ones remain."""
        sections = {
            "abstract": "",
            "claims": "1. A method.",
            "summary": "",
        }
        result = SERP.minimize_patent_for_llm(sections)
        # Empty sections should not appear: assert that directly, because
        # checking only the CLAIMS section says nothing about the empty ones.
        assert "ABSTRACT:" not in result
        assert "SUMMARY:" not in result
        # The one populated section appears exactly once, with its body.
        assert result.count("CLAIMS:") == 1
        assert "1. A method." in result

    def test_minimize_separates_sections_with_double_newline(self):
        """Test that sections are properly separated."""
        sections = {
            "abstract": "Abstract text.",
            "claims": "Claims text.",
            "summary": "Summary text.",
        }
        result = SERP.minimize_patent_for_llm(sections)
        # Sections should be separated by double newlines
        assert "\n\n" in result