Files
SPARC/tests/test_serp_api.py
T
agent-company fbb72fe2a5 ci: add pytest and ruff linting to CI, fix all lint errors
- Add test job to build.yaml that runs pytest and ruff before building images
- Add standalone test.yaml workflow for PRs
- Add ruff.toml with E/F/I rules configured
- Fix all ruff lint errors: sort imports, remove unused imports, fix re-exports
- Build jobs now depend on test job passing (needs: test)

Closes leeworks-agents/SPARC#18
Closes leeworks-agents/SPARC#19

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 07:04:00 +00:00

268 lines
10 KiB
Python

"""Tests for SERP API patent retrieval and parsing functionality."""
from datetime import datetime, timedelta
from unittest.mock import Mock
from SPARC.serp_api import SERP
from SPARC.types import Patent
class TestTextCleaning:
    """Checks for ``SERP.clean_patent_text`` on short, hand-written inputs."""

    def test_clean_patent_text_removes_figure_references(self):
        """A parenthesised figure reference is dropped from the sentence."""
        raw = "This is a description (see FIG. 1) of the invention."
        result = SERP.clean_patent_text(raw)

        assert "(see FIG. 1)" not in result
        # The surrounding words must close up without leaving a double space.
        assert "This is a description of the invention." in result

    def test_clean_patent_text_removes_fig_labels(self):
        """A bare figure label inside running text is dropped."""
        result = SERP.clean_patent_text(
            "As shown in FIG. 2A the circuit operates."
        )

        assert "FIG. 2A" not in result

    def test_clean_patent_text_removes_excessive_whitespace(self):
        """A run of five newlines is reduced to a single blank line."""
        raw = "Line 1\n\n\n\n\nLine 2"
        result = SERP.clean_patent_text(raw)

        assert "\n\n\n\n\n" not in result
        assert "Line 1\n\nLine 2" in result

    def test_clean_patent_text_removes_line_numbers(self):
        """Input with a number alone on its own line must come back altered."""
        raw = "Some text\n42\nMore text"
        result = SERP.clean_patent_text(raw)

        # This only verifies that cleaning changed the text; it does not pin
        # the exact shape of the cleaned output.
        assert result.strip() != "Some text\n42\nMore text"
class TestSectionExtraction:
    """Checks for ``SERP.extract_section`` using start/end heading patterns."""

    def test_extract_section_finds_abstract(self):
        """The text between ABSTRACT and BACKGROUND is returned."""
        document = (
            "\n"
            "PATENT DOCUMENT\n"
            "ABSTRACT\n"
            "This is the abstract text describing the invention.\n"
            "BACKGROUND\n"
            "This is background information.\n"
        )

        section = SERP.extract_section(
            document,
            start_patterns=[r"ABSTRACT"],
            end_patterns=[r"BACKGROUND"],
        )

        assert "This is the abstract text" in section
        # The end heading itself must not leak into the extracted text.
        assert "BACKGROUND" not in section

    def test_extract_section_finds_claims(self):
        """Numbered claims are returned, stopping before the next heading."""
        document = (
            "\n"
            "SUMMARY\n"
            "Summary text here.\n"
            "What is claimed is:\n"
            "1. A method comprising steps A and B.\n"
            "2. The method of claim 1, further comprising step C.\n"
            "ABSTRACT\n"
            "Abstract text.\n"
        )

        section = SERP.extract_section(
            document,
            start_patterns=[r"What is claimed is:"],
            end_patterns=[r"ABSTRACT"],
        )

        assert "1. A method comprising" in section
        assert "2. The method of claim 1" in section
        assert "ABSTRACT" not in section

    def test_extract_section_returns_empty_when_not_found(self):
        """With no matching start heading the result is an empty string."""
        section = SERP.extract_section(
            "This text has no matching patterns.",
            start_patterns=[r"ABSTRACT"],
            end_patterns=[r"BACKGROUND"],
        )

        assert section == ""

    def test_extract_section_handles_case_insensitive(self):
        """Upper-case patterns still match lower-case headings."""
        document = (
            "\n"
            "abstract\n"
            "This is the abstract in lowercase.\n"
            "background\n"
            "Background text.\n"
        )

        section = SERP.extract_section(
            document,
            start_patterns=[r"ABSTRACT"],
            end_patterns=[r"BACKGROUND"],
        )

        assert "This is the abstract in lowercase" in section
class TestPatentMinimization:
    """Checks for ``SERP.minimize_patent_for_llm`` on section dictionaries."""

    def test_minimize_includes_all_essential_sections(self):
        """Abstract, claims and summary appear with their headings."""
        minimized = SERP.minimize_patent_for_llm(
            dict(
                abstract="This is the abstract.",
                claims="1. A method for doing X.",
                summary="This invention relates to X.",
                description="Very long detailed description...",
            )
        )

        assert "ABSTRACT:" in minimized
        assert "This is the abstract." in minimized
        assert "CLAIMS:" in minimized
        assert "1. A method for doing X." in minimized
        assert "SUMMARY:" in minimized
        assert "This invention relates to X." in minimized

    def test_minimize_excludes_description(self):
        """Neither the description text nor its heading is emitted."""
        minimized = SERP.minimize_patent_for_llm(
            dict(
                abstract="This is the abstract.",
                claims="1. A method for doing X.",
                summary="This invention relates to X.",
                description="Very long detailed description that should be excluded.",
            )
        )

        assert "Very long detailed description" not in minimized
        assert "DESCRIPTION:" not in minimized

    def test_minimize_handles_missing_sections(self):
        """Absent ``claims`` and ``summary`` keys do not raise."""
        # Only abstract and description are supplied on purpose.
        partial = dict(
            abstract="This is the abstract.",
            description="Description text.",
        )

        minimized = SERP.minimize_patent_for_llm(partial)

        assert "ABSTRACT:" in minimized
        assert "This is the abstract." in minimized
        # Reaching this point already shows the missing keys were tolerated.
        assert isinstance(minimized, str)

    def test_minimize_with_empty_sections(self):
        """A non-empty claims section is emitted once next to empty siblings."""
        mostly_empty = dict(abstract="", claims="1. A method.", summary="")

        minimized = SERP.minimize_patent_for_llm(mostly_empty)

        # Only the populated section is checked here; the empty ones are not
        # asserted on.
        assert minimized.count("CLAIMS:") == 1
        assert "1. A method." in minimized

    def test_minimize_separates_sections_with_double_newline(self):
        """The output contains a blank line when several sections are present."""
        populated = dict(
            abstract="Abstract text.",
            claims="Claims text.",
            summary="Summary text.",
        )

        minimized = SERP.minimize_patent_for_llm(populated)

        assert "\n\n" in minimized
class TestDynamicDateRange:
    """Test dynamic date range computation in SERP.query."""

    @staticmethod
    def _format_date(moment):
        """Return *moment* as an un-padded ``M/D/YYYY`` string.

        Built from the date attributes instead of ``strftime("%-m/%-d/%Y")``
        because the ``%-m`` / ``%-d`` flags are a platform extension and are
        not available everywhere (notably on Windows).
        """
        return f"{moment.month}/{moment.day}/{moment.year}"

    def test_query_uses_rolling_date_window(self, mocker):
        """Verify the date filter uses a rolling window, not hardcoded dates."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        SERP.query("TestCorp")

        # The search parameters are passed as the first positional argument.
        call_params = mock_search.call_args[0][0]
        tbs = call_params["tbs"]
        # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one
        assert "cdr:1,cd_min:" in tbs
        assert "10/28/2025" not in tbs  # old hardcoded date gone

    def test_query_respects_days_back_param(self, mocker):
        """Verify days_back parameter controls the date window."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        window = timedelta(days=30)
        # Sample the clock on both sides of the call so that a midnight
        # rollover during the test cannot cause a spurious failure.
        before = datetime.now()
        SERP.query("TestCorp", days_back=30)
        after = datetime.now()

        call_params = mock_search.call_args[0][0]
        tbs = call_params["tbs"]
        acceptable = {
            self._format_date(before - window),
            self._format_date(after - window),
        }
        assert any(start in tbs for start in acceptable), (
            f"expected one of {sorted(acceptable)} in {tbs!r}"
        )
class TestFilesystemPDFCaching:
    """Test that save_patents skips download for existing files."""

    def test_save_patents_skips_download_when_cached(self, mocker, tmp_path):
        """Already-downloaded PDFs should not be re-downloaded."""
        mock_get = mocker.patch("SPARC.serp_api.requests.get")
        mocker.patch("SPARC.serp_api.os.makedirs")

        # NOTE(review): this on-disk file looks unused -- os.path.exists and
        # os.path.getsize are patched below and the asserted path is under
        # "patents/", not tmp_path. Confirm against save_patents before
        # removing it.
        pdf_path = tmp_path / "US123.pdf"
        pdf_path.write_bytes(b"%PDF-1.4 fake content")

        # Simulate a present, non-empty cached file.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)

        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_not_called()
        assert result.pdf_path == "patents/US123.pdf"

    def test_save_patents_downloads_when_not_cached(self, mocker):
        """Missing PDFs should be downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch(
            "SPARC.serp_api.requests.get", return_value=mock_response
        )
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
        # Patched only to keep the test from touching the real filesystem;
        # the mock itself is not inspected, so it is not bound to a name.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US456", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_called_once_with("http://example.com/test.pdf")
        assert result.pdf_path == "patents/US456.pdf"

    def test_save_patents_redownloads_empty_files(self, mocker):
        """Empty/corrupt PDFs (0 bytes) should be re-downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch(
            "SPARC.serp_api.requests.get", return_value=mock_response
        )
        mocker.patch("SPARC.serp_api.os.makedirs")
        # The file "exists" but reports zero bytes.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
        # Patched only to keep the test from touching the real filesystem;
        # the mock itself is not inspected, so it is not bound to a name.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US789", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_called_once()
        assert result.pdf_path == "patents/US789.pdf"