forked from 0xWheatyz/SPARC
fbb72fe2a5
- Add test job to build.yaml that runs pytest and ruff before building images - Add standalone test.yaml workflow for PRs - Add ruff.toml with E/F/I rules configured - Fix all ruff lint errors: sort imports, remove unused imports, fix re-exports - Build jobs now depend on test job passing (needs: test) Closes leeworks-agents/SPARC#18 Closes leeworks-agents/SPARC#19 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
268 lines
10 KiB
Python
268 lines
10 KiB
Python
"""Tests for SERP API patent retrieval and parsing functionality."""
|
|
|
|
from datetime import datetime, timedelta
|
|
from unittest.mock import Mock
|
|
|
|
from SPARC.serp_api import SERP
|
|
from SPARC.types import Patent
|
|
|
|
|
|
class TestTextCleaning:
    """Test patent text cleaning functionality (``SERP.clean_patent_text``)."""

    def test_clean_patent_text_removes_figure_references(self):
        """A parenthesised ``(see FIG. N)`` reference is removed from the sentence."""
        text = "This is a description (see FIG. 1) of the invention."
        cleaned = SERP.clean_patent_text(text)
        assert "(see FIG. 1)" not in cleaned
        assert "This is a description of the invention." in cleaned

    def test_clean_patent_text_removes_fig_labels(self):
        """A bare ``FIG. 2A`` label is removed from running text."""
        text = "As shown in FIG. 2A the circuit operates."
        cleaned = SERP.clean_patent_text(text)
        assert "FIG. 2A" not in cleaned

    def test_clean_patent_text_removes_excessive_whitespace(self):
        """A run of five newlines is collapsed to a single blank line."""
        text = "Line 1\n\n\n\n\nLine 2"
        cleaned = SERP.clean_patent_text(text)
        assert "\n\n\n\n\n" not in cleaned
        assert "Line 1\n\nLine 2" in cleaned

    def test_clean_patent_text_removes_line_numbers(self):
        """A number standing alone on its own line is removed; real text stays."""
        text = "Some text\n42\nMore text"
        cleaned = SERP.clean_patent_text(text)

        # Original (weak) check: the cleaner changed *something*.
        assert cleaned.strip() != "Some text\n42\nMore text"

        # Line numbers on their own line should be removed. Compare stripped
        # lines so the check does not depend on how the cleaner pads the gap.
        remaining_lines = [line.strip() for line in cleaned.splitlines()]
        assert "42" not in remaining_lines

        # The surrounding prose must survive the cleanup untouched.
        assert "Some text" in cleaned
        assert "More text" in cleaned
class TestSectionExtraction:
    """Checks for ``SERP.extract_section`` using small hand-written patent snippets."""

    def test_extract_section_finds_abstract(self):
        """The abstract body is extracted and the BACKGROUND heading is left out."""
        document = """
        PATENT DOCUMENT

        ABSTRACT
        This is the abstract text describing the invention.

        BACKGROUND
        This is background information.
        """
        begin_markers = [r"ABSTRACT"]
        stop_markers = [r"BACKGROUND"]

        section = SERP.extract_section(
            document, start_patterns=begin_markers, end_patterns=stop_markers
        )

        assert "This is the abstract text" in section
        assert "BACKGROUND" not in section

    def test_extract_section_finds_claims(self):
        """Both numbered claims are extracted and the trailing ABSTRACT heading is not."""
        document = """
        SUMMARY
        Summary text here.

        What is claimed is:
        1. A method comprising steps A and B.
        2. The method of claim 1, further comprising step C.

        ABSTRACT
        Abstract text.
        """
        begin_markers = [r"What is claimed is:"]
        stop_markers = [r"ABSTRACT"]

        section = SERP.extract_section(
            document, start_patterns=begin_markers, end_patterns=stop_markers
        )

        for claim_fragment in ("1. A method comprising", "2. The method of claim 1"):
            assert claim_fragment in section
        assert "ABSTRACT" not in section

    def test_extract_section_returns_empty_when_not_found(self):
        """With no start pattern present in the text, the result is an empty string."""
        document = "This text has no matching patterns."

        section = SERP.extract_section(
            document,
            start_patterns=[r"ABSTRACT"],
            end_patterns=[r"BACKGROUND"],
        )

        assert section == ""

    def test_extract_section_handles_case_insensitive(self):
        """Upper-case patterns still locate lower-case headings in the text."""
        document = """
        abstract
        This is the abstract in lowercase.

        background
        Background text.
        """
        begin_markers = [r"ABSTRACT"]
        stop_markers = [r"BACKGROUND"]

        section = SERP.extract_section(
            document, start_patterns=begin_markers, end_patterns=stop_markers
        )

        assert "This is the abstract in lowercase" in section
class TestPatentMinimization:
    """Checks for ``SERP.minimize_patent_for_llm``, which condenses patent sections."""

    def test_minimize_includes_all_essential_sections(self):
        """Abstract, claims and summary appear with their headings and their text."""
        patent_parts = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description...",
        }

        minimized = SERP.minimize_patent_for_llm(patent_parts)

        expected_fragments = (
            "ABSTRACT:",
            "This is the abstract.",
            "CLAIMS:",
            "1. A method for doing X.",
            "SUMMARY:",
            "This invention relates to X.",
        )
        for fragment in expected_fragments:
            assert fragment in minimized

    def test_minimize_excludes_description(self):
        """Neither the description text nor a DESCRIPTION heading reaches the output."""
        patent_parts = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description that should be excluded.",
        }

        minimized = SERP.minimize_patent_for_llm(patent_parts)

        for unwanted in ("Very long detailed description", "DESCRIPTION:"):
            assert unwanted not in minimized

    def test_minimize_handles_missing_sections(self):
        """Input lacking the claims and summary keys still yields a string result."""
        patent_parts = {
            "abstract": "This is the abstract.",
            # claims missing
            # summary missing
            "description": "Description text.",
        }

        minimized = SERP.minimize_patent_for_llm(patent_parts)

        assert "ABSTRACT:" in minimized
        assert "This is the abstract." in minimized
        # Reaching this point means the missing keys did not raise.
        assert isinstance(minimized, str)

    def test_minimize_with_empty_sections(self):
        """With empty abstract and summary, the claims heading appears exactly once."""
        patent_parts = {
            "abstract": "",
            "claims": "1. A method.",
            "summary": "",
        }

        minimized = SERP.minimize_patent_for_llm(patent_parts)

        # Empty sections should not appear
        claims_headings = minimized.count("CLAIMS:")
        assert claims_headings == 1
        assert "1. A method." in minimized

    def test_minimize_separates_sections_with_double_newline(self):
        """The combined output contains a blank-line separator."""
        patent_parts = {
            "abstract": "Abstract text.",
            "claims": "Claims text.",
            "summary": "Summary text.",
        }

        minimized = SERP.minimize_patent_for_llm(patent_parts)

        # Sections should be separated by double newlines
        separator = "\n\n"
        assert separator in minimized
class TestDynamicDateRange:
    """Test dynamic date range computation in SERP.query."""

    @staticmethod
    def _patch_serp(mocker):
        """Stub the SerpApi search call and the config values; return the search mock.

        The search mock returns an empty ``organic_results`` list, and the
        config is patched with a fake API key and a 90-day default window.
        """
        search = mocker.patch("SPARC.serp_api.serpapi.search")
        search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)
        return search

    @staticmethod
    def _format_date(moment):
        """Render *moment* as ``M/D/YYYY`` with no zero padding.

        Built by hand instead of ``strftime("%-m/%-d/%Y")``: the ``-`` flag is
        a platform extension and is rejected on platforms that lack it
        (e.g. Windows).
        """
        return f"{moment.month}/{moment.day}/{moment.year}"

    def test_query_uses_rolling_date_window(self, mocker):
        """Verify the date filter uses a rolling window, not hardcoded dates."""
        mock_search = self._patch_serp(mocker)

        SERP.query("TestCorp")

        # The first positional argument of the search call is the params dict.
        call_params = mock_search.call_args[0][0]
        tbs = call_params["tbs"]
        # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one
        assert "cdr:1,cd_min:" in tbs
        assert "10/28/2025" not in tbs  # old hardcoded date gone

    def test_query_respects_days_back_param(self, mocker):
        """Verify days_back parameter controls the date window."""
        mock_search = self._patch_serp(mocker)

        # Sample the clock on both sides of the call so the test cannot fail
        # when the calendar date rolls over while the query is running.
        before = datetime.now()
        SERP.query("TestCorp", days_back=30)
        after = datetime.now()

        call_params = mock_search.call_args[0][0]
        tbs = call_params["tbs"]

        window = timedelta(days=30)
        acceptable_starts = {
            self._format_date(before - window),
            self._format_date(after - window),
        }
        assert any(start in tbs for start in acceptable_starts)
class TestFilesystemPDFCaching:
    """Test that save_patents skips download for existing files.

    The filesystem is simulated entirely through patched ``os.path.exists`` /
    ``os.path.getsize`` / ``os.makedirs`` and, where a write is expected, a
    patched ``builtins.open``, so no real files are touched.
    """

    # URL shared by every Patent fixture in this class.
    PDF_URL = "http://example.com/test.pdf"

    def test_save_patents_skips_download_when_cached(self, mocker):
        """Already-downloaded PDFs should not be re-downloaded."""
        mock_get = mocker.patch("SPARC.serp_api.requests.get")
        mocker.patch("SPARC.serp_api.os.makedirs")

        # Simulate an existing, non-empty file on disk. The original test also
        # wrote a real file under tmp_path, but the patches below make the
        # code under test unable to observe it, so it has been dropped.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)

        patent = Patent(patent_id="US123", pdf_link=self.PDF_URL)
        result = SERP.save_patents(patent)

        mock_get.assert_not_called()
        assert result.pdf_path == "patents/US123.pdf"

    def test_save_patents_downloads_when_not_cached(self, mocker):
        """Missing PDFs should be downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
        # Intercept file writes so nothing lands on the real filesystem.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US456", pdf_link=self.PDF_URL)
        result = SERP.save_patents(patent)

        mock_get.assert_called_once_with("http://example.com/test.pdf")
        assert result.pdf_path == "patents/US456.pdf"

    def test_save_patents_redownloads_empty_files(self, mocker):
        """Empty/corrupt PDFs (0 bytes) should be re-downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
        mocker.patch("SPARC.serp_api.os.makedirs")
        # The file exists but reports zero bytes.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
        # Intercept file writes so nothing lands on the real filesystem.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US789", pdf_link=self.PDF_URL)
        result = SERP.save_patents(patent)

        mock_get.assert_called_once()
        assert result.pdf_path == "patents/US789.pdf"