forked from 0xWheatyz/SPARC
6533cef56f
Added pytest and pytest-mock to requirements.txt for testing infrastructure. Created tests/ directory with comprehensive test coverage for: - Text cleaning functions (figure references, whitespace, line numbers) - Section extraction logic (abstract, claims, case sensitivity) All 8 tests passing. This provides a foundation for test-driven development as we continue building the LLM integration. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
105 lines
3.5 KiB
Python
105 lines
3.5 KiB
Python
"""Tests for SERP API patent retrieval and parsing functionality."""
|
|
|
|
import pytest
|
|
from SPARC.serp_api import SERP
|
|
|
|
|
|
class TestTextCleaning:
    """Tests for ``SERP.clean_patent_text``.

    Each test feeds a short string containing one kind of noise (figure
    references, figure labels, runs of blank lines, stray line numbers) and
    checks what the cleaned result contains.
    """

    def test_clean_patent_text_removes_figure_references(self):
        """A parenthesised ``(see FIG. n)`` reference is removed."""
        text = "This is a description (see FIG. 1) of the invention."
        cleaned = SERP.clean_patent_text(text)
        assert "(see FIG. 1)" not in cleaned
        # The sentence must also read cleanly afterwards: removing the
        # reference may not leave a doubled space behind.
        assert "This is a description of the invention." in cleaned

    def test_clean_patent_text_removes_fig_labels(self):
        """A bare ``FIG. <id>`` label inside a sentence is removed."""
        text = "As shown in FIG. 2A the circuit operates."
        cleaned = SERP.clean_patent_text(text)
        assert "FIG. 2A" not in cleaned

    def test_clean_patent_text_removes_excessive_whitespace(self):
        """A run of five newlines is collapsed to a single blank line."""
        text = "Line 1\n\n\n\n\nLine 2"
        cleaned = SERP.clean_patent_text(text)
        assert "\n\n\n\n\n" not in cleaned
        assert "Line 1\n\nLine 2" in cleaned

    def test_clean_patent_text_removes_line_numbers(self):
        """A number alone on its own line is removed; real text is kept."""
        text = "Some text\n42\nMore text"
        cleaned = SERP.clean_patent_text(text)
        # Assert on the number itself rather than "output differs from
        # input": the latter would also pass if the cleaner changed anything
        # else while leaving the stray line number in place.
        assert "42" not in cleaned
        # Removing the number must not take the neighbouring text with it.
        assert "Some text" in cleaned
        assert "More text" in cleaned
|
|
|
|
|
|
class TestSectionExtraction:
    """Tests for ``SERP.extract_section``.

    Every test passes a document plus lists of start and end patterns and
    inspects the string that comes back.
    """

    def test_extract_section_finds_abstract(self):
        """The abstract body is returned without the heading that follows it."""
        document = """
        PATENT DOCUMENT

        ABSTRACT
        This is the abstract text describing the invention.

        BACKGROUND
        This is background information.
        """
        opening = [r"ABSTRACT"]
        closing = [r"BACKGROUND"]

        section = SERP.extract_section(
            document, start_patterns=opening, end_patterns=closing
        )

        assert "This is the abstract text" in section
        assert "BACKGROUND" not in section

    def test_extract_section_finds_claims(self):
        """Both numbered claims are returned; the trailing heading is not."""
        document = """
        SUMMARY
        Summary text here.

        What is claimed is:
        1. A method comprising steps A and B.
        2. The method of claim 1, further comprising step C.

        ABSTRACT
        Abstract text.
        """
        opening = [r"What is claimed is:"]
        closing = [r"ABSTRACT"]

        section = SERP.extract_section(
            document, start_patterns=opening, end_patterns=closing
        )

        assert "1. A method comprising" in section
        assert "2. The method of claim 1" in section
        assert "ABSTRACT" not in section

    def test_extract_section_returns_empty_when_not_found(self):
        """With no start pattern present, the result is the empty string."""
        document = "This text has no matching patterns."
        opening = [r"ABSTRACT"]
        closing = [r"BACKGROUND"]

        section = SERP.extract_section(
            document, start_patterns=opening, end_patterns=closing
        )

        assert section == ""

    def test_extract_section_handles_case_insensitive(self):
        """Upper-case patterns still locate a lower-case section heading."""
        document = """
        abstract
        This is the abstract in lowercase.

        background
        Background text.
        """
        opening = [r"ABSTRACT"]
        closing = [r"BACKGROUND"]

        section = SERP.extract_section(
            document, start_patterns=opening, end_patterns=closing
        )

        assert "This is the abstract in lowercase" in section
|