test: add pytest framework and initial test suite

Added pytest and pytest-mock to requirements.txt for testing infrastructure.

Created tests/ directory with comprehensive test coverage for:
- Text cleaning functions (figure references, whitespace, line numbers)
- Section extraction logic (abstract, claims, case sensitivity)

All 8 tests passing. This provides a foundation for test-driven
development as we continue building the LLM integration.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
0xWheatyz 2026-02-19 18:52:54 -05:00
parent 58f2bdc238
commit 6533cef56f
3 changed files with 107 additions and 0 deletions

View File

@ -2,3 +2,5 @@ python-dotenv
serpapi
pdfplumber
requests
pytest
pytest-mock

1
tests/__init__.py Normal file
View File

@ -0,0 +1 @@
"""Tests for SPARC patent analysis system."""

104
tests/test_serp_api.py Normal file
View File

@ -0,0 +1,104 @@
"""Tests for SERP API patent retrieval and parsing functionality."""
import pytest
from SPARC.serp_api import SERP
class TestTextCleaning:
    """Checks for ``SERP.clean_patent_text``.

    Every test passes a short literal string to the cleaner and inspects
    the returned text for the noise that is expected to be gone.
    """

    def test_clean_patent_text_removes_figure_references(self):
        """A parenthesised "(see FIG. n)" aside must be dropped."""
        raw = "This is a description (see FIG. 1) of the invention."
        result = SERP.clean_patent_text(raw)
        assert "(see FIG. 1)" not in result
        # The sentence must read naturally once the aside is gone, i.e. the
        # words on either side are separated by a single space.
        assert "This is a description of the invention." in result

    def test_clean_patent_text_removes_fig_labels(self):
        """A bare "FIG. nX" label inside a sentence must be dropped."""
        raw = "As shown in FIG. 2A the circuit operates."
        result = SERP.clean_patent_text(raw)
        assert "FIG. 2A" not in result

    def test_clean_patent_text_removes_excessive_whitespace(self):
        """A run of five newlines must collapse to one blank line."""
        newline_run = "\n" * 5
        raw = "Line 1" + newline_run + "Line 2"
        result = SERP.clean_patent_text(raw)
        assert newline_run not in result
        assert "Line 1\n\nLine 2" in result

    def test_clean_patent_text_removes_line_numbers(self):
        """A number sitting alone on its own line must be dropped."""
        raw = "Some text\n42\nMore text"
        result = SERP.clean_patent_text(raw)
        # Cheap smoke check: the cleaner changed the input in some way.
        assert result.strip() != raw
        # "Changed somehow" is not enough on its own -- it would also hold
        # if the cleaner mangled the prose or returned nothing.  Pin the
        # actual contract: no remaining line is the bare number, and the
        # surrounding text lines are still present.
        remaining_lines = [line.strip() for line in result.splitlines()]
        assert "42" not in remaining_lines
        assert "Some text" in result
        assert "More text" in result
class TestSectionExtraction:
    """Checks for ``SERP.extract_section``.

    Each test builds a small multi-line document, asks for the text between
    a start heading and an end heading, and inspects the returned string.
    """

    def test_extract_section_finds_abstract(self):
        """The abstract body is returned without the following heading."""
        document = (
            "\n"
            "PATENT DOCUMENT\n"
            "ABSTRACT\n"
            "This is the abstract text describing the invention.\n"
            "BACKGROUND\n"
            "This is background information.\n"
        )
        start, end = [r"ABSTRACT"], [r"BACKGROUND"]
        section = SERP.extract_section(
            document, start_patterns=start, end_patterns=end
        )
        assert "This is the abstract text" in section
        assert "BACKGROUND" not in section

    def test_extract_section_finds_claims(self):
        """Both numbered claims are returned, stopping before the abstract."""
        document = (
            "\n"
            "SUMMARY\n"
            "Summary text here.\n"
            "What is claimed is:\n"
            "1. A method comprising steps A and B.\n"
            "2. The method of claim 1, further comprising step C.\n"
            "ABSTRACT\n"
            "Abstract text.\n"
        )
        start, end = [r"What is claimed is:"], [r"ABSTRACT"]
        section = SERP.extract_section(
            document, start_patterns=start, end_patterns=end
        )
        assert "1. A method comprising" in section
        assert "2. The method of claim 1" in section
        assert "ABSTRACT" not in section

    def test_extract_section_returns_empty_when_not_found(self):
        """With no start heading in the text the result is an empty string."""
        document = "This text has no matching patterns."
        start, end = [r"ABSTRACT"], [r"BACKGROUND"]
        section = SERP.extract_section(
            document, start_patterns=start, end_patterns=end
        )
        assert section == ""

    def test_extract_section_handles_case_insensitive(self):
        """Upper-case patterns must still match lower-case headings."""
        document = (
            "\n"
            "abstract\n"
            "This is the abstract in lowercase.\n"
            "background\n"
            "Background text.\n"
        )
        start, end = [r"ABSTRACT"], [r"BACKGROUND"]
        section = SERP.extract_section(
            document, start_patterns=start, end_patterns=end
        )
        assert "This is the abstract in lowercase" in section