test: add pytest framework and initial test suite

Added pytest and pytest-mock to requirements.txt for testing infrastructure.

Created tests/ directory with comprehensive test coverage for:
- Text cleaning functions (figure references, whitespace, line numbers)
- Section extraction logic (abstract, claims, case sensitivity)

All 8 tests passing. This provides a foundation for test-driven development
as we continue building the LLM integration.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-19 18:52:54 -05:00
parent 58f2bdc238
commit 6533cef56f
3 changed files with 107 additions and 0 deletions
@@ -2,3 +2,5 @@ python-dotenv
 serpapi
 pdfplumber
 requests
+pytest
+pytest-mock
@@ -0,0 +1 @@
+"""Tests for SPARC patent analysis system."""
@@ -0,0 +1,104 @@
+"""Tests for SERP API patent retrieval and parsing functionality."""
+
+import pytest
+from SPARC.serp_api import SERP
+
+
+class TestTextCleaning:
+    """Test patent text cleaning functionality."""
+
+    def test_clean_patent_text_removes_figure_references(self):
+        """Test that figure references are removed from text."""
+        text = "This is a description (see FIG. 1) of the invention."
+        cleaned = SERP.clean_patent_text(text)
+        assert "(see FIG. 1)" not in cleaned
+        assert "This is a description  of the invention." in cleaned
+
+    def test_clean_patent_text_removes_fig_labels(self):
+        """Test that FIG labels are removed from text."""
+        text = "As shown in FIG. 2A the circuit operates."
+        cleaned = SERP.clean_patent_text(text)
+        assert "FIG. 2A" not in cleaned
+
+    def test_clean_patent_text_removes_excessive_whitespace(self):
+        """Test that excessive whitespace is normalized."""
+        text = "Line 1\n\n\n\n\nLine 2"
+        cleaned = SERP.clean_patent_text(text)
+        assert "\n\n\n\n\n" not in cleaned
+        assert "Line 1\n\nLine 2" in cleaned
+
+    def test_clean_patent_text_removes_line_numbers(self):
+        """Test that line numbers are removed from text."""
+        text = "Some text\n42\nMore text"
+        cleaned = SERP.clean_patent_text(text)
+        # Line numbers on their own line should be removed
+        assert cleaned.strip() != "Some text\n42\nMore text"
+
+
+class TestSectionExtraction:
+    """Test patent section extraction functionality."""
+
+    def test_extract_section_finds_abstract(self):
+        """Test extraction of abstract section."""
+        text = """
+        PATENT DOCUMENT
+
+        ABSTRACT
+        This is the abstract text describing the invention.
+
+        BACKGROUND
+        This is background information.
+        """
+        result = SERP.extract_section(
+            text,
+            start_patterns=[r"ABSTRACT"],
+            end_patterns=[r"BACKGROUND"],
+        )
+        assert "This is the abstract text" in result
+        assert "BACKGROUND" not in result
+
+    def test_extract_section_finds_claims(self):
+        """Test extraction of claims section."""
+        text = """
+        SUMMARY
+        Summary text here.
+
+        What is claimed is:
+        1. A method comprising steps A and B.
+        2. The method of claim 1, further comprising step C.
+
+        ABSTRACT
+        Abstract text.
+        """
+        result = SERP.extract_section(
+            text,
+            start_patterns=[r"What is claimed is:"],
+            end_patterns=[r"ABSTRACT"],
+        )
+        assert "1. A method comprising" in result
+        assert "2. The method of claim 1" in result
+        assert "ABSTRACT" not in result
+
+    def test_extract_section_returns_empty_when_not_found(self):
+        """Test that empty string is returned when section not found."""
+        text = "This text has no matching patterns."
+        result = SERP.extract_section(
+            text, start_patterns=[r"ABSTRACT"], end_patterns=[r"BACKGROUND"]
+        )
+        assert result == ""
+
+    def test_extract_section_handles_case_insensitive(self):
+        """Test that section extraction is case insensitive."""
+        text = """
+        abstract
+        This is the abstract in lowercase.
+
+        background
+        Background text.
+        """
+        result = SERP.extract_section(
+            text,
+            start_patterns=[r"ABSTRACT"],
+            end_patterns=[r"BACKGROUND"],
+        )
+        assert "This is the abstract in lowercase" in result
				`@@ -0,0 +1 @@`
				`"""Tests for SPARC patent analysis system."""`