refactor: remove duplicate patent_api.py module
Removed SPARC/patent_api.py as it contained duplicate implementations of parse_patent_pdf, extract_section, and clean_patent_text functions that are already present in SPARC/serp_api.py as static methods. The serp_api.py implementation is actively used in main.py, while patent_api.py was unused legacy code. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
607cc9e1d5
commit
58f2bdc238
@ -1,81 +0,0 @@
|
||||
import re
|
||||
import pdfplumber # pip install pdfplumber
|
||||
from typing import Dict
|
||||
|
||||
|
||||
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract structured sections from a patent PDF.

    Args:
        pdf_path: Filesystem path to the patent PDF file.

    Returns:
        Dict mapping section name ("abstract", "claims", "summary",
        "description") to the extracted section text; a section that
        cannot be located maps to "".
    """
    with pdfplumber.open(pdf_path) as pdf:
        # page.extract_text() returns None for pages with no extractable
        # text (e.g. scanned images) — guard with `or ""` so we never
        # concatenate None. Join once instead of quadratic `+=`.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

        # Section boundary patterns commonly found in US patents.
        sections = {
            "abstract": extract_section(
                full_text,
                start_patterns=[r"ABSTRACT", r"Abstract of the Disclosure"],
                end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
            ),
            "claims": extract_section(
                full_text,
                start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
                end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],  # Often at end
            ),
            "summary": extract_section(
                full_text,
                start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"],
                end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
            ),
            "description": extract_section(
                full_text,
                start_patterns=[
                    r"DETAILED DESCRIPTION",
                    r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
                ],
                end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"],
            ),
        }

        return sections
|
||||
|
||||
|
||||
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Return the cleaned text between the first matching start/end patterns.

    Args:
        text: Full document text to search.
        start_patterns: Regexes tried in order; the section begins at the
            end of the first one that matches (case-insensitive).
        end_patterns: Regexes tried in order against the remainder; the
            section ends at the start of the first match, or at end of text.

    Returns:
        The extracted section passed through clean_patent_text(), or ""
        when no start pattern matches.
    """
    # Section start: end offset of the first start pattern that hits.
    start = next(
        (m.end() for pat in start_patterns
         if (m := re.search(pat, text, re.IGNORECASE))),
        None,
    )
    if start is None:
        return ""

    # Section end: first end pattern that hits in the remaining text,
    # falling back to the end of the document.
    tail = text[start:]
    end_match = next(
        (m for pat in end_patterns
         if (m := re.search(pat, tail, re.IGNORECASE))),
        None,
    )
    end = start + end_match.start() if end_match else len(text)

    return clean_patent_text(text[start:end].strip())
|
||||
|
||||
|
||||
def clean_patent_text(text: str) -> str:
    """Strip common PDF-extraction noise from patent text.

    Collapses runs of blank lines, removes figure references
    (both "(see FIG. N)" and bare "FIG. N"), and deletes standalone
    line numbers left over from PDF layout.

    Args:
        text: Raw extracted section text.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # (pattern, replacement, flags) applied in order; order matters —
    # blank-line collapsing runs before line-number removal, matching
    # the original cleaning pipeline.
    substitutions = (
        (r"\n\s*\n", "\n\n", 0),                 # collapse blank-line runs
        (r"\(see FIG\.\s*\d+[A-Z]?\)", "", 0),   # parenthesised figure refs
        (r"FIG\.\s*\d+[A-Z]?", "", 0),           # bare figure refs
        (r"^\s*\d+\s*$", "", re.MULTILINE),      # standalone line numbers
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()
|
||||
Loading…
Reference in New Issue
Block a user