# SPARC/SPARC/patent_api.py

import re
import pdfplumber  # pip install pdfplumber
from typing import Dict


def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract the main sections of a patent from a PDF file.

    The text of every page is concatenated (one trailing newline per page)
    and then sliced into sections by looking for typical patent headings.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A dict with the keys ``"abstract"``, ``"claims"``, ``"summary"`` and
        ``"description"``. Each value is the string produced by
        ``extract_section`` for that section, which is ``""`` when none of the
        section's start headings is found.

    Raises:
        Any exception raised by ``pdfplumber.open`` or by text extraction is
        propagated unchanged (nothing is caught here).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE: depending on the pdfplumber version, extract_text() can return
        # None for a page without extractable text (e.g. a scanned drawing
        # sheet); treat that as an empty page instead of failing on None + str.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Build the document text in a single pass; every page is terminated by a
    # newline so headings at page boundaries do not run into each other.
    full_text = "".join(f"{page_text}\n" for page_text in page_texts)

    # Define section patterns (common in patents)
    sections = {
        "abstract": extract_section(
            full_text,
            start_patterns=[r"ABSTRACT", r"Abstract of the Disclosure"],
            end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
        ),
        "claims": extract_section(
            full_text,
            start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
            end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],  # Often at end
        ),
        "summary": extract_section(
            full_text,
            start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"],
            end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
        ),
        "description": extract_section(
            full_text,
            start_patterns=[
                r"DETAILED DESCRIPTION",
                r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
            ],
            end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"],
        ),
    }

    return sections


def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Return the cleaned text found between a start and an end heading.

    All patterns are regular expressions searched case-insensitively.

    Args:
        text: The full document text to search.
        start_patterns: Candidate start headings in priority order. The first
            pattern that matches anywhere in ``text`` is used, and the section
            begins right after that match.
        end_patterns: Candidate end headings. They are searched only in the
            text after the start heading, and the section stops at whichever
            one matches earliest. If none matches, the section runs to the
            end of ``text``.

    Returns:
        The section text, stripped and passed through ``clean_patent_text``,
        or ``""`` when no start pattern matches.
    """
    # Find start position: patterns are tried in order, first hit wins.
    start_pos = None
    for pattern in start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start_pos = match.end()
            break

    if start_pos is None:
        return ""

    # Find end position. Slice the tail once instead of once per pattern.
    remainder = text[start_pos:]
    end_matches = (
        re.search(pattern, remainder, re.IGNORECASE) for pattern in end_patterns
    )
    # Use the end heading that appears first in the text rather than the one
    # listed first; otherwise a later heading earlier in the list would pull
    # the following section(s) into this one.
    end_offset = min(
        (match.start() for match in end_matches if match),
        default=len(remainder),
    )
    end_pos = start_pos + end_offset

    # Extract and clean
    section_text = text[start_pos:end_pos].strip()
    return clean_patent_text(section_text)


def clean_patent_text(text: str) -> str:
    """Strip common extraction noise from a piece of patent text.

    The substitutions are applied in this order:

    1. Any run of whitespace that starts and ends with a newline is collapsed
       to exactly one blank line.
    2. Parenthesised figure references such as ``(see FIG. 3A)`` are removed.
    3. Remaining bare figure references such as ``FIG. 3A`` are removed.
    4. Lines consisting only of digits (and optional whitespace) are emptied.

    Args:
        text: The raw section text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # (pattern, replacement, flags) triples; the order matters because the
    # parenthesised figure form must be removed before the bare form.
    substitutions = (
        (r"\n\s*\n", "\n\n", 0),
        (r"\(see FIG\.\s*\d+[A-Z]?\)", "", 0),
        (r"FIG\.\s*\d+[A-Z]?", "", 0),
        (r"^\s*\d+\s*$", "", re.MULTILINE),
    )

    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()