refactor: remove duplicate patent_api.py module

Removed SPARC/patent_api.py as it contained duplicate implementations
of parse_patent_pdf, extract_section, and clean_patent_text functions
that are already present in SPARC/serp_api.py as static methods.

The serp_api.py implementation is actively used in main.py, while
patent_api.py was unused legacy code.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
0xWheatyz 2026-02-19 18:49:31 -05:00
parent 607cc9e1d5
commit 58f2bdc238

View File

@@ -1,81 +0,0 @@
import re
import pdfplumber # pip install pdfplumber
from typing import Dict
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract structured sections from a patent PDF.

    The text of every page is concatenated (one trailing newline per page)
    and then split into sections by searching for common patent headings.

    Args:
        pdf_path: Path to the patent PDF, passed to ``pdfplumber.open``.

    Returns:
        A dict with the keys ``"abstract"``, ``"claims"``, ``"summary"`` and
        ``"description"``; each value is whatever ``extract_section`` returns
        for that section's start/end heading patterns.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None for a page without extractable
        # text (e.g. a scanned drawing sheet on some pdfplumber releases);
        # treat such a page as empty instead of failing on None + str.
        full_text = "".join(
            f"{page.extract_text() or ''}\n" for page in pdf.pages
        )

    # Section heading patterns (common in patents). Start patterns are tried
    # in order and matched case-insensitively, so the more specific heading
    # must come first: a bare "ABSTRACT" would otherwise also match inside
    # "Abstract of the Disclosure" and leave "of the Disclosure" in the text.
    sections = {
        "abstract": extract_section(
            full_text,
            start_patterns=[r"Abstract of the Disclosure", r"ABSTRACT"],
            end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
        ),
        "claims": extract_section(
            full_text,
            start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
            # Claims are often the last section, hence the "$" fallback.
            end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],
        ),
        "summary": extract_section(
            full_text,
            start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"],
            end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
        ),
        "description": extract_section(
            full_text,
            start_patterns=[
                r"DETAILED DESCRIPTION",
                r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
            ],
            end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"],
        ),
    }
    return sections
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Extract the text between a start heading and the nearest end heading.

    All patterns are regular expressions matched case-insensitively.

    Args:
        text: Full document text to search.
        start_patterns: Candidate start patterns, tried in order; the first
            pattern that matches anywhere in ``text`` wins and the section
            begins right after that match.
        end_patterns: Candidate end patterns, searched only in the text after
            the start match. The section ends at the earliest position where
            any of them matches; if none matches it runs to the end of
            ``text``.

    Returns:
        The stripped section text passed through ``clean_patent_text``, or
        ``""`` when no start pattern matches.
    """
    # Find start position: first start pattern (in list order) that matches.
    start_pos = None
    for pattern in start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start_pos = match.end()
            break
    if start_pos is None:
        return ""

    # Find end position: slice the tail once, then take the *earliest* end
    # marker rather than the first pattern in list order, so the section
    # cannot run past a nearer heading just because of how the list is sorted.
    remainder = text[start_pos:]
    end_offsets = [
        found.start()
        for found in (
            re.search(pattern, remainder, re.IGNORECASE)
            for pattern in end_patterns
        )
        if found
    ]
    end_offset = min(end_offsets, default=len(remainder))

    # Extract and clean
    return clean_patent_text(remainder[:end_offset].strip())
def clean_patent_text(text: str) -> str:
    """Remove common extraction noise from patent text.

    Strips figure references (``(see FIG. 3A)``, ``FIG. 3A``), drops lines
    that contain only a number (line numbers are common in patent PDFs), and
    collapses runs of blank or whitespace-only lines into one blank line.

    Args:
        text: Raw section text.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Remove figure references (parenthesised form first, so the bare-form
    # pattern does not leave an empty "(see )" behind).
    text = re.sub(r"\(see FIG\.\s*\d+[A-Z]?\)", "", text)
    text = re.sub(r"FIG\.\s*\d+[A-Z]?", "", text)
    # Remove line numbers (common in PDFs)
    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)
    # Collapse excessive whitespace last: the removals above can empty whole
    # lines, which would otherwise leave runs of blank lines in the result.
    text = re.sub(r"\n\s*\n", "\n\n", text)
    return text.strip()