refactor: remove duplicate patent_api.py module
Removed SPARC/patent_api.py as it contained duplicate implementations of parse_patent_pdf, extract_section, and clean_patent_text functions that are already present in SPARC/serp_api.py as static methods. The serp_api.py implementation is actively used in main.py, while patent_api.py was unused legacy code. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
607cc9e1d5
commit
58f2bdc238
@ -1,81 +0,0 @@
|
|||||||
import re
|
|
||||||
import pdfplumber # pip install pdfplumber
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
|
|
||||||
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract structured sections from a patent PDF.

    Args:
        pdf_path: Filesystem path to the patent PDF file.

    Returns:
        Dict mapping section name ("abstract", "claims", "summary",
        "description") to the extracted section text; a section that
        cannot be located maps to "".
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract all text. extract_text() returns None for image-only
        # pages; coerce those to "" so the join cannot raise TypeError.
        full_text = "\n".join(page.extract_text() or "" for page in pdf.pages)

        # Section boundary patterns commonly found in patent documents.
        # Each section runs from just after its heading to the first
        # matching end pattern (see extract_section).
        sections = {
            "abstract": extract_section(
                full_text,
                start_patterns=[r"ABSTRACT", r"Abstract of the Disclosure"],
                end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
            ),
            "claims": extract_section(
                full_text,
                start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
                end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],  # Often at end
            ),
            "summary": extract_section(
                full_text,
                start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"],
                end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
            ),
            "description": extract_section(
                full_text,
                start_patterns=[
                    r"DETAILED DESCRIPTION",
                    r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
                ],
                end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"],
            ),
        }

        return sections
|
|
||||||
|
|
||||||
|
|
||||||
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Return the text between the first matching start and end pattern.

    Args:
        text: Full document text to search.
        start_patterns: Regex alternatives marking the section heading;
            the section body begins at the end of the first match.
        end_patterns: Regex alternatives marking the next section; the
            body ends at the first match found after the start.

    Returns:
        The cleaned section text, or "" when no start pattern matches.
    """
    # Locate the section body start: just past the first heading hit.
    begin = None
    for start_re in start_patterns:
        hit = re.search(start_re, text, re.IGNORECASE)
        if hit is not None:
            begin = hit.end()
            break

    # No heading found anywhere — the section is absent.
    if begin is None:
        return ""

    # Locate the section end within the remaining text; default to the
    # end of the document when no end pattern matches.
    tail = text[begin:]
    finish = len(text)
    for end_re in end_patterns:
        hit = re.search(end_re, tail, re.IGNORECASE)
        if hit is not None:
            finish = begin + hit.start()
            break

    # Slice out the body and strip PDF-extraction noise.
    section_body = text[begin:finish].strip()
    return clean_patent_text(section_body)
|
|
||||||
|
|
||||||
|
|
||||||
def clean_patent_text(text: str) -> str:
    """Remove common PDF-extraction noise from patent text.

    Collapses runs of blank lines, drops figure references, and removes
    lines that contain nothing but a line number, then strips leading
    and trailing whitespace.
    """
    # Ordered (pattern, replacement, flags) substitutions; order matters
    # because blank-line collapsing runs before line-number removal.
    substitutions = (
        # Collapse runs of blank lines into a single blank line.
        (r"\n\s*\n", "\n\n", 0),
        # Parenthesised figure references, e.g. "(see FIG. 3A)".
        (r"\(see FIG\.\s*\d+[A-Z]?\)", "", 0),
        # Bare figure references, e.g. "FIG. 3A".
        (r"FIG\.\s*\d+[A-Z]?", "", 0),
        # Lines consisting solely of a line number (common in PDFs).
        (r"^\s*\d+\s*$", "", re.MULTILINE),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()
|
|
||||||
Loading…
Reference in New Issue
Block a user