From 58f2bdc238a0828ff4ed1eeca11de900384bf68a Mon Sep 17 00:00:00 2001 From: 0xWheatyz Date: Thu, 19 Feb 2026 18:49:31 -0500 Subject: [PATCH] refactor: remove duplicate patent_api.py module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed SPARC/patent_api.py as it contained duplicate implementations of parse_patent_pdf, extract_section, and clean_patent_text functions that are already present in SPARC/serp_api.py as static methods. The serp_api.py implementation is actively used in main.py, while patent_api.py was unused legacy code. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- SPARC/patent_api.py | 81 --------------------------------------------- 1 file changed, 81 deletions(-) delete mode 100644 SPARC/patent_api.py diff --git a/SPARC/patent_api.py b/SPARC/patent_api.py deleted file mode 100644 index 8df4e46..0000000 --- a/SPARC/patent_api.py +++ /dev/null @@ -1,81 +0,0 @@ -import re -import pdfplumber # pip install pdfplumber -from typing import Dict - - -def parse_patent_pdf(pdf_path: str) -> Dict: - """Extract structured sections from patent PDF""" - - with pdfplumber.open(pdf_path) as pdf: - # Extract all text - full_text = "" - for page in pdf.pages: - full_text += page.extract_text() + "\n" - - # Define section patterns (common in patents) - sections = { - "abstract": extract_section( - full_text, - start_patterns=[r"ABSTRACT", r"Abstract of the Disclosure"], - end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"], - ), - "claims": extract_section( - full_text, - start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"], - end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"], # Often at end - ), - "summary": extract_section( - full_text, - start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"], - end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"], - ), - "description": extract_section( - full_text, - start_patterns=[ - r"DETAILED DESCRIPTION", - r"DESCRIPTION OF THE PREFERRED EMBODIMENT", - ], - end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"], - ), - } - - return sections - - -def extract_section(text: str, start_patterns: list, end_patterns: list) -> str: - """Extract text between start and end patterns""" - - # Find start position - start_pos = None - for pattern in start_patterns: - match = re.search(pattern, text, re.IGNORECASE) - if match: - start_pos = match.end() - break - - if start_pos is None: - return "" - - # Find end position - end_pos = len(text) - for pattern in end_patterns: - match = re.search(pattern, text[start_pos:], re.IGNORECASE) - if match: - end_pos = start_pos + match.start() - break - - # Extract and clean - section_text = text[start_pos:end_pos].strip() - return clean_patent_text(section_text) - - -def clean_patent_text(text: str) -> str: - """Remove noise from extracted text""" - # Remove excessive whitespace - text = re.sub(r"\n\s*\n", "\n\n", text) - # Remove figure references - text = re.sub(r"\(see FIG\.\s*\d+[A-Z]?\)", "", text) - text = re.sub(r"FIG\.\s*\d+[A-Z]?", "", text) - # Remove line numbers (common in PDFs) - text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE) - return text.strip()