feat: patent retrieval and semi-processed

This commit is contained in:
2025-12-08 19:33:02 -05:00
parent b51f0596a3
commit 63a9889e5b
21 changed files with 301 additions and 54 deletions
-1
View File
@@ -1 +0,0 @@
Get us some oreos!
+1 -4
View File
@@ -1,6 +1,3 @@
from .types import Patents, Patent
all = [
"Patents",
"Patent"
]
# Public API of the package. The dunder name is required: a plain ``all``
# only shadows the builtin and is ignored by ``from package import *``.
__all__ = ["Patents", "Patent"]
-1
View File
@@ -4,4 +4,3 @@ import os
load_dotenv()
api_key = os.getenv("API_KEY")
+81
View File
@@ -0,0 +1,81 @@
import re
import pdfplumber # pip install pdfplumber
from typing import Dict
def parse_patent_pdf(pdf_path: str) -> Dict[str, str]:
    """Extract the main structured sections from a patent PDF.

    The text of every page is concatenated (one newline after each page) and
    then sliced into sections by searching for the headings that commonly
    delimit them in patent documents.

    Args:
        pdf_path: Path of the patent PDF, handed to ``pdfplumber.open``.

    Returns:
        A dict with the keys ``"abstract"``, ``"claims"``, ``"summary"`` and
        ``"description"``. Each value is whatever ``extract_section`` returns
        for that section's heading patterns.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text may yield None instead of ""
        # (depends on the pdfplumber version); treat it as empty so the
        # concatenation cannot raise TypeError.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    # name -> (start patterns, end patterns). Patterns are tried in list
    # order and matched case-insensitively, so the more specific start
    # heading has to come before its shorter prefix.
    section_patterns = {
        "abstract": (
            # "ABSTRACT" alone also matches "Abstract of the Disclosure" and
            # would leave "of the Disclosure" at the head of the section.
            [r"Abstract of the Disclosure", r"ABSTRACT"],
            [r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
        ),
        "claims": (
            [r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
            [r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],  # Often at end
        ),
        "summary": (
            [r"SUMMARY OF THE INVENTION", r"SUMMARY"],
            [r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
        ),
        "description": (
            [
                r"DETAILED DESCRIPTION",
                r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
            ],
            [r"What is claimed", r"CLAIMS?:", r"I claim:"],
        ),
    }

    return {
        name: extract_section(
            full_text, start_patterns=starts, end_patterns=ends
        )
        for name, (starts, ends) in section_patterns.items()
    }
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Return the cleaned text lying between a start and an end heading.

    Patterns are regular expressions matched case-insensitively. Both lists
    are tried in order and the first pattern that matches wins.

    Args:
        text: Full document text to search.
        start_patterns: Candidate patterns marking the section start; the
            section begins right after the match.
        end_patterns: Candidate patterns marking the section end, searched
            only in the text following the start match.

    Returns:
        The section text after ``clean_patent_text``, or ``""`` when no start
        pattern matches. If no end pattern matches, the section runs to the
        end of ``text``.
    """
    # The generators are lazy, so searching stops at the first hit.
    opening = next(
        (
            hit
            for hit in (
                re.search(candidate, text, re.IGNORECASE)
                for candidate in start_patterns
            )
            if hit
        ),
        None,
    )
    if opening is None:
        return ""

    begin = opening.end()
    remainder = text[begin:]
    closing = next(
        (
            hit
            for hit in (
                re.search(candidate, remainder, re.IGNORECASE)
                for candidate in end_patterns
            )
            if hit
        ),
        None,
    )
    # Match offsets are relative to the remainder; shift them back.
    finish = len(text) if closing is None else begin + closing.start()

    return clean_patent_text(text[begin:finish].strip())
def clean_patent_text(text: str) -> str:
    """Remove figure references, bare line numbers and surplus blank lines.

    Args:
        text: Raw section text extracted from a patent PDF.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Parenthesised references go first; otherwise the bare pattern below
    # would turn "(see FIG. 1)" into "(see )".
    text = re.sub(r"\(see FIG\.\s*\d+[A-Z]?\)", "", text)
    text = re.sub(r"FIG\.\s*\d+[A-Z]?", "", text)

    # Remove lines holding nothing but a number (line numbers in PDFs).
    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)

    # Collapse blank-line runs last: the removals above can empty whole
    # lines, which would otherwise leave several blank lines in a row.
    text = re.sub(r"\n\s*\n", "\n\n", text)

    return text.strip()
+99 -3
View File
@@ -1,8 +1,13 @@
import serpapi
from SPARC import config
import re
import pdfplumber # pip install pdfplumber
import requests
from typing import Dict
from SPARC.types import Patents, Patent
class SERP:
def query(company: str):
def query(company: str) -> Patents:
# Make API call
params = {
"engine": "google_patents",
@@ -17,6 +22,97 @@ class SERP:
patent_ids = []
list_of_patents = search["organic_results"]
for patent in list_of_patents:
patent_ids.append(patent["publication_number"])
patent_ids.append(Patent(patent_id=patent["publication_number"], pdf_link=patent["pdf"], summary=None))
return Patents(patents=patent_ids)
def save_patents(patent: Patent) -> Patent:
    """Download a patent's PDF into the ``patents/`` folder.

    Args:
        patent: Patent whose ``pdf_link`` is fetched and whose ``patent_id``
            names the output file.

    Returns:
        The same Patent object, with ``pdf_path`` set to the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    import os  # local import: only this function needs it

    # A timeout keeps an unresponsive server from blocking forever.
    response = requests.get(patent.pdf_link, timeout=30)
    print(patent.pdf_link)
    # Fail loudly instead of saving an HTML error page as a ".pdf".
    response.raise_for_status()

    # The target folder may not exist yet on a fresh checkout.
    os.makedirs("patents", exist_ok=True)
    pdf_path = f"patents/{patent.patent_id}.pdf"
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    patent.pdf_path = pdf_path
    return patent
def parse_patent_pdf(pdf_path: str) -> Dict[str, str]:
    """Extract the main structured sections from a patent PDF.

    The text of every page is concatenated (one newline after each page) and
    then sliced into sections by searching for the headings that commonly
    delimit them in patent documents.

    Args:
        pdf_path: Path of the patent PDF, handed to ``pdfplumber.open``.

    Returns:
        A dict with the keys ``'abstract'``, ``'claims'``, ``'summary'`` and
        ``'description'``. Each value is whatever ``SERP.extract_section``
        returns for that section's heading patterns.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text may yield None instead of ""
        # (depends on the pdfplumber version); treat it as empty so the
        # concatenation cannot raise TypeError.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    # name -> (start patterns, end patterns). Patterns are tried in list
    # order and matched case-insensitively, so the more specific start
    # heading has to come before its shorter prefix.
    section_patterns = {
        'abstract': (
            # 'ABSTRACT' alone also matches 'Abstract of the Disclosure' and
            # would leave "of the Disclosure" at the head of the section.
            [r'Abstract of the Disclosure', r'ABSTRACT'],
            [r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION'],
        ),
        'claims': (
            [r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
            [r'ABSTRACT', r'\*\s*\*\s*\*', r'$'],  # Often at end
        ),
        'summary': (
            [r'SUMMARY OF THE INVENTION', r'SUMMARY'],
            [r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION'],
        ),
        'description': (
            [r'DETAILED DESCRIPTION', r'DESCRIPTION OF THE PREFERRED EMBODIMENT'],
            [r'What is claimed', r'CLAIMS?:', r'I claim:'],
        ),
    }

    return {
        name: SERP.extract_section(
            full_text, start_patterns=starts, end_patterns=ends
        )
        for name, (starts, ends) in section_patterns.items()
    }
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Return the cleaned text lying between a start and an end heading.

    Patterns are regular expressions matched case-insensitively. Both lists
    are tried in order and the first pattern that matches wins.

    Args:
        text: Full document text to search.
        start_patterns: Candidate patterns marking the section start; the
            section begins right after the match.
        end_patterns: Candidate patterns marking the section end, searched
            only in the text following the start match.

    Returns:
        The section text after ``SERP.clean_patent_text``, or ``""`` when no
        start pattern matches. If no end pattern matches, the section runs to
        the end of ``text``.
    """
    # The generators are lazy, so searching stops at the first hit.
    opening = next(
        (
            hit
            for hit in (
                re.search(candidate, text, re.IGNORECASE)
                for candidate in start_patterns
            )
            if hit
        ),
        None,
    )
    if opening is None:
        return ""

    begin = opening.end()
    remainder = text[begin:]
    closing = next(
        (
            hit
            for hit in (
                re.search(candidate, remainder, re.IGNORECASE)
                for candidate in end_patterns
            )
            if hit
        ),
        None,
    )
    # Match offsets are relative to the remainder; shift them back.
    finish = len(text) if closing is None else begin + closing.start()

    return SERP.clean_patent_text(text[begin:finish].strip())
def clean_patent_text(text: str) -> str:
    """Remove figure references, bare line numbers and surplus blank lines.

    Args:
        text: Raw section text extracted from a patent PDF.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Parenthesised references go first; otherwise the bare pattern below
    # would turn "(see FIG. 1)" into "(see )".
    text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
    text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)

    # Remove lines holding nothing but a number (line numbers in PDFs).
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # Collapse blank-line runs last: the removals above can empty whole
    # lines, which would otherwise leave several blank lines in a row.
    text = re.sub(r'\n\s*\n', '\n\n', text)

    return text.strip()
return patent_ids
+8 -5
View File
@@ -1,11 +1,14 @@
from dataclass import dataclass
from dataclasses import dataclass
@dataclass
class Patent:
patent_id: int
pdf_link: str
summary: dict
patent_id: int
pdf_link: str
pdf_path: str | None = None
summary: dict | None = None
@dataclass
class Patents:
patents: list[Patent]
patents: list[Patent]