forked from 0xWheatyz/SPARC
feat: patent retrieval and semi-processing
This commit is contained in:
@@ -1 +0,0 @@
|
||||
Get us some oreos!
|
||||
+1
-4
@@ -1,6 +1,3 @@
|
||||
from .types import Patents, Patent
|
||||
|
||||
# Public API of the package. The name must be ``__all__`` (with the double
# underscores) for ``from package import *`` to honour it; a plain ``all``
# would only shadow the builtin.
__all__ = ["Patents", "Patent"]
|
||||
|
||||
@@ -4,4 +4,3 @@ import os
|
||||
|
||||
load_dotenv()
|
||||
api_key = os.getenv("API_KEY")
|
||||
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
import re
|
||||
import pdfplumber # pip install pdfplumber
|
||||
from typing import Dict
|
||||
|
||||
|
||||
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract structured sections from a patent PDF.

    Opens the PDF with pdfplumber, concatenates the text of every page
    (a newline is appended after each page) and slices the result into
    sections using heading patterns that are common in patents.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        Dict with the keys "abstract", "claims", "summary" and
        "description", each mapped to the text returned by
        ``extract_section`` for that section's patterns.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page without extractable text
        # (depends on the pdfplumber version); treat that as an empty page
        # instead of failing on None + str.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    # Define section patterns (common in patents)
    sections = {
        "abstract": extract_section(
            full_text,
            start_patterns=[r"ABSTRACT", r"Abstract of the Disclosure"],
            end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
        ),
        "claims": extract_section(
            full_text,
            start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
            end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],  # Often at end
        ),
        "summary": extract_section(
            full_text,
            start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"],
            end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
        ),
        "description": extract_section(
            full_text,
            start_patterns=[
                r"DETAILED DESCRIPTION",
                r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
            ],
            end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"],
        ),
    }

    return sections
|
||||
|
||||
|
||||
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Extract the text between a start heading and the nearest end heading.

    Args:
        text: Full document text to search.
        start_patterns: Regexes tried in list order (case-insensitive). The
            first one that matches anywhere in ``text`` marks the start; the
            section begins right after that match.
        end_patterns: Regexes searched (case-insensitive) in the text that
            follows the start. The section stops at the earliest match of
            any of them, or at the end of ``text`` when none matches.

    Returns:
        The section text after ``clean_patent_text``, or "" when no start
        pattern matches.
    """
    # Find start position: list order expresses priority between patterns.
    start_pos = None
    for pattern in start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start_pos = match.end()
            break

    if start_pos is None:
        return ""

    # Find end position: use the end marker that occurs FIRST in the text,
    # not the first pattern in the list, so the section cannot run past an
    # earlier end heading.
    remainder = text[start_pos:]
    end_offsets = [
        hit.start()
        for pattern in end_patterns
        if (hit := re.search(pattern, remainder, re.IGNORECASE))
    ]
    cut = min(end_offsets, default=len(remainder))

    # Extract and clean
    section_text = remainder[:cut].strip()
    return clean_patent_text(section_text)
|
||||
|
||||
|
||||
def clean_patent_text(text: str) -> str:
    """Remove noise from extracted patent text.

    Strips figure references such as "(see FIG. 3A)" or "FIG. 12", blanks
    lines that contain only a number, collapses runs of blank lines into a
    single blank line and trims the result.

    Args:
        text: Raw section text.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Remove figure references (parenthesised form first so no "(see )"
    # shell is left behind).
    text = re.sub(r"\(see FIG\.\s*\d+[A-Z]?\)", "", text)
    text = re.sub(r"FIG\.\s*\d+[A-Z]?", "", text)
    # Remove line numbers (common in PDFs)
    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)
    # Collapse excessive whitespace LAST: the removals above can leave new
    # runs of empty lines that would otherwise survive in the output.
    text = re.sub(r"\n\s*\n", "\n\n", text)
    return text.strip()
|
||||
|
||||
+99
-3
@@ -1,8 +1,13 @@
|
||||
import serpapi
|
||||
from SPARC import config
|
||||
import re
|
||||
import pdfplumber # pip install pdfplumber
|
||||
import requests
|
||||
from typing import Dict
|
||||
from SPARC.types import Patents, Patent
|
||||
|
||||
class SERP:
|
||||
def query(company: str):
|
||||
def query(company: str) -> Patents:
|
||||
# Make API call
|
||||
params = {
|
||||
"engine": "google_patents",
|
||||
@@ -17,6 +22,97 @@ class SERP:
|
||||
patent_ids = []
|
||||
list_of_patents = search["organic_results"]
|
||||
for patent in list_of_patents:
|
||||
patent_ids.append(patent["publication_number"])
|
||||
patent_ids.append(Patent(patent_id=patent["publication_number"], pdf_link=patent["pdf"], summary=None))
|
||||
|
||||
return Patents(patents=patent_ids)
|
||||
|
||||
def save_patents(patent: Patent) -> Patent:
    """
    Download the patent PDF and save it to the patents folder

    Args:
        patent: Patent object; ``pdf_link`` is downloaded and ``patent_id``
            names the output file (``patents/<patent_id>.pdf``).

    Returns:
        The same Patent object with ``pdf_path`` set to the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import os  # local import: only needed to create the output folder

    # A timeout keeps a stalled download from hanging the caller forever.
    response = requests.get(patent.pdf_link, timeout=30)
    print(patent.pdf_link)
    # Fail loudly instead of writing an error page to disk as a ".pdf".
    response.raise_for_status()

    pdf_path = f"patents/{patent.patent_id}.pdf"
    # The folder may not exist yet on a fresh checkout.
    os.makedirs("patents", exist_ok=True)
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    patent.pdf_path = pdf_path
    return patent
|
||||
|
||||
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract structured sections from a patent PDF.

    Opens the PDF with pdfplumber, concatenates the text of every page
    (a newline is appended after each page) and slices the result into
    sections using heading patterns that are common in patents.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        Dict with the keys 'abstract', 'claims', 'summary' and
        'description', each mapped to the text returned by
        ``SERP.extract_section`` for that section's patterns.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page without extractable text
        # (depends on the pdfplumber version); treat that as an empty page
        # instead of failing on None + str.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    # Define section patterns (common in patents)
    sections = {
        'abstract': SERP.extract_section(
            full_text,
            start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
            end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION']
        ),
        'claims': SERP.extract_section(
            full_text,
            start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
            end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$']  # Often at end
        ),
        'summary': SERP.extract_section(
            full_text,
            start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
            end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION']
        ),
        'description': SERP.extract_section(
            full_text,
            start_patterns=[r'DETAILED DESCRIPTION', r'DESCRIPTION OF THE PREFERRED EMBODIMENT'],
            end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:']
        )
    }

    return sections
|
||||
|
||||
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Extract the text between a start heading and the nearest end heading.

    Args:
        text: Full document text to search.
        start_patterns: Regexes tried in list order (case-insensitive). The
            first one that matches anywhere in ``text`` marks the start; the
            section begins right after that match.
        end_patterns: Regexes searched (case-insensitive) in the text that
            follows the start. The section stops at the earliest match of
            any of them, or at the end of ``text`` when none matches.

    Returns:
        The section text after ``SERP.clean_patent_text``, or "" when no
        start pattern matches.
    """
    # Find start position: list order expresses priority between patterns.
    start_pos = None
    for pattern in start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start_pos = match.end()
            break

    if start_pos is None:
        return ""

    # Find end position: use the end marker that occurs FIRST in the text,
    # not the first pattern in the list, so the section cannot run past an
    # earlier end heading.
    remainder = text[start_pos:]
    end_offsets = [
        hit.start()
        for pattern in end_patterns
        if (hit := re.search(pattern, remainder, re.IGNORECASE))
    ]
    cut = min(end_offsets, default=len(remainder))

    # Extract and clean
    section_text = remainder[:cut].strip()
    return SERP.clean_patent_text(section_text)
|
||||
|
||||
def clean_patent_text(text: str) -> str:
    """Remove noise from extracted patent text.

    Strips figure references such as "(see FIG. 3A)" or "FIG. 12", blanks
    lines that contain only a number, collapses runs of blank lines into a
    single blank line and trims the result.

    Args:
        text: Raw section text.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Remove figure references (parenthesised form first so no "(see )"
    # shell is left behind).
    text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
    text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)
    # Remove line numbers (common in PDFs)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # Collapse excessive whitespace LAST: the removals above can leave new
    # runs of empty lines that would otherwise survive in the output.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()
|
||||
|
||||
return patent_ids
|
||||
|
||||
+8
-5
@@ -1,11 +1,14 @@
|
||||
from dataclass import dataclass
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Patent:
|
||||
patent_id: int
|
||||
pdf_link: str
|
||||
summary: dict
|
||||
patent_id: int
|
||||
pdf_link: str
|
||||
pdf_path: str | None = None
|
||||
summary: dict | None = None
|
||||
|
||||
|
||||
@dataclass
class Patents:
    """Container holding a list of Patent records."""

    # The Patent entries that make up this collection.
    patents: list[Patent]
|
||||
|
||||
Reference in New Issue
Block a user