feat: patent retrieval and semi-processed

This commit is contained in:
0xWheatyz 2025-12-08 19:33:02 -05:00
parent b51f0596a3
commit 63a9889e5b
21 changed files with 301 additions and 54 deletions

View File

@ -9,10 +9,9 @@ Semiconductor Patent & Analytics Report Core
Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
## Usage
Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
## Support
Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
```bash
docker compose up -d
```
## Roadmap
- [ ] Retrieve `publicationID` from SERP API

View File

@ -1 +0,0 @@
Get us some oreos!

View File

@ -1,6 +1,3 @@
from .types import Patents, Patent
# Public API of the package. The dunder name is what ``from <package> import *``
# consults; a plain ``all`` would only shadow the builtin of the same name.
__all__ = ["Patents", "Patent"]

View File

@ -4,4 +4,3 @@ import os
# Presumably python-dotenv's load_dotenv (import not visible here): loads
# variables from a local .env file into the process environment.
load_dotenv()
# Value of the API_KEY environment variable; os.getenv returns None when unset.
# NOTE(review): presumably the SerpApi key consumed by SPARC.serp_api - confirm.
api_key = os.getenv("API_KEY")

View File

@ -0,0 +1,81 @@
import re
import pdfplumber # pip install pdfplumber
from typing import Dict
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract the main structured sections from a patent PDF.

    The text of every page is concatenated (one trailing newline per page)
    and then sliced into sections by looking for typical patent headings
    with ``extract_section``.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A dict with the keys ``"abstract"``, ``"claims"``, ``"summary"`` and
        ``"description"``. Each value is the string ``extract_section``
        returns for that section (empty when no start heading is found).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE: ``extract_text()`` has returned None for pages without
        # extractable text in some pdfplumber releases; ``or ""`` keeps such
        # pages from raising TypeError. ``join`` avoids repeated ``+=``.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    # Heading phrases that commonly open / close each section in patents.
    sections = {
        "abstract": extract_section(
            full_text,
            start_patterns=[r"ABSTRACT", r"Abstract of the Disclosure"],
            end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
        ),
        "claims": extract_section(
            full_text,
            start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
            # Claims are often the last section, hence the ``$`` fallback.
            end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],
        ),
        "summary": extract_section(
            full_text,
            start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"],
            end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
        ),
        "description": extract_section(
            full_text,
            start_patterns=[
                r"DETAILED DESCRIPTION",
                r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
            ],
            end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"],
        ),
    }
    return sections
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Return the cleaned text between a start heading and the next end heading.

    Args:
        text: Full document text to search.
        start_patterns: Regexes tried in order (case-insensitive); the first
            one that matches anywhere in ``text`` marks the section start,
            and the section begins right after that match.
        end_patterns: Regexes searched (case-insensitive) in the text after
            the start. The match closest to the start ends the section; when
            none match, the section runs to the end of ``text``.

    Returns:
        The section text passed through ``clean_patent_text``, or ``""`` when
        no start pattern matches.
    """
    # Start: patterns are a priority list, so the first pattern that hits wins.
    start_pos = None
    for pattern in start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start_pos = match.end()
            break
    if start_pos is None:
        return ""

    # End: use the nearest end marker rather than the first pattern in list
    # order, otherwise the section can swallow headings that follow it.
    remainder = text[start_pos:]
    end_offset = len(remainder)
    for pattern in end_patterns:
        match = re.search(pattern, remainder, re.IGNORECASE)
        if match:
            end_offset = min(end_offset, match.start())

    return clean_patent_text(remainder[:end_offset].strip())
def clean_patent_text(text: str) -> str:
    """Strip layout noise from a piece of extracted patent text.

    The substitutions run in this order:

    1. runs of blank / whitespace-only lines collapse to one empty line;
    2. figure references (``(see FIG. 3A)``, then bare ``FIG. 3A``) are removed;
    3. lines consisting only of digits (page / line numbers) are removed.

    The result is returned with leading and trailing whitespace stripped.
    """
    # (pattern, replacement, flags) triples, applied top to bottom.
    substitutions = (
        (r"\n\s*\n", "\n\n", 0),
        (r"\(see FIG\.\s*\d+[A-Z]?\)", "", 0),
        (r"FIG\.\s*\d+[A-Z]?", "", 0),
        (r"^\s*\d+\s*$", "", re.MULTILINE),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()

View File

@ -1,8 +1,13 @@
import serpapi
from SPARC import config
import re
import pdfplumber # pip install pdfplumber
import requests
from typing import Dict
from SPARC.types import Patents, Patent
class SERP:
def query(company: str):
def query(company: str) -> Patents:
# Make API call
params = {
"engine": "google_patents",
@ -17,6 +22,97 @@ class SERP:
patent_ids = []
list_of_patents = search["organic_results"]
for patent in list_of_patents:
patent_ids.append(patent["publication_number"])
patent_ids.append(Patent(patent_id=patent["publication_number"], pdf_link=patent["pdf"], summary=None))
return Patents(patents=patent_ids)
def save_patents(patent: Patent) -> Patent:
    """Download a patent's PDF into the local ``patents/`` folder.

    Defined without ``self``; call it through the class
    (``SERP.save_patents(patent)``).

    Args:
        patent: Patent whose ``pdf_link`` is fetched and whose ``patent_id``
            names the output file.

    Returns:
        The same Patent object with ``pdf_path`` set to the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    from pathlib import Path

    # A timeout keeps one stalled download from hanging the whole run.
    response = requests.get(patent.pdf_link, timeout=30)
    print(patent.pdf_link)
    # Fail loudly instead of writing an HTML error page under a .pdf name.
    response.raise_for_status()

    pdf_path = f"patents/{patent.patent_id}.pdf"
    # The target folder may not exist yet (e.g. on a fresh checkout).
    Path(pdf_path).parent.mkdir(parents=True, exist_ok=True)
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    patent.pdf_path = pdf_path
    return patent
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract the main structured sections from a patent PDF.

    Defined without ``self``; call it through the class
    (``SERP.parse_patent_pdf(path)``). The text of every page is concatenated
    (one trailing newline per page) and sliced into sections with
    ``SERP.extract_section``.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A dict with the keys ``'abstract'``, ``'claims'``, ``'summary'`` and
        ``'description'``. Each value is the string ``SERP.extract_section``
        returns for that section (empty when no start heading is found).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE: ``extract_text()`` has returned None for pages without
        # extractable text in some pdfplumber releases; ``or ""`` keeps such
        # pages from raising TypeError. ``join`` avoids repeated ``+=``.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    # Heading phrases that commonly open / close each section in patents.
    sections = {
        'abstract': SERP.extract_section(
            full_text,
            start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
            end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION']
        ),
        'claims': SERP.extract_section(
            full_text,
            start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
            # Claims are often the last section, hence the ``$`` fallback.
            end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$']
        ),
        'summary': SERP.extract_section(
            full_text,
            start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
            end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION']
        ),
        'description': SERP.extract_section(
            full_text,
            start_patterns=[r'DETAILED DESCRIPTION', r'DESCRIPTION OF THE PREFERRED EMBODIMENT'],
            end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:']
        )
    }
    return sections
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Return the cleaned text between a start heading and the next end heading.

    Defined without ``self``; call it through the class
    (``SERP.extract_section(...)``).

    Args:
        text: Full document text to search.
        start_patterns: Regexes tried in order (case-insensitive); the first
            one that matches anywhere in ``text`` marks the section start,
            and the section begins right after that match.
        end_patterns: Regexes searched (case-insensitive) in the text after
            the start. The match closest to the start ends the section; when
            none match, the section runs to the end of ``text``.

    Returns:
        The section text passed through ``SERP.clean_patent_text``, or ``""``
        when no start pattern matches.
    """
    # Start: patterns are a priority list, so the first pattern that hits wins.
    start_pos = None
    for pattern in start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start_pos = match.end()
            break
    if start_pos is None:
        return ""

    # End: use the nearest end marker rather than the first pattern in list
    # order, otherwise the section can swallow headings that follow it.
    remainder = text[start_pos:]
    end_offset = len(remainder)
    for pattern in end_patterns:
        match = re.search(pattern, remainder, re.IGNORECASE)
        if match:
            end_offset = min(end_offset, match.start())

    return SERP.clean_patent_text(remainder[:end_offset].strip())
def clean_patent_text(text: str) -> str:
    """Strip layout noise from a piece of extracted patent text.

    The substitutions run in this order: runs of blank / whitespace-only
    lines collapse to one empty line, figure references (``(see FIG. 3A)``,
    then bare ``FIG. 3A``) are removed, and lines consisting only of digits
    (page / line numbers) are removed. The result is returned with leading
    and trailing whitespace stripped.
    """
    # (pattern, replacement, flags) triples, applied top to bottom.
    rules = [
        (r'\n\s*\n', '\n\n', 0),
        (r'\(see FIG\.\s*\d+[A-Z]?\)', '', 0),
        (r'FIG\.\s*\d+[A-Z]?', '', 0),
        (r'^\s*\d+\s*$', '', re.MULTILINE),
    ]
    cleaned = text
    for regex, substitute, flags in rules:
        cleaned = re.sub(regex, substitute, cleaned, flags=flags)
    return cleaned.strip()
return patent_ids

View File

@ -1,10 +1,13 @@
from dataclass import dataclass
from dataclasses import dataclass
@dataclass
class Patent:
patent_id: int
pdf_link: str
summary: dict
pdf_path: str | None = None
summary: dict | None = None
@dataclass
class Patents:

61
flake.lock generated Normal file
View File

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1764831616,
"narHash": "sha256-OtzF5wBvO0jgW1WW1rQU9cMGx7zuvkF7CAVJ1ypzkxA=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "c97c47f2bac4fa59e2cbdeba289686ae615f8ed4",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.11",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

View File

@ -1,9 +1,8 @@
# flake.nix
{
description = "A Python development environment";
description = "Python dev env (NixOS 25.11) using project-local venv";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11"; # Pin to the nixos-25.11 channel
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
flake-utils.url = "github:numtide/flake-utils";
};
@ -12,39 +11,45 @@
let
pkgs = nixpkgs.legacyPackages.${system};
# Define Python version and packages
pythonVersion = "python311"; # You can change this to python310, python312, etc.
pythonEnv = pkgs.${pythonVersion}; # Get the specific Python environment
# Pick your Python version here
python = pkgs.python311; # or python312, etc.
in {
devShells.default = pkgs.mkShell {
name = "python-venv-shell";
pythonPackages = ps: with ps; [
# Core Python
#pkgs.${pythonVersion}
pythonEnv.pkgs.pip
pythonEnv.pkgs.setuptools
pythonEnv.pkgs.wheel
# Example useful packages (uncomment and add as needed)
# black # Code formatter
# ruff # Fast Python linter
# mypy # Static type checker
# ipython # Enhanced interactive Python shell
# venvwrapper # Virtual environment manager (can be useful even with Nix)
packages = [
python
pkgs.python311Packages.virtualenv # gives `virtualenv` tool
];
in
{
devShells.default = pkgs.mkShell {
name = "python-dev-shell";
packages = pythonPackages pkgs;
# Environment variables you might want to set
shellHook = ''
echo "Welcome to the Python development shell!"
export NIX_PROJECT_SHELL="SPARC"
echo "== Nix dev shell (Python $(python --version 2>&1)) =="
# Create a venv in .venv if it doesn't exist yet
if [ ! -d ".venv" ]; then
echo "Creating local virtualenv in .venv ..."
python -m venv .venv
fi
# Activate the venv
echo "Activating .venv"
. .venv/bin/activate
# Tell you what Python/pip you're using
echo "Using python: $(which python)"
echo "Using pip: $(which pip)"
# Install / update deps from requirements.txt *into .venv*
if [ -f "requirements.txt" ]; then
echo "Installing dependencies from requirements.txt into .venv ..."
pip install -r requirements.txt
else
echo "No requirements.txt found in $(pwd)"
fi
# Prompt tweak so you can see when venv is active
export PS1="(SPARC-venv) $PS1"
'';
};
}
);
});
}

11
main.py
View File

@ -1,5 +1,10 @@
from SPARC.serp_api import SERP


def main() -> None:
    """Query patents for "nvidia", download each PDF and print its parsed sections."""
    # Query once; every call is a request to the search API.
    patents = SERP.query("nvidia")

    for patent in patents.patents:
        patent = SERP.save_patents(patent)
        patent.summary = SERP.parse_patent_pdf(patent.pdf_path)
        print(patent.summary)

    print(patents)


# Guard so that importing this module does not trigger network calls.
if __name__ == "__main__":
    main()

BIN
patents/CN110888743B.pdf Normal file

Binary file not shown.

BIN
patents/CN113469073B.pdf Normal file

Binary file not shown.

BIN
patents/EP3707572B1.pdf Normal file

Binary file not shown.

BIN
patents/EP3809673B1.pdf Normal file

Binary file not shown.

BIN
patents/US11322171B1.pdf Normal file

Binary file not shown.

BIN
patents/US11775335B2.pdf Normal file

Binary file not shown.

BIN
patents/US11874663B2.pdf Normal file

Binary file not shown.

BIN
patents/US11966673B2.pdf Normal file

Binary file not shown.

BIN
patents/US12182694B2.pdf Normal file

Binary file not shown.

BIN
patents/US20220122001A1.pdf Normal file

Binary file not shown.

View File

@ -1,2 +1,4 @@
dotenv
python-dotenv
serpapi
pdfplumber
requests