feat: patent retrieval and semi-processing
This commit is contained in:
parent
b51f0596a3
commit
63a9889e5b
@ -9,10 +9,9 @@ Semiconductor Patent & Analytics Report Core
|
|||||||
Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
|
Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
|
```bash
|
||||||
|
docker compose up -d
|
||||||
## Support
|
```
|
||||||
Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
|
|
||||||
|
|
||||||
## Roadmap
|
## Roadmap
|
||||||
- [ ] Retrieve `publicationID` from SERP API
|
- [ ] Retrieve `publicationID` from SERP API
|
||||||
|
|||||||
@ -1 +0,0 @@
|
|||||||
Get us some oreos!
|
|
||||||
@ -1,6 +1,3 @@
|
|||||||
from .types import Patents, Patent
|
from .types import Patents, Patent
|
||||||
|
|
||||||
all = [
|
# Public API of the package. The name must be ``__all__``; a plain ``all``
# would only shadow the builtin and leave the export list undeclared.
__all__ = ["Patents", "Patent"]
|
||||||
"Patents",
|
|
||||||
"Patent"
|
|
||||||
]
|
|
||||||
|
|||||||
@ -4,4 +4,3 @@ import os
|
|||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
api_key = os.getenv("API_KEY")
|
api_key = os.getenv("API_KEY")
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,81 @@
|
|||||||
|
import re
|
||||||
|
import pdfplumber # pip install pdfplumber
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
|
||||||
|
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract structured sections from a patent PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A dict with the keys ``"abstract"``, ``"claims"``, ``"summary"`` and
        ``"description"``. Each value is the cleaned text found between the
        matching heading patterns, or ``""`` when no start heading matched.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE(review): depending on the pdfplumber version, a page without a
        # text layer may yield None instead of ""; `or ""` keeps such a page
        # from raising TypeError during concatenation.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    # Heading patterns that commonly delimit sections in patent documents.
    sections = {
        "abstract": extract_section(
            full_text,
            start_patterns=[r"ABSTRACT", r"Abstract of the Disclosure"],
            end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
        ),
        "claims": extract_section(
            full_text,
            start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
            end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],  # Often at end
        ),
        "summary": extract_section(
            full_text,
            start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"],
            end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
        ),
        "description": extract_section(
            full_text,
            start_patterns=[
                r"DETAILED DESCRIPTION",
                r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
            ],
            end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"],
        ),
    }

    return sections
|
||||||
|
|
||||||
|
|
||||||
|
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Extract the text between a start heading and the nearest end heading.

    Args:
        text: Full document text to search.
        start_patterns: Regex patterns tried in order (case-insensitive); the
            first pattern that matches anywhere in ``text`` marks the start,
            so more specific patterns should be listed first.
        end_patterns: Regex patterns searched (case-insensitive) after the
            start; the match closest to the start ends the section.

    Returns:
        The cleaned section text, or ``""`` when no start pattern matches.
        When no end pattern matches, the section runs to the end of ``text``.
    """
    # Find start position: list order expresses priority.
    start_pos = None
    for pattern in start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start_pos = match.end()
            break

    if start_pos is None:
        return ""

    # Find end position: take the earliest end marker, not the first listed
    # one, so the section cannot run past a nearer heading.
    remainder = text[start_pos:]
    end_offsets = [
        found.start()
        for pattern in end_patterns
        if (found := re.search(pattern, remainder, re.IGNORECASE))
    ]
    end_pos = start_pos + min(end_offsets, default=len(remainder))

    # Extract and clean
    section_text = text[start_pos:end_pos].strip()
    return clean_patent_text(section_text)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_patent_text(text: str) -> str:
    """Remove common extraction noise from patent text.

    The steps run in this order: figure references are removed, then lines
    holding only a number, and whitespace is normalized last so that gaps
    left by the removals are collapsed as well.

    Args:
        text: Raw section text.

    Returns:
        The text without ``FIG. n`` references and number-only lines, with
        runs of blank lines reduced to a single blank line and leading and
        trailing whitespace stripped.
    """
    # Remove figure references; the parenthetical form also takes the
    # horizontal whitespace in front of it so no double space is left behind.
    text = re.sub(r"[^\S\n]*\(see FIG\.\s*\d+[A-Z]?\)", "", text)
    text = re.sub(r"FIG\.\s*\d+[A-Z]?", "", text)
    # Remove number-only lines (line/page numbers, common in PDFs) together
    # with their line break, so a number inside a paragraph does not turn
    # into a paragraph break.
    text = re.sub(
        r"^[^\S\n]*\d+[^\S\n]*(?:\n|$)", "", text, flags=re.MULTILINE
    )
    # Collapse excessive blank lines last, after the removals above.
    text = re.sub(r"\n\s*\n", "\n\n", text)
    return text.strip()
|
||||||
@ -1,8 +1,13 @@
|
|||||||
import serpapi
|
import serpapi
|
||||||
from SPARC import config
|
from SPARC import config
|
||||||
|
import re
|
||||||
|
import pdfplumber # pip install pdfplumber
|
||||||
|
import requests
|
||||||
|
from typing import Dict
|
||||||
|
from SPARC.types import Patents, Patent
|
||||||
|
|
||||||
class SERP:
|
class SERP:
|
||||||
def query(company: str):
|
def query(company: str) -> Patents:
|
||||||
# Make API call
|
# Make API call
|
||||||
params = {
|
params = {
|
||||||
"engine": "google_patents",
|
"engine": "google_patents",
|
||||||
@ -17,6 +22,97 @@ class SERP:
|
|||||||
patent_ids = []
|
patent_ids = []
|
||||||
list_of_patents = search["organic_results"]
|
list_of_patents = search["organic_results"]
|
||||||
for patent in list_of_patents:
|
for patent in list_of_patents:
|
||||||
patent_ids.append(patent["publication_number"])
|
patent_ids.append(Patent(patent_id=patent["publication_number"], pdf_link=patent["pdf"], summary=None))
|
||||||
|
|
||||||
|
return Patents(patents=patent_ids)
|
||||||
|
|
||||||
|
@staticmethod
def save_patents(patent: Patent) -> Patent:
    """Download the patent's PDF into the ``patents/`` folder.

    Args:
        patent: Patent whose ``pdf_link`` is fetched and whose ``patent_id``
            names the output file.

    Returns:
        The same Patent object with ``pdf_path`` set to the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request fails or times out.
    """
    import os  # local import: this module does not import os at file level

    print(patent.pdf_link)
    # A timeout keeps a stalled server from hanging the run, and the status
    # check prevents an HTML error page from being written out as a PDF.
    response = requests.get(patent.pdf_link, timeout=30)
    response.raise_for_status()

    pdf_path = f"patents/{patent.patent_id}.pdf"
    # The target folder may not exist on a fresh checkout.
    os.makedirs("patents", exist_ok=True)
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    patent.pdf_path = pdf_path
    return patent
|
||||||
|
|
||||||
|
@staticmethod
def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract structured sections from a patent PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A dict with the keys ``'abstract'``, ``'claims'``, ``'summary'`` and
        ``'description'``. Each value is the cleaned text found between the
        matching heading patterns, or ``""`` when no start heading matched.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE(review): depending on the pdfplumber version, a page without a
        # text layer may yield None instead of ""; `or ""` keeps such a page
        # from raising TypeError during concatenation.
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    # Heading patterns that commonly delimit sections in patent documents.
    sections = {
        'abstract': SERP.extract_section(
            full_text,
            start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
            end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION'],
        ),
        'claims': SERP.extract_section(
            full_text,
            start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
            end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$'],  # Often at end
        ),
        'summary': SERP.extract_section(
            full_text,
            start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
            end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION'],
        ),
        'description': SERP.extract_section(
            full_text,
            start_patterns=[
                r'DETAILED DESCRIPTION',
                r'DESCRIPTION OF THE PREFERRED EMBODIMENT',
            ],
            end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:'],
        ),
    }

    return sections
|
||||||
|
|
||||||
|
@staticmethod
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Extract the text between a start heading and the nearest end heading.

    Args:
        text: Full document text to search.
        start_patterns: Regex patterns tried in order (case-insensitive); the
            first pattern that matches anywhere in ``text`` marks the start,
            so more specific patterns should be listed first.
        end_patterns: Regex patterns searched (case-insensitive) after the
            start; the match closest to the start ends the section.

    Returns:
        The cleaned section text, or ``""`` when no start pattern matches.
        When no end pattern matches, the section runs to the end of ``text``.
    """
    # Find start position: list order expresses priority.
    start_pos = None
    for pattern in start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start_pos = match.end()
            break

    if start_pos is None:
        return ""

    # Find end position: take the earliest end marker, not the first listed
    # one, so the section cannot run past a nearer heading.
    remainder = text[start_pos:]
    end_offsets = [
        found.start()
        for pattern in end_patterns
        if (found := re.search(pattern, remainder, re.IGNORECASE))
    ]
    end_pos = start_pos + min(end_offsets, default=len(remainder))

    # Extract and clean
    section_text = text[start_pos:end_pos].strip()
    return SERP.clean_patent_text(section_text)
|
||||||
|
|
||||||
|
def clean_patent_text(text: str) -> str:
|
||||||
|
"""Remove noise from extracted text"""
|
||||||
|
# Remove excessive whitespace
|
||||||
|
text = re.sub(r'\n\s*\n', '\n\n', text)
|
||||||
|
# Remove figure references
|
||||||
|
text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
|
||||||
|
text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)
|
||||||
|
# Remove line numbers (common in PDFs)
|
||||||
|
text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
return patent_ids
|
|
||||||
|
|||||||
@ -1,11 +1,14 @@
|
|||||||
from dataclass import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Patent:
|
class Patent:
|
||||||
patent_id: int
|
patent_id: int
|
||||||
pdf_link: str
|
pdf_link: str
|
||||||
summary: dict
|
pdf_path: str | None = None
|
||||||
|
summary: dict | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Patents:
|
class Patents:
|
||||||
patents: list[Patent]
|
patents: list[Patent]
|
||||||
|
|||||||
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"flake-utils": {
|
||||||
|
"inputs": {
|
||||||
|
"systems": "systems"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1731533236,
|
||||||
|
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1764831616,
|
||||||
|
"narHash": "sha256-OtzF5wBvO0jgW1WW1rQU9cMGx7zuvkF7CAVJ1ypzkxA=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "c97c47f2bac4fa59e2cbdeba289686ae615f8ed4",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixos-25.11",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"flake-utils": "flake-utils",
|
||||||
|
"nixpkgs": "nixpkgs"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systems": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681028828,
|
||||||
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
69
flake.nix
69
flake.nix
@ -1,9 +1,8 @@
|
|||||||
# flake.nix
|
|
||||||
{
|
{
|
||||||
description = "A Python development environment";
|
description = "Python dev env (NixOS 25.11) using project-local venv";
|
||||||
|
|
||||||
inputs = {
|
inputs = {
|
||||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11"; # Pin to the nixos-25.11 channel
|
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
|
||||||
flake-utils.url = "github:numtide/flake-utils";
|
flake-utils.url = "github:numtide/flake-utils";
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -12,39 +11,45 @@
|
|||||||
let
|
let
|
||||||
pkgs = nixpkgs.legacyPackages.${system};
|
pkgs = nixpkgs.legacyPackages.${system};
|
||||||
|
|
||||||
# Define Python version and packages
|
# Pick your Python version here
|
||||||
pythonVersion = "python311"; # You can change this to python310, python312, etc.
|
python = pkgs.python311; # or python312, etc.
|
||||||
pythonEnv = pkgs.${pythonVersion}; # Get the specific Python environment
|
in {
|
||||||
|
|
||||||
pythonPackages = ps: with ps; [
|
|
||||||
# Core Python
|
|
||||||
#pkgs.${pythonVersion}
|
|
||||||
|
|
||||||
pythonEnv.pkgs.pip
|
|
||||||
pythonEnv.pkgs.setuptools
|
|
||||||
pythonEnv.pkgs.wheel
|
|
||||||
|
|
||||||
# Example useful packages (uncomment and add as needed)
|
|
||||||
# black # Code formatter
|
|
||||||
# ruff # Fast Python linter
|
|
||||||
# mypy # Static type checker
|
|
||||||
# ipython # Enhanced interactive Python shell
|
|
||||||
# venvwrapper # Virtual environment manager (can be useful even with Nix)
|
|
||||||
];
|
|
||||||
|
|
||||||
in
|
|
||||||
{
|
|
||||||
devShells.default = pkgs.mkShell {
|
devShells.default = pkgs.mkShell {
|
||||||
name = "python-dev-shell";
|
name = "python-venv-shell";
|
||||||
|
|
||||||
packages = pythonPackages pkgs;
|
packages = [
|
||||||
|
python
|
||||||
|
pkgs.python311Packages.virtualenv # gives `virtualenv` tool
|
||||||
|
];
|
||||||
|
|
||||||
# Environment variables you might want to set
|
|
||||||
shellHook = ''
|
shellHook = ''
|
||||||
echo "Welcome to the Python development shell!"
|
echo "== Nix dev shell (Python $(python --version 2>&1)) =="
|
||||||
export NIX_PROJECT_SHELL="SPARC"
|
|
||||||
|
# Create a venv in .venv if it doesn't exist yet
|
||||||
|
if [ ! -d ".venv" ]; then
|
||||||
|
echo "Creating local virtualenv in .venv ..."
|
||||||
|
python -m venv .venv
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Activate the venv
|
||||||
|
echo "Activating .venv"
|
||||||
|
. .venv/bin/activate
|
||||||
|
|
||||||
|
# Tell you what Python/pip you're using
|
||||||
|
echo "Using python: $(which python)"
|
||||||
|
echo "Using pip: $(which pip)"
|
||||||
|
|
||||||
|
# Install / update deps from requirements.txt *into .venv*
|
||||||
|
if [ -f "requirements.txt" ]; then
|
||||||
|
echo "Installing dependencies from requirements.txt into .venv ..."
|
||||||
|
pip install -r requirements.txt
|
||||||
|
else
|
||||||
|
echo "No requirements.txt found in $(pwd)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Prompt tweak so you can see when venv is active
|
||||||
|
export PS1="(SPARC-venv) $PS1"
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
}
|
});
|
||||||
);
|
|
||||||
}
|
}
|
||||||
11
main.py
11
main.py
@ -1,5 +1,10 @@
|
|||||||
import SPARC.serp_api
|
from SPARC.serp_api import SERP
|
||||||
|
|
||||||
a = SPARC.serp_api.SERP.query('nvidia')
|
patents = SERP.query("nvidia")
|
||||||
|
|
||||||
print(a)
|
for patent in patents.patents:
|
||||||
|
patent = SERP.save_patents(patent)
|
||||||
|
patent.summary = SERP.parse_patent_pdf(patent.pdf_path)
|
||||||
|
print(patent.summary)
|
||||||
|
|
||||||
|
print(patents)
|
||||||
|
|||||||
BIN
patents/CN110888743B.pdf
Normal file
BIN
patents/CN110888743B.pdf
Normal file
Binary file not shown.
BIN
patents/CN113469073B.pdf
Normal file
BIN
patents/CN113469073B.pdf
Normal file
Binary file not shown.
BIN
patents/EP3707572B1.pdf
Normal file
BIN
patents/EP3707572B1.pdf
Normal file
Binary file not shown.
BIN
patents/EP3809673B1.pdf
Normal file
BIN
patents/EP3809673B1.pdf
Normal file
Binary file not shown.
BIN
patents/US11322171B1.pdf
Normal file
BIN
patents/US11322171B1.pdf
Normal file
Binary file not shown.
BIN
patents/US11775335B2.pdf
Normal file
BIN
patents/US11775335B2.pdf
Normal file
Binary file not shown.
BIN
patents/US11874663B2.pdf
Normal file
BIN
patents/US11874663B2.pdf
Normal file
Binary file not shown.
BIN
patents/US11966673B2.pdf
Normal file
BIN
patents/US11966673B2.pdf
Normal file
Binary file not shown.
BIN
patents/US12182694B2.pdf
Normal file
BIN
patents/US12182694B2.pdf
Normal file
Binary file not shown.
BIN
patents/US20220122001A1.pdf
Normal file
BIN
patents/US20220122001A1.pdf
Normal file
Binary file not shown.
@ -1,2 +1,4 @@
|
|||||||
dotenv
|
python-dotenv
|
||||||
serpapi
|
serpapi
|
||||||
|
pdfplumber
|
||||||
|
requests
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user