forked from 0xWheatyz/SPARC
a6c92fde9f
Integrates S3/MinIO storage backend with structured logging changes from main. Both boto3 and apscheduler retained in requirements.txt.
228 lines
7.4 KiB
Python
228 lines
7.4 KiB
Python
import io
|
|
import logging
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict
|
|
|
|
import pdfplumber # pip install pdfplumber
|
|
import requests
|
|
import serpapi
|
|
|
|
from SPARC import config
|
|
from SPARC.storage import StorageBackend, get_storage_backend
|
|
from SPARC.types import Patent, Patents
|
|
|
|
# Logger for this module, named after the module via __name__.
logger = logging.getLogger(__name__)

# Storage backend shared by the whole module. It starts out as None and is
# filled in by _get_storage() the first time a backend is needed.
_storage: StorageBackend | None = None
|
|
|
|
|
|
def _get_storage() -> StorageBackend:
    """Return the module's storage backend, building it on first use.

    The backend produced by ``get_storage_backend()`` is remembered in the
    module-level ``_storage`` variable, so later calls return that same
    object without building another one.
    """
    global _storage
    if _storage is not None:
        return _storage
    _storage = get_storage_backend()
    return _storage
|
|
|
|
|
|
class SERP:
    """Namespace of helpers for searching, downloading and parsing patents.

    Every function is a static method and no instance state is used, so
    call them on the class itself, e.g. ``SERP.query("Acme")``.
    """

    # Seconds passed to requests as the timeout for PDF downloads, so a
    # stalled server cannot hang the caller indefinitely.
    _DOWNLOAD_TIMEOUT_SECONDS = 60

    @staticmethod
    def _format_tbs_date(moment: datetime) -> str:
        """Format *moment* as ``M/D/YYYY`` without zero padding.

        Built by hand because the ``%-m`` / ``%-d`` strftime flags are a
        platform extension and are not accepted on every platform.
        """
        return f"{moment.month}/{moment.day}/{moment.year}"

    @staticmethod
    def query(company: str, days_back: int | None = None) -> Patents:
        """Query Google Patents for a company's recent patents.

        Args:
            company: Name of the company to search for
            days_back: Number of days to look back for patents (defaults to
                ``config.patent_search_days`` when None)

        Returns:
            Patents object containing list of patents with PDF links. The
            list is empty when the response carries no ``organic_results``.

        Note:
            The request asks for 10 results. Patents without PDF download
            links are skipped. This occurs when Google Patents doesn't have
            a PDF available for a particular patent (e.g., recently filed
            patents, certain international patents, or patents with
            restricted access). The returned count may be lower than the
            requested number of results.
        """
        if days_back is None:
            days_back = config.patent_search_days

        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        date_filter = (
            f"cdr:1,cd_min:{SERP._format_tbs_date(start_date)},"
            f"cd_max:{SERP._format_tbs_date(end_date)}"
        )

        # Make API call
        params = {
            "engine": "google_patents",
            "q": company,
            "num": 10,
            "filter": 1,
            "tbs": date_filter,
            "api_key": config.api_key,
        }
        logger.info("Querying Google Patents for '%s' (last %d days)", company, days_back)
        search = serpapi.search(params)

        # A response without hits may not carry "organic_results" at all;
        # treat that as "nothing found" instead of raising KeyError.
        try:
            organic_results = search["organic_results"]
        except KeyError:
            logger.info("No organic results returned for '%s'", company)
            organic_results = []

        # Convert results to Patent objects, skipping any without PDF links
        patents_with_pdf = []
        for result in organic_results:
            pdf_link = result.get("pdf")
            if pdf_link:
                patents_with_pdf.append(
                    Patent(
                        patent_id=result["publication_number"],
                        pdf_link=pdf_link,
                        summary=None,
                    )
                )
            else:
                logger.debug(
                    "Skipping patent %s (no PDF link)",
                    result.get("publication_number", "unknown"),
                )

        logger.info("Found %d patents with PDF links for '%s'", len(patents_with_pdf), company)
        return Patents(patents=patents_with_pdf)

    @staticmethod
    def save_patents(patent: Patent) -> Patent:
        """Save the patent PDF to storage, skipping download if already cached.

        Uses the configured storage backend (local filesystem or S3).

        Args:
            patent: Patent object

        Returns:
            Patent object with updated PDF path

        Raises:
            requests.HTTPError: The download answered with an error status.
                Nothing is written to storage in that case, so an error
                page is never cached under the patent's key.
            requests.Timeout: The server did not respond within
                ``_DOWNLOAD_TIMEOUT_SECONDS``.
        """
        storage = _get_storage()
        key = f"{patent.patent_id}.pdf"

        if not storage.exists(key):
            logger.info("Downloading PDF for %s", patent.patent_id)
            response = requests.get(
                patent.pdf_link, timeout=SERP._DOWNLOAD_TIMEOUT_SECONDS
            )
            # Fail before writing: a cached error body would be mistaken
            # for a valid PDF on every later call.
            response.raise_for_status()
            storage.write(key, response.content)
            logger.debug("Saved %d bytes for %s", len(response.content), patent.patent_id)
        else:
            logger.debug("Using cached PDF for %s", patent.patent_id)

        patent.pdf_path = storage.path_for(key)
        return patent

    @staticmethod
    def parse_patent_pdf(pdf_path: str) -> Dict:
        """Extract structured sections from patent PDF.

        Extracts all major sections from a patent PDF including abstract,
        claims, summary, and detailed description. Supports both local file
        paths and S3 URIs (s3://bucket/key).

        Args:
            pdf_path: Local path or S3 URI to the patent PDF file

        Returns:
            Dictionary with the keys ``abstract``, ``claims``, ``summary``
            and ``description``; a section that is not found maps to "".
        """
        logger.debug("Parsing patent PDF: %s", pdf_path)

        if pdf_path.startswith("s3://"):
            # Read from S3 via storage backend
            storage = _get_storage()
            # "s3://bucket/key" -> ["s3:", "", "bucket", "key"]; the key keeps
            # any further slashes because the split stops after three cuts.
            key = pdf_path.split("/", 3)[-1]
            data = storage.read(key)
            pdf_file: io.BytesIO | str = io.BytesIO(data)
        else:
            pdf_file = pdf_path

        with pdfplumber.open(pdf_file) as pdf:
            # Extract all text. A page with no extractable text may yield
            # None on some pdfplumber versions; treat it as an empty page.
            full_text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )
            logger.debug(
                "Extracted text from %d pages (%d chars)", len(pdf.pages), len(full_text)
            )

        # Define section patterns (common in patents)
        sections = {
            'abstract': SERP.extract_section(
                full_text,
                start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
                end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION'],
            ),
            'claims': SERP.extract_section(
                full_text,
                start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
                end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$'],  # Often at end
            ),
            'summary': SERP.extract_section(
                full_text,
                start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
                end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION'],
            ),
            'description': SERP.extract_section(
                full_text,
                start_patterns=[r'DETAILED DESCRIPTION', r'DESCRIPTION OF THE PREFERRED EMBODIMENT'],
                end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:'],
            ),
        }

        return sections

    @staticmethod
    def minimize_patent_for_llm(sections: Dict) -> str:
        """Minimize patent content for LLM consumption.

        Removes bloated sections (detailed description) and keeps only
        essential information: abstract, claims, and summary. This reduces
        token usage while preserving the core innovation details.

        Args:
            sections: Dictionary of parsed patent sections from parse_patent_pdf()

        Returns:
            Concatenated string of essential patent sections ready for LLM
            analysis; missing or empty sections are left out, and the
            result is "" when none of the three is present.
        """
        # Output order: abstract (overview), claims (the legal definition of
        # the invention), summary. 'description' is deliberately absent: it
        # is verbose and not needed for high-level analysis.
        labelled_keys = (
            ("ABSTRACT", 'abstract'),
            ("CLAIMS", 'claims'),
            ("SUMMARY", 'summary'),
        )
        essential_parts = [
            f"{label}:\n{sections[key]}"
            for label, key in labelled_keys
            if sections.get(key)
        ]
        return "\n\n".join(essential_parts)

    @staticmethod
    def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
        """Extract text between start and end patterns.

        Patterns are regular expressions matched case-insensitively. Both
        lists are priority-ordered: the first pattern *in list order* that
        matches is used, even if a later pattern would match earlier in
        the text.

        Args:
            text: Full text to search
            start_patterns: Candidate headings that open the section; the
                section begins right after the match
            end_patterns: Candidate markers that close the section, searched
                only in the text after the start; if none matches, the
                section runs to the end of ``text``

        Returns:
            The cleaned section text, or "" when no start pattern matches.
        """
        # Find start position
        start_pos = None
        for pattern in start_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                start_pos = match.end()
                break

        if start_pos is None:
            return ""

        # Find end position
        remainder = text[start_pos:]
        end_pos = len(text)
        for pattern in end_patterns:
            match = re.search(pattern, remainder, re.IGNORECASE)
            if match:
                # match.start() is relative to the remainder, not to text.
                end_pos = start_pos + match.start()
                break

        # Extract and clean
        section_text = text[start_pos:end_pos].strip()
        return SERP.clean_patent_text(section_text)

    @staticmethod
    def clean_patent_text(text: str) -> str:
        """Remove noise from extracted text.

        Collapses runs of blank lines to a single blank line, deletes
        figure references such as "(see FIG. 2A)" or "FIG. 3", deletes
        lines that contain only a number, and strips the ends.
        """
        # Remove excessive whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)
        # Remove figure references (parenthesised form first, so the bare
        # pattern does not leave "(see )" behind)
        text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
        text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)
        # Remove line numbers (common in PDFs)
        text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
        return text.strip()
|
|
|