feat: add S3/MinIO object storage support for patent PDFs

Introduce a StorageBackend abstraction (local filesystem and S3) for
patent PDF storage. When STORAGE_BACKEND=s3, PDFs are read/written via
boto3 to an S3-compatible bucket instead of the local filesystem.

- Add SPARC/storage.py with LocalStorageBackend and S3StorageBackend
- Update serp_api.py save_patents and parse_patent_pdf to use storage
- Add storage config vars to config.py and .env.example
- Add optional MinIO service to docker-compose.yml (--profile s3)
- Add boto3 to requirements.txt

Closes leeworks-agents/SPARC#38

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
agent-company
2026-03-26 10:17:24 +00:00
parent 55c131cb32
commit 9a43f85259
6 changed files with 258 additions and 12 deletions
+12
View File
@@ -35,6 +35,18 @@ JWT_SECRET=your-secure-jwt-secret-change-in-production
# Defaults to http://localhost:3000,http://localhost:5173 when unset
# CORS_ORIGINS=https://sparc.example.com,https://app.example.com
# ---- Storage ----
# Backend for patent PDF storage: "local" (default) or "s3"
STORAGE_BACKEND=local
# S3/MinIO settings (only used when STORAGE_BACKEND=s3)
# S3_BUCKET=sparc-patents
# S3_ENDPOINT_URL=http://localhost:9000
# AWS_ACCESS_KEY_ID=minioadmin
# AWS_SECRET_ACCESS_KEY=minioadmin
# To start MinIO locally: docker compose --profile s3 up -d minio
# ---- Cache ----
# When USE_CACHE=true: check database for cached responses before making API calls
+7
View File
@@ -53,6 +53,13 @@ root_path = os.getenv("ROOT_PATH", "")
# Used for safety checks (e.g., refusing default JWT secret in production)
app_env = os.getenv("APP_ENV", "development")
# Storage backend: "local" (default) or "s3" for S3/MinIO object storage
storage_backend = os.getenv("STORAGE_BACKEND", "local")
# Bucket that holds the patent PDFs; defaults to "sparc-patents" when unset
s3_bucket = os.getenv("S3_BUCKET", "sparc-patents")
# Endpoint override for S3-compatible services (e.g. MinIO); empty string when unset
s3_endpoint_url = os.getenv("S3_ENDPOINT_URL", "")
# Credentials read from the standard AWS variable names; empty string when unset
s3_access_key = os.getenv("AWS_ACCESS_KEY_ID", "")
s3_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", "")
# CORS allowed origins (comma-separated)
# Defaults to localhost dev origins when unset
_cors_origins_raw = os.getenv("CORS_ORIGINS", "")
+43 -12
View File
@@ -1,4 +1,5 @@
import os
import io
import logging
import re
from datetime import datetime, timedelta
from typing import Dict
@@ -8,8 +9,21 @@ import requests
import serpapi
from SPARC import config
from SPARC.storage import StorageBackend, get_storage_backend
from SPARC.types import Patent, Patents
logger = logging.getLogger(__name__)
# Module-level storage instance (lazy-initialized)
_storage: StorageBackend | None = None
def _get_storage() -> StorageBackend:
    """Return the shared storage backend, building it on first use.

    The instance is kept in the module-level ``_storage`` variable so
    later calls reuse it instead of calling ``get_storage_backend()``
    again.
    """
    global _storage
    if _storage is not None:
        return _storage
    _storage = get_storage_backend()
    return _storage
class SERP:
def query(company: str, days_back: int = None) -> Patents:
@@ -57,8 +71,9 @@ class SERP:
return Patents(patents=patent_ids)
def save_patents(patent: Patent) -> Patent:
"""
Save the patent PDF to the patents folder, skipping download if already cached.
"""Save the patent PDF to storage, skipping download if already cached.
Uses the configured storage backend (local filesystem or S3).
Args:
patent: Patent object
@@ -66,35 +81,51 @@ class SERP:
Returns:
Patent object with updated PDF path
"""
pdf_path = f"patents/{patent.patent_id}.pdf"
os.makedirs("patents", exist_ok=True)
storage = _get_storage()
key = f"{patent.patent_id}.pdf"
if not (os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0):
if not storage.exists(key):
logger.info("Downloading PDF for %s", patent.patent_id)
response = requests.get(patent.pdf_link)
with open(pdf_path, "wb") as f:
f.write(response.content)
storage.write(key, response.content)
logger.debug("Saved %d bytes for %s", len(response.content), patent.patent_id)
else:
logger.debug("Using cached PDF for %s", patent.patent_id)
patent.pdf_path = pdf_path
patent.pdf_path = storage.path_for(key)
return patent
def parse_patent_pdf(pdf_path: str) -> Dict:
"""Extract structured sections from patent PDF.
Extracts all major sections from a patent PDF including abstract,
claims, summary, and detailed description.
claims, summary, and detailed description. Supports both local file
paths and S3 URIs (s3://bucket/key).
Args:
pdf_path: Path to the patent PDF file
pdf_path: Local path or S3 URI to the patent PDF file
Returns:
Dictionary containing all extracted sections
"""
logger.debug("Parsing patent PDF: %s", pdf_path)
with pdfplumber.open(pdf_path) as pdf:
if pdf_path.startswith("s3://"):
# Read from S3 via storage backend
storage = _get_storage()
# Extract key from "s3://bucket/key"
key = pdf_path.split("/", 3)[-1]
data = storage.read(key)
pdf_file: io.BytesIO | str = io.BytesIO(data)
else:
pdf_file = pdf_path
with pdfplumber.open(pdf_file) as pdf:
# Extract all text
full_text = ""
for page in pdf.pages:
full_text += page.extract_text() + "\n"
logger.debug("Extracted text from %d pages (%d chars)", len(pdf.pages), len(full_text))
# Define section patterns (common in patents)
sections = {
+171
View File
@@ -0,0 +1,171 @@
"""Patent PDF storage abstraction.
Provides a unified interface for reading and writing patent PDF files,
with pluggable backends for local filesystem and S3-compatible object
storage (e.g., MinIO, AWS S3).
"""
import logging
import os
from abc import ABC, abstractmethod
from SPARC import config
logger = logging.getLogger(__name__)
class StorageBackend(ABC):
    """Abstract base class for patent PDF storage.

    Files are addressed by a string key (e.g., "US-12345678-B2.pdf").
    Subclasses must implement all four methods below.
    """

    @abstractmethod
    def read(self, key: str) -> bytes:
        """Read a file by key.

        Args:
            key: Storage key (e.g., "US-12345678-B2.pdf")

        Returns:
            File contents as bytes.

        Raises:
            FileNotFoundError: If the file does not exist.
        """

    @abstractmethod
    def write(self, key: str, data: bytes) -> None:
        """Write data to storage.

        Args:
            key: Storage key (e.g., "US-12345678-B2.pdf")
            data: File contents as bytes.
        """

    @abstractmethod
    def exists(self, key: str) -> bool:
        """Check if a file exists in storage.

        Args:
            key: Storage key.

        Returns:
            True if the file exists and has non-zero size.
        """

    @abstractmethod
    def path_for(self, key: str) -> str:
        """Return a path or URI suitable for downstream consumers.

        For local storage this is a filesystem path; for S3 it is an
        ``s3://bucket/key`` URI (callers that need a local file should
        use read() and write to a temporary location).

        Args:
            key: Storage key.

        Returns:
            Filesystem path or ``s3://`` URI identifying the stored file.
        """
class LocalStorageBackend(StorageBackend):
    """Store patent PDFs on the local filesystem under a directory.

    Keys are paths relative to ``base_dir``. Nested keys such as
    "a/b.pdf" are allowed; keys that would resolve outside ``base_dir``
    are rejected.
    """

    def __init__(self, base_dir: str = "patents"):
        """Create the backend and make sure ``base_dir`` exists.

        Args:
            base_dir: Directory that holds the stored files.
        """
        self.base_dir = base_dir
        os.makedirs(self.base_dir, exist_ok=True)

    def _full_path(self, key: str) -> str:
        """Map a storage key to its path under ``base_dir``.

        Raises:
            ValueError: If the key is empty or resolves outside
                ``base_dir`` (absolute path or ``..`` traversal).
        """
        root = os.path.abspath(self.base_dir)
        target = os.path.abspath(os.path.join(root, key))
        # Keys may originate from external API data, so never let one
        # address a file outside the storage directory.
        if target == root or os.path.commonpath([root, target]) != root:
            raise ValueError(f"Invalid storage key: {key!r}")
        # Return the same (possibly relative) form callers have always seen.
        return os.path.join(self.base_dir, key)

    def read(self, key: str) -> bytes:
        """Return the bytes stored under ``key``.

        Raises:
            FileNotFoundError: If no file exists for ``key``.
        """
        path = self._full_path(key)
        # Open directly instead of checking first, so a file removed
        # between check and open cannot slip through.
        try:
            with open(path, "rb") as f:
                return f.read()
        except FileNotFoundError as err:
            raise FileNotFoundError(f"File not found: {path}") from err

    def write(self, key: str, data: bytes) -> None:
        """Store ``data`` under ``key``, replacing any existing file.

        The data is written to a sibling temporary file and then moved
        into place, so an interrupted write never leaves a truncated
        file that exists() would report as a valid cached PDF.
        """
        import uuid

        path = self._full_path(key)
        os.makedirs(os.path.dirname(path) or self.base_dir, exist_ok=True)
        # Unique name so concurrent writers never share a temp file.
        tmp_path = f"{path}.{uuid.uuid4().hex}.part"
        try:
            with open(tmp_path, "wb") as f:
                f.write(data)
            os.replace(tmp_path, path)
        except BaseException:
            # Best-effort cleanup of the partial file; the original
            # error is what the caller needs to see.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise
        logger.debug("Wrote %d bytes to %s", len(data), path)

    def exists(self, key: str) -> bool:
        """Return True if a non-empty file is stored under ``key``."""
        try:
            return os.path.getsize(self._full_path(key)) > 0
        except OSError:
            # Missing or unreadable path: treat as not present.
            return False

    def path_for(self, key: str) -> str:
        """Return the filesystem path for ``key``."""
        return self._full_path(key)
class S3StorageBackend(StorageBackend):
    """Store patent PDFs in an S3-compatible bucket."""

    # Error codes that S3-compatible services report for a missing object
    # (head_object reports the bare HTTP status "404").
    _NOT_FOUND_CODES = frozenset({"NoSuchKey", "NotFound", "404"})

    def __init__(
        self,
        bucket: str,
        endpoint_url: str = "",
        access_key: str = "",
        secret_key: str = "",
    ):
        """Create the boto3 client and make sure the bucket is usable.

        Args:
            bucket: Name of the bucket that holds the PDFs.
            endpoint_url: Custom endpoint (e.g. MinIO); when empty it is
                not passed to boto3.
            access_key: Access key id; passed to boto3 only together
                with ``secret_key``.
            secret_key: Secret access key.
        """
        # Imported here so boto3 is only required when S3 is selected.
        import boto3

        kwargs: dict = {}
        if endpoint_url:
            kwargs["endpoint_url"] = endpoint_url
        if access_key and secret_key:
            kwargs["aws_access_key_id"] = access_key
            kwargs["aws_secret_access_key"] = secret_key

        self.s3 = boto3.client("s3", **kwargs)
        self.bucket = bucket

        # Ensure bucket exists (useful for MinIO local dev). This is
        # deliberately best-effort: failure is logged, not raised.
        try:
            self.s3.head_bucket(Bucket=self.bucket)
        except Exception:
            try:
                self.s3.create_bucket(Bucket=self.bucket)
                logger.info("Created S3 bucket: %s", self.bucket)
            except Exception as e:
                logger.warning("Could not create bucket %s: %s", self.bucket, e)

    @classmethod
    def _is_not_found(cls, err: Exception) -> bool:
        """Return True if ``err`` is a service error for a missing object.

        Uses the structured error code on the exception's ``response``
        instead of matching text in the message, which can contain the
        endpoint URL and object key.
        """
        response = getattr(err, "response", None)
        if not isinstance(response, dict):
            return False
        error = response.get("Error") or {}
        return str(error.get("Code", "")) in cls._NOT_FOUND_CODES

    def read(self, key: str) -> bytes:
        """Return the bytes of the object stored under ``key``.

        Raises:
            FileNotFoundError: If the object does not exist.
        """
        try:
            response = self.s3.get_object(Bucket=self.bucket, Key=key)
        except Exception as err:
            if self._is_not_found(err):
                raise FileNotFoundError(
                    f"S3 object not found: s3://{self.bucket}/{key}"
                ) from err
            raise
        body = response["Body"]
        try:
            return body.read()
        finally:
            # Release the underlying HTTP connection.
            body.close()

    def write(self, key: str, data: bytes) -> None:
        """Upload ``data`` as a PDF object under ``key``."""
        self.s3.put_object(
            Bucket=self.bucket,
            Key=key,
            Body=data,
            ContentType="application/pdf",
        )
        logger.debug("Wrote %d bytes to s3://%s/%s", len(data), self.bucket, key)

    def exists(self, key: str) -> bool:
        """Return True if a non-empty object is stored under ``key``.

        Any failure still yields False, but failures other than "object
        not found" are logged so outages and permission problems are not
        mistaken for cache misses.
        """
        try:
            response = self.s3.head_object(Bucket=self.bucket, Key=key)
        except Exception as err:
            if not self._is_not_found(err):
                logger.warning(
                    "Could not check s3://%s/%s: %s", self.bucket, key, err
                )
            return False
        return response.get("ContentLength", 0) > 0

    def path_for(self, key: str) -> str:
        """Return the ``s3://bucket/key`` URI for ``key``."""
        return f"s3://{self.bucket}/{key}"
def get_storage_backend() -> StorageBackend:
    """Factory: return the configured storage backend instance.

    Reads ``config.storage_backend`` (case-insensitive, surrounding
    whitespace ignored). "s3" selects S3StorageBackend built from the
    ``config.s3_*`` settings; anything else selects LocalStorageBackend.

    Returns:
        The storage backend to use for patent PDFs.
    """
    backend = config.storage_backend.strip().lower()

    if backend == "s3":
        logger.info("Using S3 storage backend (bucket=%s)", config.s3_bucket)
        return S3StorageBackend(
            bucket=config.s3_bucket,
            endpoint_url=config.s3_endpoint_url,
            access_key=config.s3_access_key,
            secret_key=config.s3_secret_key,
        )

    if backend not in ("", "local"):
        # Keep the historical fallback, but make a typo visible instead
        # of silently writing to the local filesystem.
        logger.warning(
            "Unknown STORAGE_BACKEND %r; falling back to local storage",
            config.storage_backend,
        )

    logger.info("Using local storage backend")
    return LocalStorageBackend()
+24
View File
@@ -52,6 +52,29 @@ services:
- ./patents:/app/patents
restart: unless-stopped
# Optional: MinIO for S3-compatible local object storage
# Start with `docker compose --profile s3 up -d minio`, then set STORAGE_BACKEND=s3 in .env
minio:
image: minio/minio:latest
container_name: sparc-minio
command: server /data --console-address ":9001"
environment:
MINIO_ROOT_USER: ${AWS_ACCESS_KEY_ID:-minioadmin}
MINIO_ROOT_PASSWORD: ${AWS_SECRET_ACCESS_KEY:-minioadmin}
ports:
- "9000:9000"
- "9001:9001"
volumes:
- minio_data:/data
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 10s
timeout: 5s
retries: 3
restart: unless-stopped
profiles:
- s3
dashboard:
build: ./frontend
container_name: sparc-dashboard
@@ -63,3 +86,4 @@ services:
volumes:
postgres_data:
minio_data:
+1
View File
@@ -15,3 +15,4 @@ pandas
bcrypt
PyJWT
slowapi
boto3