deploy: security hardening, multi-model support, S3 storage, analytics, CI improvements (70 commits) #4
@@ -35,6 +35,18 @@ JWT_SECRET=your-secure-jwt-secret-change-in-production
|
||||
# Defaults to http://localhost:3000,http://localhost:5173 when unset
|
||||
# CORS_ORIGINS=https://sparc.example.com,https://app.example.com
|
||||
|
||||
# ---- Storage ----
|
||||
|
||||
# Backend for patent PDF storage: "local" (default) or "s3"
|
||||
STORAGE_BACKEND=local
|
||||
|
||||
# S3/MinIO settings (only used when STORAGE_BACKEND=s3)
|
||||
# S3_BUCKET=sparc-patents
|
||||
# S3_ENDPOINT_URL=http://localhost:9000
|
||||
# AWS_ACCESS_KEY_ID=minioadmin
|
||||
# AWS_SECRET_ACCESS_KEY=minioadmin
|
||||
# To start MinIO locally: docker compose --profile s3 up -d minio
|
||||
|
||||
# ---- Cache ----
|
||||
|
||||
# When USE_CACHE=true: check database for cached responses before making API calls
|
||||
|
||||
@@ -53,6 +53,13 @@ root_path = os.getenv("ROOT_PATH", "")
|
||||
# Used for safety checks (e.g., refusing default JWT secret in production)
|
||||
app_env = os.getenv("APP_ENV", "development")
|
||||
|
||||
# Patent PDF storage settings, read from the environment.
# STORAGE_BACKEND selects the backend: "local" (default) or "s3" for
# S3/MinIO object storage.
storage_backend = os.environ.get("STORAGE_BACKEND", "local")
# Bucket name, optional endpoint override, and credentials for the S3 backend.
# The endpoint and credential values default to empty strings when unset.
s3_bucket = os.environ.get("S3_BUCKET", "sparc-patents")
s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", "")
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "")
s3_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
|
||||
|
||||
# CORS allowed origins (comma-separated)
|
||||
# Defaults to localhost dev origins when unset
|
||||
_cors_origins_raw = os.getenv("CORS_ORIGINS", "")
|
||||
|
||||
+37
-15
@@ -1,5 +1,5 @@
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict
|
||||
@@ -9,10 +9,21 @@ import requests
|
||||
import serpapi
|
||||
|
||||
from SPARC import config
|
||||
from SPARC.storage import StorageBackend, get_storage_backend
|
||||
from SPARC.types import Patent, Patents
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Module-level storage instance (lazy-initialized)
|
||||
_storage: StorageBackend | None = None
|
||||
|
||||
|
||||
def _get_storage() -> StorageBackend:
    """Return the module-level storage backend, creating it on first call.

    The instance is built by ``get_storage_backend()`` and kept in the
    module global ``_storage`` so later calls reuse it.
    """
    global _storage
    if _storage is not None:
        return _storage
    _storage = get_storage_backend()
    return _storage
|
||||
|
||||
|
||||
class SERP:
|
||||
def query(company: str, days_back: int = None) -> Patents:
|
||||
@@ -63,8 +74,9 @@ class SERP:
|
||||
return Patents(patents=patent_ids)
|
||||
|
||||
def save_patents(patent: Patent) -> Patent:
    """Save the patent PDF to storage, skipping download if already cached.

    Uses the configured storage backend (local filesystem or S3).

    Args:
        patent: Patent object. ``patent_id`` is used to build the storage
            key and ``pdf_link`` is the URL the PDF is downloaded from.

    Returns:
        The same Patent object with ``pdf_path`` set to the path or URI
        reported by the storage backend.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    storage = _get_storage()
    key = f"{patent.patent_id}.pdf"

    if storage.exists(key):
        logger.debug("Using cached PDF for %s", patent.patent_id)
    else:
        logger.info("Downloading PDF for %s", patent.patent_id)
        # A timeout keeps a stalled server from hanging the caller forever.
        response = requests.get(patent.pdf_link, timeout=60)
        # Without this check an HTTP error page would be stored as the PDF
        # and then be treated as a valid cached copy on every later call.
        response.raise_for_status()
        storage.write(key, response.content)
        logger.debug(
            "Saved %d bytes for %s", len(response.content), patent.patent_id
        )

    patent.pdf_path = storage.path_for(key)
    return patent
|
||||
|
||||
def parse_patent_pdf(pdf_path: str) -> Dict:
|
||||
"""Extract structured sections from patent PDF.
|
||||
|
||||
Extracts all major sections from a patent PDF including abstract,
|
||||
claims, summary, and detailed description.
|
||||
claims, summary, and detailed description. Supports both local file
|
||||
paths and S3 URIs (s3://bucket/key).
|
||||
|
||||
Args:
|
||||
pdf_path: Path to the patent PDF file
|
||||
pdf_path: Local path or S3 URI to the patent PDF file
|
||||
|
||||
Returns:
|
||||
Dictionary containing all extracted sections
|
||||
"""
|
||||
|
||||
logger.debug("Parsing patent PDF: %s", pdf_path)
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
|
||||
if pdf_path.startswith("s3://"):
|
||||
# Read from S3 via storage backend
|
||||
storage = _get_storage()
|
||||
# Extract key from "s3://bucket/key"
|
||||
key = pdf_path.split("/", 3)[-1]
|
||||
data = storage.read(key)
|
||||
pdf_file: io.BytesIO | str = io.BytesIO(data)
|
||||
else:
|
||||
pdf_file = pdf_path
|
||||
|
||||
with pdfplumber.open(pdf_file) as pdf:
|
||||
# Extract all text
|
||||
full_text = ""
|
||||
for page in pdf.pages:
|
||||
|
||||
@@ -0,0 +1,171 @@
|
||||
"""Patent PDF storage abstraction.
|
||||
|
||||
Provides a unified interface for reading and writing patent PDF files,
|
||||
with pluggable backends for local filesystem and S3-compatible object
|
||||
storage (e.g., MinIO, AWS S3).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from SPARC import config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StorageBackend(ABC):
    """Abstract base class for patent PDF storage.

    Concrete backends address files by a string key and must implement
    all four methods below.
    """

    @abstractmethod
    def read(self, key: str) -> bytes:
        """Read a file by key.

        Args:
            key: Storage key (e.g., "US-12345678-B2.pdf")

        Returns:
            File contents as bytes.

        Raises:
            FileNotFoundError: If the file does not exist.
        """

    @abstractmethod
    def write(self, key: str, data: bytes) -> None:
        """Write data to storage.

        Args:
            key: Storage key (e.g., "US-12345678-B2.pdf")
            data: File contents as bytes.
        """

    @abstractmethod
    def exists(self, key: str) -> bool:
        """Check if a file exists in storage.

        Args:
            key: Storage key.

        Returns:
            True if the file exists and has non-zero size.
        """

    @abstractmethod
    def path_for(self, key: str) -> str:
        """Return a path or URI suitable for downstream consumers.

        For local storage this is a filesystem path; for S3 it is an
        ``s3://bucket/key`` URI. Callers that need a local file for an
        S3 object should use read() and write the bytes to a temporary
        location.

        Args:
            key: Storage key.

        Returns:
            Path or URI string identifying the stored file.
        """
|
||||
|
||||
|
||||
class LocalStorageBackend(StorageBackend):
    """Store patent PDFs on the local filesystem under a directory."""

    def __init__(self, base_dir: str = "patents"):
        """Create the backend and make sure ``base_dir`` exists.

        Args:
            base_dir: Directory under which all keys are stored.
        """
        self.base_dir = base_dir
        os.makedirs(self.base_dir, exist_ok=True)

    def _full_path(self, key: str) -> str:
        """Return the filesystem path for ``key`` under ``base_dir``."""
        return os.path.join(self.base_dir, key)

    def read(self, key: str) -> bytes:
        """Return the bytes stored under ``key``.

        Raises:
            FileNotFoundError: If no file exists for ``key``.
        """
        path = self._full_path(key)
        # Open directly instead of checking first: avoids a window in which
        # the file can disappear between the check and the open.
        try:
            with open(path, "rb") as f:
                return f.read()
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {path}") from None

    def write(self, key: str, data: bytes) -> None:
        """Store ``data`` under ``key``, replacing any existing file.

        The bytes are written to a sibling temporary file and then moved
        into place, so an interrupted write never leaves a truncated file
        that exists() would report as a valid cached PDF.
        """
        path = self._full_path(key)
        os.makedirs(os.path.dirname(path) or self.base_dir, exist_ok=True)
        tmp_path = f"{path}.{os.getpid()}.tmp"
        try:
            with open(tmp_path, "wb") as f:
                f.write(data)
            os.replace(tmp_path, path)
        except BaseException:
            # Best-effort cleanup of the partial temp file; the original
            # error is what the caller needs to see.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise
        logger.debug("Wrote %d bytes to %s", len(data), path)

    def exists(self, key: str) -> bool:
        """Return True if a file for ``key`` exists and has non-zero size."""
        try:
            return os.path.getsize(self._full_path(key)) > 0
        except OSError:
            # Missing or inaccessible path counts as "not stored".
            return False

    def path_for(self, key: str) -> str:
        """Return the filesystem path for ``key``."""
        return self._full_path(key)
|
||||
|
||||
|
||||
class S3StorageBackend(StorageBackend):
    """Store patent PDFs in an S3-compatible bucket."""

    # Error codes treated as "bucket/object does not exist".
    _NOT_FOUND_CODES = frozenset({"404", "NoSuchKey", "NoSuchBucket", "NotFound"})

    def __init__(
        self,
        bucket: str,
        endpoint_url: str = "",
        access_key: str = "",
        secret_key: str = "",
    ):
        """Create the S3 client and make sure the bucket is usable.

        Args:
            bucket: Name of the bucket that holds the PDFs.
            endpoint_url: Optional endpoint override; omitted from the
                client arguments when empty.
            access_key: Access key id; only passed to the client when
                ``secret_key`` is also given.
            secret_key: Secret access key; only passed to the client when
                ``access_key`` is also given.
        """
        # Imported here rather than at module level, so importing this
        # module does not itself require boto3.
        import boto3

        kwargs: dict = {}
        if endpoint_url:
            kwargs["endpoint_url"] = endpoint_url
        if access_key and secret_key:
            kwargs["aws_access_key_id"] = access_key
            kwargs["aws_secret_access_key"] = secret_key

        self.s3 = boto3.client("s3", **kwargs)
        self.bucket = bucket

        self._ensure_bucket()

    @staticmethod
    def _error_code(exc: Exception) -> str:
        """Return the service error code carried by ``exc``, or ""."""
        response = getattr(exc, "response", None) or {}
        return str(response.get("Error", {}).get("Code", ""))

    def _ensure_bucket(self) -> None:
        """Create the bucket if it is missing (useful for MinIO local dev).

        Never raises: problems are logged as warnings so construction
        still succeeds and the error surfaces on first real use.
        """
        try:
            self.s3.head_bucket(Bucket=self.bucket)
            return
        except self.s3.exceptions.ClientError as exc:
            if self._error_code(exc) not in self._NOT_FOUND_CODES:
                # e.g. access denied: creating the bucket would not help.
                logger.warning("Could not access bucket %s: %s", self.bucket, exc)
                return
        except Exception as exc:
            # Connection-level failure; a create attempt would fail as well.
            logger.warning("Could not access bucket %s: %s", self.bucket, exc)
            return

        try:
            self.s3.create_bucket(Bucket=self.bucket)
            logger.info("Created S3 bucket: %s", self.bucket)
        except Exception as e:
            logger.warning("Could not create bucket %s: %s", self.bucket, e)

    def read(self, key: str) -> bytes:
        """Return the bytes of the object stored under ``key``.

        Raises:
            FileNotFoundError: If the service reports the object as missing.
        """
        try:
            response = self.s3.get_object(Bucket=self.bucket, Key=key)
        except self.s3.exceptions.ClientError as exc:
            # Decide on the structured error code, not on the message text:
            # messages can embed the URL/key, which may itself contain "404".
            if self._error_code(exc) in self._NOT_FOUND_CODES:
                raise FileNotFoundError(
                    f"S3 object not found: s3://{self.bucket}/{key}"
                ) from exc
            raise
        return response["Body"].read()

    def write(self, key: str, data: bytes) -> None:
        """Upload ``data`` as a PDF object under ``key``."""
        self.s3.put_object(
            Bucket=self.bucket,
            Key=key,
            Body=data,
            ContentType="application/pdf",
        )
        logger.debug("Wrote %d bytes to s3://%s/%s", len(data), self.bucket, key)

    def exists(self, key: str) -> bool:
        """Return True if the object exists and has non-zero size.

        Any failure yields False; failures other than "not found" are
        logged so that credential or connectivity problems are visible.
        """
        try:
            response = self.s3.head_object(Bucket=self.bucket, Key=key)
        except self.s3.exceptions.ClientError as exc:
            if self._error_code(exc) not in self._NOT_FOUND_CODES:
                logger.warning(
                    "Could not check s3://%s/%s: %s", self.bucket, key, exc
                )
            return False
        except Exception as exc:
            logger.warning("Could not check s3://%s/%s: %s", self.bucket, key, exc)
            return False
        return response.get("ContentLength", 0) > 0

    def path_for(self, key: str) -> str:
        """Return the ``s3://bucket/key`` URI for ``key``."""
        return f"s3://{self.bucket}/{key}"
|
||||
|
||||
|
||||
def get_storage_backend() -> StorageBackend:
    """Factory: return the configured storage backend instance.

    Reads ``config.storage_backend`` (case-insensitive, surrounding
    whitespace ignored). "s3" selects the S3 backend built from the
    ``config.s3_*`` settings; anything else selects local storage, with a
    warning when the value is not a recognised backend name.

    Returns:
        A new S3StorageBackend or LocalStorageBackend instance.
    """
    backend = (config.storage_backend or "").strip().lower()
    if backend == "s3":
        logger.info("Using S3 storage backend (bucket=%s)", config.s3_bucket)
        return S3StorageBackend(
            bucket=config.s3_bucket,
            endpoint_url=config.s3_endpoint_url,
            access_key=config.s3_access_key,
            secret_key=config.s3_secret_key,
        )
    if backend not in ("", "local"):
        # Make a typo in STORAGE_BACKEND visible instead of silently
        # storing files locally.
        logger.warning(
            "Unknown storage backend %r; falling back to local storage",
            config.storage_backend,
        )
    logger.info("Using local storage backend")
    return LocalStorageBackend()
|
||||
@@ -52,6 +52,29 @@ services:
|
||||
- ./patents:/app/patents
|
||||
restart: unless-stopped
|
||||
|
||||
# Optional: MinIO for S3-compatible local object storage
|
||||
# Start with: docker compose --profile s3 up -d minio (then set STORAGE_BACKEND=s3 in .env to use it)
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: sparc-minio
|
||||
command: server /data --console-address ":9001"
|
||||
environment:
|
||||
MINIO_ROOT_USER: ${AWS_ACCESS_KEY_ID:-minioadmin}
|
||||
MINIO_ROOT_PASSWORD: ${AWS_SECRET_ACCESS_KEY:-minioadmin}
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
volumes:
|
||||
- minio_data:/data
|
||||
healthcheck:
|
||||
test: ["CMD", "mc", "ready", "local"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
restart: unless-stopped
|
||||
profiles:
|
||||
- s3
|
||||
|
||||
dashboard:
|
||||
build: ./frontend
|
||||
container_name: sparc-dashboard
|
||||
@@ -63,3 +86,4 @@ services:
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
minio_data:
|
||||
|
||||
@@ -16,3 +16,4 @@ bcrypt
|
||||
PyJWT
|
||||
slowapi
|
||||
apscheduler
|
||||
boto3
|
||||
|
||||
Reference in New Issue
Block a user