deploy: security hardening, multi-model support, S3 storage, analytics, CI improvements (70 commits) #4
@@ -35,6 +35,18 @@ JWT_SECRET=your-secure-jwt-secret-change-in-production
|
|||||||
# Defaults to http://localhost:3000,http://localhost:5173 when unset
|
# Defaults to http://localhost:3000,http://localhost:5173 when unset
|
||||||
# CORS_ORIGINS=https://sparc.example.com,https://app.example.com
|
# CORS_ORIGINS=https://sparc.example.com,https://app.example.com
|
||||||
|
|
||||||
|
# ---- Storage ----
|
||||||
|
|
||||||
|
# Backend for patent PDF storage: "local" (default) or "s3"
|
||||||
|
STORAGE_BACKEND=local
|
||||||
|
|
||||||
|
# S3/MinIO settings (only used when STORAGE_BACKEND=s3)
|
||||||
|
# S3_BUCKET=sparc-patents
|
||||||
|
# S3_ENDPOINT_URL=http://localhost:9000
|
||||||
|
# AWS_ACCESS_KEY_ID=minioadmin
|
||||||
|
# AWS_SECRET_ACCESS_KEY=minioadmin
|
||||||
|
# To start MinIO locally: docker compose --profile s3 up -d minio
|
||||||
|
|
||||||
# ---- Cache ----
|
# ---- Cache ----
|
||||||
|
|
||||||
# When USE_CACHE=true: check database for cached responses before making API calls
|
# When USE_CACHE=true: check database for cached responses before making API calls
|
||||||
|
|||||||
@@ -53,6 +53,13 @@ root_path = os.getenv("ROOT_PATH", "")
|
|||||||
# Used for safety checks (e.g., refusing default JWT secret in production)
|
# Used for safety checks (e.g., refusing default JWT secret in production)
|
||||||
app_env = os.getenv("APP_ENV", "development")
|
app_env = os.getenv("APP_ENV", "development")
|
||||||
|
|
||||||
|
# Storage backend: "local" (default) or "s3" for S3/MinIO object storage
|
||||||
|
storage_backend = os.getenv("STORAGE_BACKEND", "local")
|
||||||
|
s3_bucket = os.getenv("S3_BUCKET", "sparc-patents")
|
||||||
|
s3_endpoint_url = os.getenv("S3_ENDPOINT_URL", "")
|
||||||
|
s3_access_key = os.getenv("AWS_ACCESS_KEY_ID", "")
|
||||||
|
s3_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", "")
|
||||||
|
|
||||||
# CORS allowed origins (comma-separated)
|
# CORS allowed origins (comma-separated)
|
||||||
# Defaults to localhost dev origins when unset
|
# Defaults to localhost dev origins when unset
|
||||||
_cors_origins_raw = os.getenv("CORS_ORIGINS", "")
|
_cors_origins_raw = os.getenv("CORS_ORIGINS", "")
|
||||||
|
|||||||
+43
-12
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import io
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
@@ -8,8 +9,21 @@ import requests
|
|||||||
import serpapi
|
import serpapi
|
||||||
|
|
||||||
from SPARC import config
|
from SPARC import config
|
||||||
|
from SPARC.storage import StorageBackend, get_storage_backend
|
||||||
from SPARC.types import Patent, Patents
|
from SPARC.types import Patent, Patents
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Module-level storage instance (lazy-initialized)
|
||||||
|
_storage: StorageBackend | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_storage() -> StorageBackend:
    """Return the shared storage backend, building it on first use.

    The instance is kept in the module-level ``_storage`` variable, so
    ``get_storage_backend()`` is only called while that variable is still
    ``None``; every later call returns the same object.
    """
    global _storage
    if _storage is not None:
        return _storage
    _storage = get_storage_backend()
    return _storage
|
||||||
|
|
||||||
|
|
||||||
class SERP:
|
class SERP:
|
||||||
def query(company: str, days_back: int = None) -> Patents:
|
def query(company: str, days_back: int = None) -> Patents:
|
||||||
@@ -57,8 +71,9 @@ class SERP:
|
|||||||
return Patents(patents=patent_ids)
|
return Patents(patents=patent_ids)
|
||||||
|
|
||||||
def save_patents(patent: Patent) -> Patent:
    """Save the patent PDF to storage, skipping download if already cached.

    Uses the configured storage backend (local filesystem or S3).

    Args:
        patent: Patent object; ``patent_id`` names the stored file and
            ``pdf_link`` is the URL the PDF is fetched from.

    Returns:
        Patent object with updated PDF path

    Raises:
        requests.RequestException: If the download times out, the
            connection fails, or the server answers with an HTTP error
            status.
    """
    storage = _get_storage()
    key = f"{patent.patent_id}.pdf"

    if not storage.exists(key):
        logger.info("Downloading PDF for %s", patent.patent_id)
        # (connect, read) timeouts: without them a stalled server blocks
        # this call forever.
        response = requests.get(patent.pdf_link, timeout=(10, 60))
        # Never store an error page under the PDF key: it has non-zero
        # size, so exists() would treat it as a valid cached PDF from
        # then on and the real document would never be fetched.
        response.raise_for_status()
        storage.write(key, response.content)
        logger.debug("Saved %d bytes for %s", len(response.content), patent.patent_id)
    else:
        logger.debug("Using cached PDF for %s", patent.patent_id)

    patent.pdf_path = storage.path_for(key)
    return patent
|
||||||
|
|
||||||
def parse_patent_pdf(pdf_path: str) -> Dict:
|
def parse_patent_pdf(pdf_path: str) -> Dict:
|
||||||
"""Extract structured sections from patent PDF.
|
"""Extract structured sections from patent PDF.
|
||||||
|
|
||||||
Extracts all major sections from a patent PDF including abstract,
|
Extracts all major sections from a patent PDF including abstract,
|
||||||
claims, summary, and detailed description.
|
claims, summary, and detailed description. Supports both local file
|
||||||
|
paths and S3 URIs (s3://bucket/key).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pdf_path: Path to the patent PDF file
|
pdf_path: Local path or S3 URI to the patent PDF file
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary containing all extracted sections
|
Dictionary containing all extracted sections
|
||||||
"""
|
"""
|
||||||
|
logger.debug("Parsing patent PDF: %s", pdf_path)
|
||||||
|
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
if pdf_path.startswith("s3://"):
|
||||||
|
# Read from S3 via storage backend
|
||||||
|
storage = _get_storage()
|
||||||
|
# Extract key from "s3://bucket/key"
|
||||||
|
key = pdf_path.split("/", 3)[-1]
|
||||||
|
data = storage.read(key)
|
||||||
|
pdf_file: io.BytesIO | str = io.BytesIO(data)
|
||||||
|
else:
|
||||||
|
pdf_file = pdf_path
|
||||||
|
|
||||||
|
with pdfplumber.open(pdf_file) as pdf:
|
||||||
# Extract all text
|
# Extract all text
|
||||||
full_text = ""
|
full_text = ""
|
||||||
for page in pdf.pages:
|
for page in pdf.pages:
|
||||||
full_text += page.extract_text() + "\n"
|
full_text += page.extract_text() + "\n"
|
||||||
|
logger.debug("Extracted text from %d pages (%d chars)", len(pdf.pages), len(full_text))
|
||||||
|
|
||||||
# Define section patterns (common in patents)
|
# Define section patterns (common in patents)
|
||||||
sections = {
|
sections = {
|
||||||
|
|||||||
@@ -0,0 +1,171 @@
|
|||||||
|
"""Patent PDF storage abstraction.
|
||||||
|
|
||||||
|
Provides a unified interface for reading and writing patent PDF files,
|
||||||
|
with pluggable backends for local filesystem and S3-compatible object
|
||||||
|
storage (e.g., MinIO, AWS S3).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
from SPARC import config
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class StorageBackend(ABC):
    """Abstract base class for patent PDF storage.

    Concrete backends must implement all four methods; the class cannot be
    instantiated directly.
    """

    @abstractmethod
    def read(self, key: str) -> bytes:
        """Read a file by key.

        Args:
            key: Storage key (e.g., "US-12345678-B2.pdf")

        Returns:
            File contents as bytes.

        Raises:
            FileNotFoundError: If the file does not exist.
        """

    @abstractmethod
    def write(self, key: str, data: bytes) -> None:
        """Write data to storage.

        Args:
            key: Storage key (e.g., "US-12345678-B2.pdf")
            data: File contents as bytes.
        """

    @abstractmethod
    def exists(self, key: str) -> bool:
        """Check if a file exists in storage.

        Args:
            key: Storage key.

        Returns:
            True if the file exists and has non-zero size.
        """

    @abstractmethod
    def path_for(self, key: str) -> str:
        """Return a path or URI suitable for downstream consumers.

        For local storage this is a filesystem path. For S3 it is a URI of
        the form ``s3://<bucket>/<key>``, which is not openable as a local
        file: callers that need the bytes should use read() (and write to
        a temporary location if a real file is required).

        Args:
            key: Storage key.

        Returns:
            Filesystem path or ``s3://`` URI identifying the stored object.
        """
|
||||||
|
|
||||||
|
|
||||||
|
class LocalStorageBackend(StorageBackend):
    """Store patent PDFs on the local filesystem under a directory."""

    def __init__(self, base_dir: str = "patents"):
        """Remember ``base_dir`` and create it if it does not exist yet.

        Args:
            base_dir: Directory that holds the stored files.
        """
        self.base_dir = base_dir
        os.makedirs(self.base_dir, exist_ok=True)

    def _full_path(self, key: str) -> str:
        """Map a storage key to a path below ``base_dir``.

        Args:
            key: Storage key, relative to ``base_dir``.

        Returns:
            ``base_dir`` joined with ``key``.

        Raises:
            ValueError: If ``key`` is empty, absolute, or uses ``..`` to
                resolve outside ``base_dir``.
        """
        root = os.path.abspath(self.base_dir)
        # os.path.join drops ``root`` entirely when ``key`` is absolute, and
        # ``..`` segments can climb out of it, so compare resolved paths.
        candidate = os.path.abspath(os.path.join(root, key))
        try:
            inside = (
                candidate != root
                and os.path.commonpath([root, candidate]) == root
            )
        except ValueError:
            # commonpath() raises when the paths are on different drives.
            inside = False
        if not inside:
            raise ValueError(f"Storage key escapes base directory: {key!r}")
        return os.path.join(self.base_dir, key)

    def read(self, key: str) -> bytes:
        """Return the contents of the file stored under ``key``.

        Raises:
            FileNotFoundError: If no file exists for ``key``.
            ValueError: If ``key`` resolves outside ``base_dir``.
        """
        path = self._full_path(key)
        # Open directly instead of checking existence first, so a file
        # removed between the check and the open cannot slip through.
        try:
            with open(path, "rb") as f:
                return f.read()
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {path}") from None

    def write(self, key: str, data: bytes) -> None:
        """Write ``data`` to the file for ``key``, creating parent directories.

        Raises:
            ValueError: If ``key`` resolves outside ``base_dir``.
        """
        path = self._full_path(key)
        os.makedirs(os.path.dirname(path) or self.base_dir, exist_ok=True)
        with open(path, "wb") as f:
            f.write(data)
        logger.debug("Wrote %d bytes to %s", len(data), path)

    def exists(self, key: str) -> bool:
        """Return True if the file for ``key`` exists and is non-empty.

        Raises:
            ValueError: If ``key`` resolves outside ``base_dir``.
        """
        path = self._full_path(key)
        try:
            return os.path.getsize(path) > 0
        except OSError:
            # Missing or inaccessible file: treat as not stored.
            return False

    def path_for(self, key: str) -> str:
        """Return the filesystem path of the file for ``key``.

        Raises:
            ValueError: If ``key`` resolves outside ``base_dir``.
        """
        return self._full_path(key)
|
||||||
|
|
||||||
|
|
||||||
|
class S3StorageBackend(StorageBackend):
    """Store patent PDFs in an S3-compatible bucket."""

    # Error codes reported in the structured error response when the
    # requested object is missing (get_object vs. head_object differ).
    _MISSING_CODES = frozenset({"NoSuchKey", "NotFound", "404"})

    def __init__(
        self,
        bucket: str,
        endpoint_url: str = "",
        access_key: str = "",
        secret_key: str = "",
    ):
        """Create the S3 client and make a best-effort attempt to ensure the bucket.

        Args:
            bucket: Name of the bucket objects are stored in.
            endpoint_url: Custom endpoint (e.g. a MinIO URL); omitted from
                the client arguments when empty.
            access_key: Access key id; only passed to the client when
                ``secret_key`` is also set.
            secret_key: Secret access key; only passed to the client when
                ``access_key`` is also set.
        """
        # Imported here so the module itself can be imported without boto3
        # when this backend is never constructed.
        import boto3

        kwargs: dict = {}
        if endpoint_url:
            kwargs["endpoint_url"] = endpoint_url
        if access_key and secret_key:
            kwargs["aws_access_key_id"] = access_key
            kwargs["aws_secret_access_key"] = secret_key

        self.s3 = boto3.client("s3", **kwargs)
        self.bucket = bucket

        # Ensure bucket exists (useful for MinIO local dev). Deliberately
        # best-effort: a failure here is logged, not raised.
        try:
            self.s3.head_bucket(Bucket=self.bucket)
        except Exception:
            try:
                self.s3.create_bucket(Bucket=self.bucket)
                logger.info("Created S3 bucket: %s", self.bucket)
            except Exception as e:
                logger.warning("Could not create bucket %s: %s", self.bucket, e)

    @classmethod
    def _is_missing(cls, error: Exception) -> bool:
        """Return True if ``error`` carries a "missing object" error code.

        Looks at the structured ``response["Error"]["Code"]`` attached to
        the exception rather than its message text, which may contain the
        object key or URL and so match by accident.
        """
        response = getattr(error, "response", None) or {}
        code = str(response.get("Error", {}).get("Code", ""))
        return code in cls._MISSING_CODES

    def read(self, key: str) -> bytes:
        """Return the contents of the object stored under ``key``.

        Raises:
            FileNotFoundError: If the service reports the object as missing.
        """
        try:
            response = self.s3.get_object(Bucket=self.bucket, Key=key)
        except self.s3.exceptions.ClientError as e:
            if self._is_missing(e):
                raise FileNotFoundError(
                    f"S3 object not found: s3://{self.bucket}/{key}"
                ) from e
            raise
        return response["Body"].read()

    def write(self, key: str, data: bytes) -> None:
        """Upload ``data`` under ``key`` with a PDF content type."""
        self.s3.put_object(
            Bucket=self.bucket,
            Key=key,
            Body=data,
            ContentType="application/pdf",
        )
        logger.debug("Wrote %d bytes to s3://%s/%s", len(data), self.bucket, key)

    def exists(self, key: str) -> bool:
        """Return True if the object for ``key`` exists and is non-empty.

        Any failure of the lookup yields False; failures other than a
        missing object are logged so misconfiguration is not silent.
        """
        try:
            response = self.s3.head_object(Bucket=self.bucket, Key=key)
            return response["ContentLength"] > 0
        except Exception as e:
            if not self._is_missing(e):
                logger.warning(
                    "Could not check s3://%s/%s, treating as missing: %s",
                    self.bucket,
                    key,
                    e,
                )
            return False

    def path_for(self, key: str) -> str:
        """Return the ``s3://<bucket>/<key>`` URI for ``key``."""
        return f"s3://{self.bucket}/{key}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_storage_backend() -> StorageBackend:
    """Factory: return the configured storage backend instance.

    Reads ``config.storage_backend`` (case-insensitive, surrounding
    whitespace ignored). ``"s3"`` selects S3StorageBackend built from the
    ``config.s3_*`` settings; anything else selects LocalStorageBackend,
    with a warning when the value is not a recognised backend name.

    Returns:
        A ready-to-use storage backend.
    """
    backend = config.storage_backend.strip().lower()
    if backend == "s3":
        logger.info("Using S3 storage backend (bucket=%s)", config.s3_bucket)
        return S3StorageBackend(
            bucket=config.s3_bucket,
            endpoint_url=config.s3_endpoint_url,
            access_key=config.s3_access_key,
            secret_key=config.s3_secret_key,
        )
    if backend not in ("", "local"):
        # A typo would otherwise silently put PDFs on local disk.
        logger.warning(
            "Unknown STORAGE_BACKEND %r; falling back to local storage",
            config.storage_backend,
        )
    logger.info("Using local storage backend")
    return LocalStorageBackend()
|
||||||
@@ -52,6 +52,29 @@ services:
|
|||||||
- ./patents:/app/patents
|
- ./patents:/app/patents
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Optional: MinIO for S3-compatible local object storage
|
||||||
|
# Enable by setting STORAGE_BACKEND=s3 in .env and starting this service with: docker compose --profile s3 up -d minio
|
||||||
|
minio:
|
||||||
|
image: minio/minio:latest
|
||||||
|
container_name: sparc-minio
|
||||||
|
command: server /data --console-address ":9001"
|
||||||
|
environment:
|
||||||
|
MINIO_ROOT_USER: ${AWS_ACCESS_KEY_ID:-minioadmin}
|
||||||
|
MINIO_ROOT_PASSWORD: ${AWS_SECRET_ACCESS_KEY:-minioadmin}
|
||||||
|
ports:
|
||||||
|
- "9000:9000"
|
||||||
|
- "9001:9001"
|
||||||
|
volumes:
|
||||||
|
- minio_data:/data
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "mc", "ready", "local"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
restart: unless-stopped
|
||||||
|
profiles:
|
||||||
|
- s3
|
||||||
|
|
||||||
dashboard:
|
dashboard:
|
||||||
build: ./frontend
|
build: ./frontend
|
||||||
container_name: sparc-dashboard
|
container_name: sparc-dashboard
|
||||||
@@ -63,3 +86,4 @@ services:
|
|||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
postgres_data:
|
postgres_data:
|
||||||
|
minio_data:
|
||||||
|
|||||||
@@ -15,3 +15,4 @@ pandas
|
|||||||
bcrypt
|
bcrypt
|
||||||
PyJWT
|
PyJWT
|
||||||
slowapi
|
slowapi
|
||||||
|
boto3
|
||||||
|
|||||||
Reference in New Issue
Block a user