From 9a43f852599c8b090d67c5f9ca8973f86a41efc7 Mon Sep 17 00:00:00 2001 From: agent-company Date: Thu, 26 Mar 2026 10:17:24 +0000 Subject: [PATCH] feat: add S3/MinIO object storage support for patent PDFs Introduce a StorageBackend abstraction (local filesystem and S3) for patent PDF storage. When STORAGE_BACKEND=s3, PDFs are read/written via boto3 to an S3-compatible bucket instead of the local filesystem. - Add SPARC/storage.py with LocalStorageBackend and S3StorageBackend - Update serp_api.py save_patents and parse_patent_pdf to use storage - Add storage config vars to config.py and .env.example - Add optional MinIO service to docker-compose.yml (--profile s3) - Add boto3 to requirements.txt Closes leeworks-agents/SPARC#38 Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.example | 12 ++++ SPARC/config.py | 7 ++ SPARC/serp_api.py | 55 +++++++++++---- SPARC/storage.py | 171 +++++++++++++++++++++++++++++++++++++++++++++ docker-compose.yml | 24 +++++++ requirements.txt | 1 + 6 files changed, 258 insertions(+), 12 deletions(-) create mode 100644 SPARC/storage.py diff --git a/.env.example b/.env.example index 4e78c43..71df3b5 100644 --- a/.env.example +++ b/.env.example @@ -35,6 +35,18 @@ JWT_SECRET=your-secure-jwt-secret-change-in-production # Defaults to http://localhost:3000,http://localhost:5173 when unset # CORS_ORIGINS=https://sparc.example.com,https://app.example.com +# ---- Storage ---- + +# Backend for patent PDF storage: "local" (default) or "s3" +STORAGE_BACKEND=local + +# S3/MinIO settings (only used when STORAGE_BACKEND=s3) +# S3_BUCKET=sparc-patents +# S3_ENDPOINT_URL=http://localhost:9000 +# AWS_ACCESS_KEY_ID=minioadmin +# AWS_SECRET_ACCESS_KEY=minioadmin +# To start MinIO locally: docker compose --profile s3 up -d minio + # ---- Cache ---- # When USE_CACHE=true: check database for cached responses before making API calls diff --git a/SPARC/config.py b/SPARC/config.py index e6f6173..4d89742 100644 --- a/SPARC/config.py +++ b/SPARC/config.py @@ -53,6 
def save_patents(patent: Patent) -> Patent:
    """Save the patent PDF to storage, skipping download if already cached.

    Uses the configured storage backend (local filesystem or S3). The PDF
    is stored under the key ``"<patent_id>.pdf"``.

    Args:
        patent: Patent object; ``patent_id`` and ``pdf_link`` are read.

    Returns:
        The same Patent object with ``pdf_path`` set to the backend's
        path or URI for the stored PDF.

    Raises:
        requests.RequestException: If the download times out, fails, or
            the server answers with an HTTP error status.
    """
    storage = _get_storage()
    key = f"{patent.patent_id}.pdf"

    if storage.exists(key):
        logger.debug("Using cached PDF for %s", patent.patent_id)
    else:
        logger.info("Downloading PDF for %s", patent.patent_id)
        # (connect, read) timeouts: without them a stalled server blocks
        # the caller indefinitely.
        response = requests.get(patent.pdf_link, timeout=(10, 60))
        # Fail loudly on 4xx/5xx. Otherwise the error page would be stored
        # as the "PDF" and, being non-empty, treated as a valid cache entry
        # on every later call.
        response.raise_for_status()
        storage.write(key, response.content)
        logger.debug("Saved %d bytes for %s", len(response.content), patent.patent_id)

    patent.pdf_path = storage.path_for(key)
    return patent
logger = logging.getLogger(__name__)

# Error codes an S3-compatible service reports for a missing object.
# head_object carries no error body, so it surfaces the bare HTTP status.
_S3_OBJECT_NOT_FOUND_CODES = frozenset({"NoSuchKey", "NotFound", "404"})


def _s3_error_code(error: Exception) -> str:
    """Return the structured service error code attached to *error*.

    Reads ``error.response["Error"]["Code"]``. Returns an empty string when
    the exception carries no such structure (e.g. a connection failure).
    """
    response = getattr(error, "response", None) or {}
    return str(response.get("Error", {}).get("Code", ""))


class StorageBackend(ABC):
    """Abstract base class for patent PDF storage."""

    @abstractmethod
    def read(self, key: str) -> bytes:
        """Read a file by key.

        Args:
            key: Storage key (e.g., "US-12345678-B2.pdf")

        Returns:
            File contents as bytes.

        Raises:
            FileNotFoundError: If the file does not exist.
        """

    @abstractmethod
    def write(self, key: str, data: bytes) -> None:
        """Write data to storage.

        Args:
            key: Storage key (e.g., "US-12345678-B2.pdf")
            data: File contents as bytes.
        """

    @abstractmethod
    def exists(self, key: str) -> bool:
        """Check if a file exists in storage.

        Args:
            key: Storage key.

        Returns:
            True if the file exists and has non-zero size.
        """

    @abstractmethod
    def path_for(self, key: str) -> str:
        """Return a path or URI suitable for downstream consumers.

        For local storage this is a filesystem path; for S3 it is an
        ``s3://<bucket>/<key>`` URI (callers that need a local file should
        use read() and write to a temporary location).
        """


class LocalStorageBackend(StorageBackend):
    """Store patent PDFs on the local filesystem under a directory."""

    def __init__(self, base_dir: str = "patents"):
        """Create the backend, making *base_dir* if it does not exist."""
        self.base_dir = base_dir
        os.makedirs(self.base_dir, exist_ok=True)

    def _full_path(self, key: str) -> str:
        """Map *key* to a path under ``base_dir``.

        Raises:
            ValueError: If the key is empty, absolute, or uses ``..`` to
                point outside ``base_dir``.
        """
        path = os.path.join(self.base_dir, key)
        # Keys are built from externally supplied patent IDs; never let one
        # address a file outside the storage directory.
        root = os.path.abspath(self.base_dir)
        resolved = os.path.abspath(path)
        if resolved == root or os.path.commonpath([root, resolved]) != root:
            raise ValueError(f"Invalid storage key: {key!r}")
        return path

    def read(self, key: str) -> bytes:
        """Return the bytes stored under *key*.

        Raises:
            FileNotFoundError: If no file exists for the key.
        """
        path = self._full_path(key)
        # Open directly rather than checking first: avoids a race between
        # the existence check and the open.
        try:
            with open(path, "rb") as f:
                return f.read()
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {path}") from None

    def write(self, key: str, data: bytes) -> None:
        """Store *data* under *key*, replacing any previous content.

        The data is written to a temporary sibling file and then renamed
        into place, so an interrupted write never leaves a truncated file
        that exists() would report as a valid cached PDF.
        """
        path = self._full_path(key)
        os.makedirs(os.path.dirname(path) or self.base_dir, exist_ok=True)
        tmp_path = f"{path}.{os.urandom(8).hex()}.tmp"
        try:
            with open(tmp_path, "wb") as f:
                f.write(data)
            os.replace(tmp_path, path)
        except BaseException:
            # Best-effort cleanup of the partial temp file, then propagate.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise
        logger.debug("Wrote %d bytes to %s", len(data), path)

    def exists(self, key: str) -> bool:
        """Return True if a non-empty file is stored under *key*."""
        path = self._full_path(key)
        try:
            return os.path.getsize(path) > 0
        except OSError:
            # Missing (or unreadable) path counts as "not stored".
            return False

    def path_for(self, key: str) -> str:
        """Return the filesystem path for *key* (``base_dir`` joined with key)."""
        return self._full_path(key)


class S3StorageBackend(StorageBackend):
    """Store patent PDFs in an S3-compatible bucket."""

    def __init__(
        self,
        bucket: str,
        endpoint_url: str = "",
        access_key: str = "",
        secret_key: str = "",
    ):
        """Create the S3 client and make a best-effort check for the bucket.

        Args:
            bucket: Bucket name to read from and write to.
            endpoint_url: Custom endpoint (e.g. MinIO); omitted when empty.
            access_key: Access key; passed only together with secret_key.
            secret_key: Secret key; passed only together with access_key.
        """
        # Imported lazily so boto3 is only needed when S3 is selected.
        import boto3

        client_kwargs: dict = {}
        if endpoint_url:
            client_kwargs["endpoint_url"] = endpoint_url
        if access_key and secret_key:
            client_kwargs["aws_access_key_id"] = access_key
            client_kwargs["aws_secret_access_key"] = secret_key

        self.s3 = boto3.client("s3", **client_kwargs)
        self.bucket = bucket

        # Ensure bucket exists (useful for MinIO local dev). Deliberately
        # best-effort: construction never fails here, problems are logged.
        try:
            self.s3.head_bucket(Bucket=self.bucket)
        except Exception as head_error:
            logger.debug(
                "head_bucket failed for %s (%s); attempting to create it",
                self.bucket,
                head_error,
            )
            try:
                self.s3.create_bucket(Bucket=self.bucket)
                logger.info("Created S3 bucket: %s", self.bucket)
            except Exception as e:
                logger.warning("Could not create bucket %s: %s", self.bucket, e)

    def read(self, key: str) -> bytes:
        """Return the bytes of the object stored under *key*.

        Raises:
            FileNotFoundError: If the service reports the object as missing.
        """
        try:
            response = self.s3.get_object(Bucket=self.bucket, Key=key)
        except self.s3.exceptions.ClientError as e:
            # Classify by the structured error code, not by searching the
            # message text (which can contain "404" for unrelated reasons).
            if _s3_error_code(e) in _S3_OBJECT_NOT_FOUND_CODES:
                raise FileNotFoundError(
                    f"S3 object not found: s3://{self.bucket}/{key}"
                ) from e
            raise
        body = response["Body"]
        try:
            return body.read()
        finally:
            body.close()

    def write(self, key: str, data: bytes) -> None:
        """Upload *data* as a PDF object under *key*."""
        self.s3.put_object(
            Bucket=self.bucket,
            Key=key,
            Body=data,
            ContentType="application/pdf",
        )
        logger.debug("Wrote %d bytes to s3://%s/%s", len(data), self.bucket, key)

    def exists(self, key: str) -> bool:
        """Return True if a non-empty object is stored under *key*.

        Any failure is reported as False, so callers fall back to fetching
        the PDF again. Failures other than "not found" are logged instead
        of being silently swallowed.
        """
        try:
            response = self.s3.head_object(Bucket=self.bucket, Key=key)
            return response["ContentLength"] > 0
        except Exception as e:
            if _s3_error_code(e) not in _S3_OBJECT_NOT_FOUND_CODES:
                logger.warning(
                    "Could not check s3://%s/%s, treating as missing: %s",
                    self.bucket,
                    key,
                    e,
                )
            return False

    def path_for(self, key: str) -> str:
        """Return the ``s3://<bucket>/<key>`` URI for *key*."""
        return f"s3://{self.bucket}/{key}"


def get_storage_backend() -> StorageBackend:
    """Factory: return the configured storage backend instance.

    Reads ``config.storage_backend`` (case-insensitive, surrounding
    whitespace ignored). "s3" selects S3StorageBackend built from the
    ``config.s3_*`` settings; anything else selects LocalStorageBackend,
    with a warning when the value is not "local".
    """
    raw_value = config.storage_backend
    backend = (raw_value or "local").strip().lower()
    if backend == "s3":
        logger.info("Using S3 storage backend (bucket=%s)", config.s3_bucket)
        return S3StorageBackend(
            bucket=config.s3_bucket,
            endpoint_url=config.s3_endpoint_url,
            access_key=config.s3_access_key,
            secret_key=config.s3_secret_key,
        )
    if backend != "local":
        # A typo here would otherwise silently write PDFs to local disk.
        logger.warning(
            "Unknown STORAGE_BACKEND %r; falling back to local storage", raw_value
        )
    logger.info("Using local storage backend")
    return LocalStorageBackend()
--console-address ":9001" + environment: + MINIO_ROOT_USER: ${AWS_ACCESS_KEY_ID:-minioadmin} + MINIO_ROOT_PASSWORD: ${AWS_SECRET_ACCESS_KEY:-minioadmin} + ports: + - "9000:9000" + - "9001:9001" + volumes: + - minio_data:/data + healthcheck: + test: ["CMD", "mc", "ready", "local"] + interval: 10s + timeout: 5s + retries: 3 + restart: unless-stopped + profiles: + - s3 + dashboard: build: ./frontend container_name: sparc-dashboard @@ -63,3 +86,4 @@ services: volumes: postgres_data: + minio_data: diff --git a/requirements.txt b/requirements.txt index e854576..ad2637d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ pandas bcrypt PyJWT slowapi +boto3