From 90f9cfc826cc0ae3adb4fcbd61133a209a77d521 Mon Sep 17 00:00:00 2001 From: 0xWheatyz Date: Tue, 24 Mar 2026 14:31:43 -0400 Subject: [PATCH] fix(serp): replace hardcoded date range with rolling window The SERP query had a frozen date range (Oct-Nov 2025) that returned stale patents. Now computes a rolling window from config (PATENT_SEARCH_DAYS, default 90 days). Also adds filesystem-level PDF caching to skip re-downloading existing patent PDFs, and adds PATENT_THREAD_WORKERS config for upcoming parallel processing. --- SPARC/config.py | 4 ++++ SPARC/serp_api.py | 32 ++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/SPARC/config.py b/SPARC/config.py index 1612b75..31bee7a 100644 --- a/SPARC/config.py +++ b/SPARC/config.py @@ -26,6 +26,10 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes") # This variable is kept for backwards compatibility but has no effect use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes") +# Patent search configuration +patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90")) +patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5")) + # Root path for running behind a reverse proxy (e.g., "/api" when served at /api/) # This ensures OpenAPI docs work correctly when accessed via the proxy root_path = os.getenv("ROOT_PATH", "") diff --git a/SPARC/serp_api.py b/SPARC/serp_api.py index 5959c08..b4254d0 100644 --- a/SPARC/serp_api.py +++ b/SPARC/serp_api.py @@ -1,17 +1,20 @@ +import os import serpapi from SPARC import config import re import pdfplumber # pip install pdfplumber import requests +from datetime import datetime, timedelta from typing import Dict from SPARC.types import Patents, Patent class SERP: - def query(company: str) -> Patents: + def query(company: str, days_back: int = None) -> Patents: """Query Google Patents for a company's recent patents. Args: company: Name of the company to search for + days_back: Number of days to look back for patents (default from config) Returns: Patents object containing list of patents with PDF links @@ -23,13 +26,19 @@ class SERP: patents with restricted access). The returned count may be lower than the requested number of results. """ + if days_back is None: + days_back = config.patent_search_days + end_date = datetime.now() + start_date = end_date - timedelta(days=days_back) + date_filter = f"cdr:1,cd_min:{start_date.strftime('%-m/%-d/%Y')},cd_max:{end_date.strftime('%-m/%-d/%Y')}" + # Make API call params = { "engine": "google_patents", "q": company, "num": 10, "filter": 1, - "tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025", + "tbs": date_filter, "api_key": config.api_key, } search = serpapi.search(params) @@ -46,20 +55,23 @@ class SERP: def save_patents(patent: Patent) -> Patent: """ - Save the patent PDF to the patents folder - + Save the patent PDF to the patents folder, skipping download if already cached. + Args: patent: Patent object Returns: Patent object with updated PDF path """ - response = requests.get(patent.pdf_link) - print(patent.pdf_link) - with open(f"patents/{patent.patent_id}.pdf", "wb") as f: - f.write(response.content) - - patent.pdf_path = f"patents/{patent.patent_id}.pdf" + pdf_path = f"patents/{patent.patent_id}.pdf" + os.makedirs("patents", exist_ok=True) + + if not (os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0): + response = requests.get(patent.pdf_link) + with open(pdf_path, "wb") as f: + f.write(response.content) + + patent.pdf_path = pdf_path return patent def parse_patent_pdf(pdf_path: str) -> Dict: