From 90f9cfc826cc0ae3adb4fcbd61133a209a77d521 Mon Sep 17 00:00:00 2001
From: 0xWheatyz <wyatt@leeworks.dev>
Date: Tue, 24 Mar 2026 14:31:43 -0400
Subject: [PATCH] fix(serp): replace hardcoded date range with rolling window

The SERP query used a hardcoded one-week date range (10/28/2025 to
11/4/2025), so it kept returning the same stale patents. It now computes
a rolling window ending today, sized from config (PATENT_SEARCH_DAYS,
default 90 days). Also adds filesystem-level PDF caching to skip
re-downloading existing patent PDFs, and adds a PATENT_THREAD_WORKERS
config setting for upcoming parallel processing.
---
 SPARC/config.py   |  4 ++++
 SPARC/serp_api.py | 32 ++++++++++++++++++++++----------
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/SPARC/config.py b/SPARC/config.py
index 1612b75..31bee7a 100644
--- a/SPARC/config.py
+++ b/SPARC/config.py
@@ -26,6 +26,10 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes")
 # This variable is kept for backwards compatibility but has no effect
 use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")
 
+# Patent search configuration (both values are read from the environment at import time)
+patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90"))  # look-back window in days for SERP.query
+patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5"))  # reserved for upcoming parallel processing
+
 # Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
 # This ensures OpenAPI docs work correctly when accessed via the proxy
 root_path = os.getenv("ROOT_PATH", "")
diff --git a/SPARC/serp_api.py b/SPARC/serp_api.py
index 5959c08..b4254d0 100644
--- a/SPARC/serp_api.py
+++ b/SPARC/serp_api.py
@@ -1,17 +1,20 @@
+import os
 import serpapi
 from SPARC import config
 import re
 import pdfplumber  # pip install pdfplumber
 import requests
+from datetime import datetime, timedelta
 from typing import Dict
 from SPARC.types import Patents, Patent
 
 class SERP:
-  def query(company: str) -> Patents:
+  def query(company: str, days_back: int = None) -> Patents:
     """Query Google Patents for a company's recent patents.
 
     Args:
         company: Name of the company to search for
+        days_back: Number of days to look back for patents (default from config)
 
     Returns:
         Patents object containing list of patents with PDF links
@@ -23,13 +26,19 @@ class SERP:
         patents with restricted access). The returned count may be lower
         than the requested number of results.
     """
+    if days_back is None:
+        days_back = config.patent_search_days
+    end_date = datetime.now()
+    start_date = end_date - timedelta(days=days_back)
+    # Unpadded M/D/YYYY built from attributes: strftime's "%-m"/"%-d" flags are platform-specific (ValueError on Windows).
+    date_filter = "cdr:1,cd_min:{0.month}/{0.day}/{0.year},cd_max:{1.month}/{1.day}/{1.year}".format(start_date, end_date)
     # Make API call
     params = {
       "engine": "google_patents",
       "q": company,
       "num": 10,
       "filter": 1,
-      "tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025",
+      "tbs": date_filter,
       "api_key": config.api_key,
     }
     search = serpapi.search(params)
@@ -46,20 +55,23 @@ class SERP:
 
   def save_patents(patent: Patent) -> Patent:
     """
-    Save the patent PDF to the patents folder
-    
+    Save the patent PDF under patents/, reusing a non-empty cached copy; raises on HTTP errors.
+
     Args:
       patent: Patent object
 
     Returns:
       Patent object with updated PDF path
     """
-    response = requests.get(patent.pdf_link)
-    print(patent.pdf_link)
-    with open(f"patents/{patent.patent_id}.pdf", "wb") as f:
-      f.write(response.content)
-    
-    patent.pdf_path = f"patents/{patent.patent_id}.pdf"
+    pdf_path = f"patents/{patent.patent_id}.pdf"
+    os.makedirs("patents", exist_ok=True)
+    # Download only when there is no non-empty cached copy on disk.
+    if not (os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0):
+      response = requests.get(patent.pdf_link, timeout=30)  # bounded wait; never hang on a stalled server
+      response.raise_for_status()  # an HTTP error body must not be cached as a "PDF"
+      with open(pdf_path, "wb") as f:
+        f.write(response.content)
+    patent.pdf_path = pdf_path
     return patent
 
   def parse_patent_pdf(pdf_path: str) -> Dict: