fix(serp): replace hardcoded date range with rolling window

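The SERP query had a frozen date range (Oct-Nov 2025) that returned stale patents. The query now computes a rolling window ending at the current date, with its length taken from config (PATENT_SEARCH_DAYS, default 90 days) or from the new optional days_back argument. This commit also adds filesystem-level PDF caching, which skips re-downloading a patent PDF when a non-empty file for it already exists, and adds a PATENT_THREAD_WORKERS config value (default 5) for upcoming parallel processing.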
2026-03-24 14:31:43 -04:00
parent d387bbbdf3
commit 90f9cfc826
2 changed files with 26 additions and 10 deletions
@@ -26,6 +26,10 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes")
 # This variable is kept for backwards compatibility but has no effect
 use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")

+# Patent search configuration
+patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90"))
+patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5"))
+
 # Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
 # This ensures OpenAPI docs work correctly when accessed via the proxy
 root_path = os.getenv("ROOT_PATH", "")
@@ -1,17 +1,20 @@
+import os
 import serpapi
 from SPARC import config
 import re
 import pdfplumber  # pip install pdfplumber
 import requests
+from datetime import datetime, timedelta
 from typing import Dict
 from SPARC.types import Patents, Patent

 class SERP:
-  def query(company: str) -> Patents:
+  def query(company: str, days_back: int = None) -> Patents:
    """Query Google Patents for a company's recent patents.

    Args:
        company: Name of the company to search for
+        days_back: Number of days to look back for patents (default from config)

    Returns:
        Patents object containing list of patents with PDF links
@@ -23,13 +26,19 @@ class SERP:
        patents with restricted access). The returned count may be lower
        than the requested number of results.
    """
+    if days_back is None:
+        days_back = config.patent_search_days
+    end_date = datetime.now()
+    start_date = end_date - timedelta(days=days_back)
+    date_filter = f"cdr:1,cd_min:{start_date.strftime('%-m/%-d/%Y')},cd_max:{end_date.strftime('%-m/%-d/%Y')}"
+
    # Make API call
    params = {
      "engine": "google_patents",
      "q": company,
      "num": 10,
      "filter": 1,
-      "tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025",
+      "tbs": date_filter,
      "api_key": config.api_key,
    }
    search = serpapi.search(params)
@@ -46,7 +55,7 @@ class SERP:

  def save_patents(patent: Patent) -> Patent:
    """
-    Save the patent PDF to the patents folder
+    Save the patent PDF to the patents folder, skipping download if already cached.

    Args:
      patent: Patent object
@@ -54,12 +63,15 @@ class SERP:
    Returns:
      Patent object with updated PDF path
    """
+    pdf_path = f"patents/{patent.patent_id}.pdf"
+    os.makedirs("patents", exist_ok=True)
+
+    if not (os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0):
      response = requests.get(patent.pdf_link)
-    print(patent.pdf_link)
-    with open(f"patents/{patent.patent_id}.pdf", "wb") as f:
+      with open(pdf_path, "wb") as f:
        f.write(response.content)

-    patent.pdf_path = f"patents/{patent.patent_id}.pdf"
+    patent.pdf_path = pdf_path
    return patent

  def parse_patent_pdf(pdf_path: str) -> Dict: