fix(serp): replace hardcoded date range with rolling window

The SERP query used a frozen date range (10/28/2025 to 11/4/2025) that
returned stale patents. It now computes a rolling window ending today,
with the window length read from config (PATENT_SEARCH_DAYS, default
90 days). Also adds filesystem-level PDF caching to skip re-downloading
existing patent PDFs, and adds a PATENT_THREAD_WORKERS config setting
(default 5) for upcoming parallel processing.
This commit is contained in:
2026-03-24 14:31:43 -04:00
parent d387bbbdf3
commit 90f9cfc826
2 changed files with 26 additions and 10 deletions
+4
View File
@@ -26,6 +26,10 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes")
# This variable is kept for backwards compatibility but has no effect # This variable is kept for backwards compatibility but has no effect
use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes") use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")
# Patent search configuration
def _int_env(name: str, default: int, minimum: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset.
        minimum: Smallest accepted value.

    Returns:
        The parsed integer.

    Raises:
        ValueError: If the value is not an integer or is below ``minimum``.
            The message names the offending variable so a bad deployment
            setting is easy to trace.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        value = int(raw)
    except ValueError:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from None
    if value < minimum:
        raise ValueError(f"{name} must be >= {minimum}, got {value}")
    return value


# Length in days of the rolling patent search window. A negative value would
# place the window's start date after its end date, so it is rejected here.
patent_search_days = _int_env("PATENT_SEARCH_DAYS", 90, minimum=0)
# Worker count for patent processing.
# NOTE(review): presumably a thread-pool size, so at least 1 is required --
# confirm once the parallel-processing code that reads this lands.
patent_thread_workers = _int_env("PATENT_THREAD_WORKERS", 5, minimum=1)
# Root path for running behind a reverse proxy (e.g., "/api" when served at /api/) # Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
# This ensures OpenAPI docs work correctly when accessed via the proxy # This ensures OpenAPI docs work correctly when accessed via the proxy
root_path = os.getenv("ROOT_PATH", "") root_path = os.getenv("ROOT_PATH", "")
+22 -10
View File
@@ -1,17 +1,20 @@
import os
import serpapi import serpapi
from SPARC import config from SPARC import config
import re import re
import pdfplumber # pip install pdfplumber import pdfplumber # pip install pdfplumber
import requests import requests
from datetime import datetime, timedelta
from typing import Dict from typing import Dict
from SPARC.types import Patents, Patent from SPARC.types import Patents, Patent
class SERP: class SERP:
def query(company: str) -> Patents: def query(company: str, days_back: int = None) -> Patents:
"""Query Google Patents for a company's recent patents. """Query Google Patents for a company's recent patents.
Args: Args:
company: Name of the company to search for company: Name of the company to search for
days_back: Number of days to look back for patents (default from config)
Returns: Returns:
Patents object containing list of patents with PDF links Patents object containing list of patents with PDF links
@@ -23,13 +26,19 @@ class SERP:
patents with restricted access). The returned count may be lower patents with restricted access). The returned count may be lower
than the requested number of results. than the requested number of results.
""" """
if days_back is None:
days_back = config.patent_search_days
end_date = datetime.now()
start_date = end_date - timedelta(days=days_back)
date_filter = f"cdr:1,cd_min:{start_date.strftime('%-m/%-d/%Y')},cd_max:{end_date.strftime('%-m/%-d/%Y')}"
# Make API call # Make API call
params = { params = {
"engine": "google_patents", "engine": "google_patents",
"q": company, "q": company,
"num": 10, "num": 10,
"filter": 1, "filter": 1,
"tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025", "tbs": date_filter,
"api_key": config.api_key, "api_key": config.api_key,
} }
search = serpapi.search(params) search = serpapi.search(params)
@@ -46,20 +55,23 @@ class SERP:
def save_patents(patent: Patent) -> Patent:
    """
    Save the patent PDF to the patents folder, skipping download if already cached.

    A non-empty file at ``patents/<patent_id>.pdf`` counts as a cache hit.
    Because any non-empty file is trusted on later calls, a download is only
    moved to that path after the HTTP request succeeded and the body was
    written out in full.

    Args:
        patent: Patent object; its ``patent_id`` names the file and its
            ``pdf_link`` is the download URL.

    Returns:
        Patent object with updated PDF path

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the patents folder or the PDF file cannot be written.
    """
    pdf_path = f"patents/{patent.patent_id}.pdf"
    os.makedirs("patents", exist_ok=True)

    cached = os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
    if not cached:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(patent.pdf_link, timeout=30)
        # Fail on 4xx/5xx so an error page is never stored as a "PDF" and
        # then served from the cache on every later call.
        response.raise_for_status()

        # Write to a side file first, then rename: an interrupted write must
        # not leave a truncated but non-empty file at the cache path.
        partial_path = f"{pdf_path}.part"
        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, pdf_path)

    patent.pdf_path = pdf_path
    return patent
def parse_patent_pdf(pdf_path: str) -> Dict: def parse_patent_pdf(pdf_path: str) -> Dict: