fix(serp): replace hardcoded date range with rolling window
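The SERP query used a frozen date range (Oct 28 – Nov 4, 2025), so it kept returning stale patents. It now computes a rolling window that ends at the current date, with the window length taken from config (PATENT_SEARCH_DAYS, default 90 days) unless the caller passes days_back explicitly. This commit also adds filesystem-level PDF caching, which skips the download when a non-empty PDF for the patent already exists on disk, and adds a PATENT_THREAD_WORKERS config setting (default 5) for upcoming parallel processing.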
This commit is contained in:
@@ -26,6 +26,10 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes")
|
||||
# This variable is kept for backwards compatibility but has no effect
|
||||
use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")
|
||||
|
||||
# Patent search configuration
|
||||
patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90"))
|
||||
patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5"))
|
||||
|
||||
# Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
|
||||
# This ensures OpenAPI docs work correctly when accessed via the proxy
|
||||
root_path = os.getenv("ROOT_PATH", "")
|
||||
|
||||
+18
-6
@@ -1,17 +1,20 @@
|
||||
import os
|
||||
import serpapi
|
||||
from SPARC import config
|
||||
import re
|
||||
import pdfplumber # pip install pdfplumber
|
||||
import requests
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict
|
||||
from SPARC.types import Patents, Patent
|
||||
|
||||
class SERP:
|
||||
def query(company: str) -> Patents:
|
||||
def query(company: str, days_back: int = None) -> Patents:
|
||||
"""Query Google Patents for a company's recent patents.
|
||||
|
||||
Args:
|
||||
company: Name of the company to search for
|
||||
days_back: Number of days to look back for patents (default from config)
|
||||
|
||||
Returns:
|
||||
Patents object containing list of patents with PDF links
|
||||
@@ -23,13 +26,19 @@ class SERP:
|
||||
patents with restricted access). The returned count may be lower
|
||||
than the requested number of results.
|
||||
"""
|
||||
if days_back is None:
|
||||
days_back = config.patent_search_days
|
||||
end_date = datetime.now()
|
||||
start_date = end_date - timedelta(days=days_back)
|
||||
date_filter = f"cdr:1,cd_min:{start_date.strftime('%-m/%-d/%Y')},cd_max:{end_date.strftime('%-m/%-d/%Y')}"
|
||||
|
||||
# Make API call
|
||||
params = {
|
||||
"engine": "google_patents",
|
||||
"q": company,
|
||||
"num": 10,
|
||||
"filter": 1,
|
||||
"tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025",
|
||||
"tbs": date_filter,
|
||||
"api_key": config.api_key,
|
||||
}
|
||||
search = serpapi.search(params)
|
||||
@@ -46,7 +55,7 @@ class SERP:
|
||||
|
||||
def save_patents(patent: Patent) -> Patent:
|
||||
"""
|
||||
Save the patent PDF to the patents folder
|
||||
Save the patent PDF to the patents folder, skipping download if already cached.
|
||||
|
||||
Args:
|
||||
patent: Patent object
|
||||
@@ -54,12 +63,15 @@ class SERP:
|
||||
Returns:
|
||||
Patent object with updated PDF path
|
||||
"""
|
||||
pdf_path = f"patents/{patent.patent_id}.pdf"
|
||||
os.makedirs("patents", exist_ok=True)
|
||||
|
||||
if not (os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0):
|
||||
response = requests.get(patent.pdf_link)
|
||||
print(patent.pdf_link)
|
||||
with open(f"patents/{patent.patent_id}.pdf", "wb") as f:
|
||||
with open(pdf_path, "wb") as f:
|
||||
f.write(response.content)
|
||||
|
||||
patent.pdf_path = f"patents/{patent.patent_id}.pdf"
|
||||
patent.pdf_path = pdf_path
|
||||
return patent
|
||||
|
||||
def parse_patent_pdf(pdf_path: str) -> Dict:
|
||||
|
||||
Reference in New Issue
Block a user