fix(serp): replace hardcoded date range with rolling window
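The SERP query used a hardcoded date range (10/28/2025 to 11/4/2025), so it returned increasingly stale patents. It now computes a rolling window that ends at the current date; the window length comes from config (PATENT_SEARCH_DAYS, default 90 days) and can be overridden per call with the new optional days_back argument. This commit also adds filesystem-level PDF caching, which skips the download when a non-empty PDF for the patent already exists on disk, and adds the PATENT_THREAD_WORKERS config setting (default 5) for upcoming parallel processing.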
This commit is contained in:
@@ -26,6 +26,10 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes")
|
|||||||
# This variable is kept for backwards compatibility but has no effect
|
# This variable is kept for backwards compatibility but has no effect
|
||||||
use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")
|
use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")
|
||||||
|
|
||||||
|
# Patent search configuration
|
||||||
|
patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90"))
|
||||||
|
patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5"))
|
||||||
|
|
||||||
# Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
|
# Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
|
||||||
# This ensures OpenAPI docs work correctly when accessed via the proxy
|
# This ensures OpenAPI docs work correctly when accessed via the proxy
|
||||||
root_path = os.getenv("ROOT_PATH", "")
|
root_path = os.getenv("ROOT_PATH", "")
|
||||||
|
|||||||
+18
-6
@@ -1,17 +1,20 @@
|
|||||||
|
import os
|
||||||
import serpapi
|
import serpapi
|
||||||
from SPARC import config
|
from SPARC import config
|
||||||
import re
|
import re
|
||||||
import pdfplumber # pip install pdfplumber
|
import pdfplumber # pip install pdfplumber
|
||||||
import requests
|
import requests
|
||||||
|
from datetime import datetime, timedelta
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from SPARC.types import Patents, Patent
|
from SPARC.types import Patents, Patent
|
||||||
|
|
||||||
class SERP:
|
class SERP:
|
||||||
def query(company: str) -> Patents:
|
def query(company: str, days_back: int = None) -> Patents:
|
||||||
"""Query Google Patents for a company's recent patents.
|
"""Query Google Patents for a company's recent patents.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
company: Name of the company to search for
|
company: Name of the company to search for
|
||||||
|
days_back: Number of days to look back for patents (default from config)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Patents object containing list of patents with PDF links
|
Patents object containing list of patents with PDF links
|
||||||
@@ -23,13 +26,19 @@ class SERP:
|
|||||||
patents with restricted access). The returned count may be lower
|
patents with restricted access). The returned count may be lower
|
||||||
than the requested number of results.
|
than the requested number of results.
|
||||||
"""
|
"""
|
||||||
|
if days_back is None:
|
||||||
|
days_back = config.patent_search_days
|
||||||
|
end_date = datetime.now()
|
||||||
|
start_date = end_date - timedelta(days=days_back)
|
||||||
|
date_filter = f"cdr:1,cd_min:{start_date.strftime('%-m/%-d/%Y')},cd_max:{end_date.strftime('%-m/%-d/%Y')}"
|
||||||
|
|
||||||
# Make API call
|
# Make API call
|
||||||
params = {
|
params = {
|
||||||
"engine": "google_patents",
|
"engine": "google_patents",
|
||||||
"q": company,
|
"q": company,
|
||||||
"num": 10,
|
"num": 10,
|
||||||
"filter": 1,
|
"filter": 1,
|
||||||
"tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025",
|
"tbs": date_filter,
|
||||||
"api_key": config.api_key,
|
"api_key": config.api_key,
|
||||||
}
|
}
|
||||||
search = serpapi.search(params)
|
search = serpapi.search(params)
|
||||||
@@ -46,7 +55,7 @@ class SERP:
|
|||||||
|
|
||||||
def save_patents(patent: Patent) -> Patent:
|
def save_patents(patent: Patent) -> Patent:
|
||||||
"""
|
"""
|
||||||
Save the patent PDF to the patents folder
|
Save the patent PDF to the patents folder, skipping download if already cached.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
patent: Patent object
|
patent: Patent object
|
||||||
@@ -54,12 +63,15 @@ class SERP:
|
|||||||
Returns:
|
Returns:
|
||||||
Patent object with updated PDF path
|
Patent object with updated PDF path
|
||||||
"""
|
"""
|
||||||
|
pdf_path = f"patents/{patent.patent_id}.pdf"
|
||||||
|
os.makedirs("patents", exist_ok=True)
|
||||||
|
|
||||||
|
if not (os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0):
|
||||||
response = requests.get(patent.pdf_link)
|
response = requests.get(patent.pdf_link)
|
||||||
print(patent.pdf_link)
|
with open(pdf_path, "wb") as f:
|
||||||
with open(f"patents/{patent.patent_id}.pdf", "wb") as f:
|
|
||||||
f.write(response.content)
|
f.write(response.content)
|
||||||
|
|
||||||
patent.pdf_path = f"patents/{patent.patent_id}.pdf"
|
patent.pdf_path = pdf_path
|
||||||
return patent
|
return patent
|
||||||
|
|
||||||
def parse_patent_pdf(pdf_path: str) -> Dict:
|
def parse_patent_pdf(pdf_path: str) -> Dict:
|
||||||
|
|||||||
Reference in New Issue
Block a user