Compare commits

...

7 Commits

Author SHA1 Message Date
0xWheatyz 9c971dac72 fix(analyzer): route _analyze_company_safe through cache-aware path
Build and Push Docker Images / build-api (push) Successful in 2m19s
Build and Push Docker Images / build-frontend (push) Successful in 1m49s
_analyze_company_safe was calling SERP.query directly, bypassing the
SERP query cache in analyze_company. Now delegates fully to
analyze_company() and reads patent_count from the serp_queries cache.
2026-03-24 15:02:19 -04:00
0xWheatyz 6f0b448044 test(analyzer,serp): add tests for caching, single query, and parallel processing
- Add TestSingleQueryBugFix: verify SERP.query called once per analysis
- Add TestPatentCaching: DB cache hit/miss, SERP query cache hit/miss
- Add TestDynamicDateRange: rolling window, days_back param
- Add TestFilesystemPDFCaching: skip download, redownload empty files
- Add autouse mock_db fixture to prevent real DB connections in all tests
2026-03-24 14:39:09 -04:00
0xWheatyz 1a297eb60b feat(analyzer): integrate DB patent and SERP query caching
Before querying SERP API, check serp_queries cache (24h TTL). Before
downloading/parsing each patent, check patents table for cached
minimized_content. Store results after processing so repeated analyses
skip all network I/O and PDF parsing entirely.
2026-03-24 14:35:24 -04:00
0xWheatyz 3154f6b732 feat(database): add patent/serp caching tables and connection pooling
- Add patents table (patent_id PK, raw_sections JSONB, minimized_content)
- Add serp_queries table (query_hash unique, result_patent_ids, expires_at)
- Add cache methods: get/store_patent, get/store_serp_query
- Replace single connection with ThreadedConnectionPool (min=2, max=10)
- Add get_conn() context manager for thread-safe connection checkout
- Legacy single-connection path preserved for backwards compatibility
2026-03-24 14:34:33 -04:00
0xWheatyz b9bb3dc1cd perf(analyzer): parallelize patent download/parse/minimize with threads
Replace the sequential per-patent loop with a ThreadPoolExecutor
(workers controlled by PATENT_THREAD_WORKERS config). Each patent is
processed independently in _process_single_patent, which is thread-safe
since SERP methods are stateless and operate on separate files.
2026-03-24 14:32:23 -04:00
0xWheatyz 90f9cfc826 fix(serp): replace hardcoded date range with rolling window
The SERP query had a frozen date range (Oct-Nov 2025) that returned
stale patents. Now computes a rolling window from config
(PATENT_SEARCH_DAYS, default 90 days). Also adds filesystem-level PDF
caching to skip re-downloading existing patent PDFs, and adds
PATENT_THREAD_WORKERS config for upcoming parallel processing.
2026-03-24 14:31:43 -04:00
0xWheatyz d387bbbdf3 fix(analyzer): eliminate double SERP.query() call per company analysis
_analyze_company_safe called SERP.query() then passed the company name
to analyze_company() which called SERP.query() again — doubling API
usage. Now analyze_company() accepts an optional patents param so callers
can pass pre-fetched results through.
2026-03-24 14:16:49 -04:00
6 changed files with 544 additions and 46 deletions
+91 -29
View File
@@ -4,26 +4,33 @@ This module ties together patent retrieval, parsing, and LLM analysis
to provide company performance estimation based on patent portfolios.
"""
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable
from SPARC import config
from SPARC.database import DatabaseClient
from SPARC.serp_api import SERP
from SPARC.llm import LLMAnalyzer
from SPARC.types import Patent, CompanyAnalysisResult, BatchAnalysisResult
from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
class CompanyAnalyzer:
"""Orchestrates end-to-end company performance analysis via patents."""
def __init__(self, openrouter_api_key: str | None = None):
def __init__(self, openrouter_api_key: str | None = None, db_client: DatabaseClient | None = None):
    """Set up the LLM analyzer and the database client used for caching.

    Args:
        openrouter_api_key: Optional OpenRouter API key, forwarded to
            LLMAnalyzer as ``api_key``.
        db_client: Optional DatabaseClient to use for caching. When falsy,
            one is built from ``config.database_url``.
    """
    self.llm_analyzer = LLMAnalyzer(api_key=openrouter_api_key)

    if not db_client:
        db_client = DatabaseClient(config.database_url)
    self.db = db_client

    # Open the connection and make sure the tables exist before first use.
    self.db.connect()
    self.db.initialize_schema()
def analyze_company(self, company_name: str) -> str:
def analyze_company(self, company_name: str, patents: "Patents | None" = None) -> str:
"""Analyze a company's performance based on their patent portfolio.
This is the main entry point that orchestrates the full pipeline:
@@ -35,40 +42,52 @@ class CompanyAnalyzer:
Args:
company_name: Name of the company to analyze
patents: Optional pre-fetched Patents result to avoid duplicate API calls
Returns:
Comprehensive analysis of company's innovation and performance outlook
"""
print(f"Retrieving patents for {company_name}...")
patents = SERP.query(company_name)
if patents is None:
# Check SERP query cache first
query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest()
cached_ids = self.db.get_cached_serp_query(query_hash)
if cached_ids is not None:
print(f"Using cached SERP results for {company_name} ({len(cached_ids)} patents)")
patents = Patents(patents=[
Patent(patent_id=pid, pdf_link="")
for pid in cached_ids
])
else:
print(f"Retrieving patents for {company_name}...")
patents = SERP.query(company_name)
# Cache the SERP results
if patents.patents:
self.db.store_serp_query(
company_name=company_name,
query_hash=query_hash,
patent_ids=[p.patent_id for p in patents.patents],
)
if not patents.patents:
return f"No patents found for {company_name}"
print(f"Found {len(patents.patents)} patents. Processing...")
# Download and parse each patent
# Download, parse, and minimize patents in parallel
processed_patents = []
for idx, patent in enumerate(patents.patents, 1):
print(f"Processing patent {idx}/{len(patents.patents)}: {patent.patent_id}")
try:
# Download PDF
patent = SERP.save_patents(patent)
# Parse sections from PDF
sections = SERP.parse_patent_pdf(patent.pdf_path)
# Minimize for LLM (remove bloat)
minimized_content = SERP.minimize_patent_for_llm(sections)
processed_patents.append(
{"patent_id": patent.patent_id, "content": minimized_content}
)
except Exception as e:
print(f"Warning: Failed to process {patent.patent_id}: {e}")
continue
with ThreadPoolExecutor(max_workers=config.patent_thread_workers) as executor:
future_to_patent = {
executor.submit(self._process_single_patent, patent, company_name, self.db): patent
for patent in patents.patents
}
for future in as_completed(future_to_patent):
patent = future_to_patent[future]
try:
result = future.result()
if result:
processed_patents.append(result)
except Exception as e:
print(f"Warning: Failed to process {patent.patent_id}: {e}")
if not processed_patents:
return f"Failed to process any patents for {company_name}"
@@ -113,6 +132,46 @@ class CompanyAnalyzer:
except Exception as e:
return f"Failed to analyze patent {patent_id}: {e}"
@staticmethod
def _process_single_patent(
    patent: Patent,
    company_name: str = "",
    db: DatabaseClient | None = None,
) -> dict | None:
    """Download, parse, and minimize a single patent.

    The DB cache is consulted first and updated after processing, but it is
    treated as best-effort: a failing cache read falls through to the full
    download/parse path, and a failing cache write does not discard content
    that was already produced.

    Args:
        patent: Patent to process; ``patent_id`` and ``pdf_link`` are read.
        company_name: Company name stored alongside the cached patent.
        db: Optional DatabaseClient used for caching. Skipped when None.

    Returns:
        Dict with ``patent_id`` and ``content`` (the minimized text), or
        None when download/parse/minimize fails.
    """
    patent_id = patent.patent_id

    # Cache lookup: an error here must not prevent normal processing.
    if db is not None:
        try:
            cached = db.get_cached_patent(patent_id)
        except Exception as e:
            print(f"Warning: Patent cache lookup failed for {patent_id}: {e}")
            cached = None
        if cached and cached.get("minimized_content"):
            return {"patent_id": patent_id, "content": cached["minimized_content"]}

    # Full processing: download, parse, minimize.
    try:
        patent = SERP.save_patents(patent)
        sections = SERP.parse_patent_pdf(patent.pdf_path)
        minimized_content = SERP.minimize_patent_for_llm(sections)
    except Exception as e:
        print(f"Warning: Failed to process {patent_id}: {e}")
        return None

    # Cache store: the content is already computed, so only warn on failure.
    if db is not None:
        try:
            db.store_patent(
                patent_id=patent.patent_id,
                company_name=company_name,
                pdf_link=patent.pdf_link,
                raw_sections=sections,
                minimized_content=minimized_content,
            )
        except Exception as e:
            print(f"Warning: Failed to cache patent {patent.patent_id}: {e}")

    return {"patent_id": patent.patent_id, "content": minimized_content}
def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult:
"""Internal wrapper that catches exceptions and returns structured result.
@@ -123,11 +182,14 @@ class CompanyAnalyzer:
CompanyAnalysisResult with success/failure status
"""
try:
patents = SERP.query(company_name)
patent_count = len(patents.patents) if patents.patents else 0
# Delegate to analyze_company which handles SERP/patent caching
analysis = self.analyze_company(company_name)
# Determine patent count from cached SERP query
query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest()
cached_ids = self.db.get_cached_serp_query(query_hash)
patent_count = len(cached_ids) if cached_ids else 0
# Check if analysis indicates failure
if analysis.startswith("No patents found") or analysis.startswith(
"Failed to process"
+4
View File
@@ -26,6 +26,10 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes")
# This variable is kept for backwards compatibility but has no effect
use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")
# Patent search configuration
# PATENT_SEARCH_DAYS: size of the rolling look-back window in days. Clamped to
# >= 0 so a negative value cannot produce a start date after the end date.
patent_search_days = max(0, int(os.getenv("PATENT_SEARCH_DAYS", "90")))
# PATENT_THREAD_WORKERS: worker count for parallel patent processing. Clamped
# to >= 1 because ThreadPoolExecutor rejects max_workers <= 0 with ValueError.
patent_thread_workers = max(1, int(os.getenv("PATENT_THREAD_WORKERS", "5")))
# Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
# This ensures OpenAPI docs work correctly when accessed via the proxy
root_path = os.getenv("ROOT_PATH", "")
+146 -4
View File
@@ -1,9 +1,11 @@
"""Database client for storing and retrieving LLM messages and user authentication."""
import contextlib
import psycopg2
from psycopg2.pool import ThreadedConnectionPool
from psycopg2.extras import RealDictCursor
from typing import Dict, List, Optional
from datetime import datetime
from datetime import datetime, timedelta
import json
import hashlib
import bcrypt
@@ -12,24 +14,49 @@ import bcrypt
class DatabaseClient:
"""Handles database operations for message storage and retrieval."""
def __init__(self, database_url: str):
def __init__(self, database_url: str, minconn: int = 2, maxconn: int = 10):
    """Record connection settings; no connection is opened here.

    Args:
        database_url: PostgreSQL connection string
        minconn: Minimum connections in the pool
        maxconn: Maximum connections in the pool
    """
    # Pool bounds are only stored; the pool itself starts out unset.
    self._minconn, self._maxconn = minconn, maxconn
    self.database_url = database_url

    # Legacy single connection kept for backwards compatibility.
    self.conn = None
    self._pool: ThreadedConnectionPool | None = None
def _ensure_pool(self):
    """Create the connection pool if it doesn't exist yet (or was closed).

    Creation is serialised with a lock so that several threads hitting an
    unset pool at the same time build exactly one ThreadedConnectionPool
    instead of each creating (and all but one leaking) their own.
    """
    import threading

    # Fast path: a live pool needs no locking.
    pool = self._pool
    if pool is not None and not pool.closed:
        return

    # The lock is created lazily so this method does not depend on __init__
    # having defined it. dict.setdefault on a built-in dict is atomic in
    # CPython, so concurrent callers all end up sharing the same lock object.
    lock = self.__dict__.setdefault("_pool_lock", threading.Lock())
    with lock:
        # Re-check under the lock: another thread may have won the race.
        if self._pool is None or self._pool.closed:
            self._pool = ThreadedConnectionPool(
                self._minconn, self._maxconn, self.database_url
            )
@contextlib.contextmanager
def get_conn(self):
    """Yield a connection borrowed from the pool; hand it back on exit.

    The pool is created first if needed. The connection is returned to the
    pool whether the ``with`` body finishes normally or raises.
    """
    self._ensure_pool()
    borrowed = self._pool.getconn()
    try:
        yield borrowed
    finally:
        # Always give the connection back, even on error in the caller.
        self._pool.putconn(borrowed)
def connect(self):
    """Open the legacy single connection unless a live one already exists."""
    current = self.conn
    if current and not current.closed:
        # Already connected; nothing to do.
        return
    self.conn = psycopg2.connect(self.database_url)
def close(self):
    """Close the legacy connection and the pool, whichever are still open."""
    legacy_conn = self.conn
    pool = self._pool

    if legacy_conn and not legacy_conn.closed:
        legacy_conn.close()

    if pool and not pool.closed:
        pool.closeall()
def initialize_schema(self):
"""Create database tables if they don't exist."""
@@ -110,6 +137,40 @@ class DatabaseClient:
ON users(email)
""")
# Create patents cache table
cursor.execute("""
CREATE TABLE IF NOT EXISTS patents (
patent_id VARCHAR(64) PRIMARY KEY,
company_name VARCHAR(255),
pdf_link TEXT,
raw_sections JSONB,
minimized_content TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_patents_company
ON patents(company_name)
""")
# Create SERP query cache table
cursor.execute("""
CREATE TABLE IF NOT EXISTS serp_queries (
id SERIAL PRIMARY KEY,
company_name VARCHAR(255),
query_hash VARCHAR(64) UNIQUE,
result_patent_ids TEXT[],
expires_at TIMESTAMP NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_serp_queries_hash
ON serp_queries(query_hash)
""")
self.conn.commit()
@staticmethod
@@ -320,6 +381,87 @@ class DatabaseClient:
"period_days": days,
}
# Patent Cache Methods
def get_cached_patent(self, patent_id: str) -> Optional[Dict]:
    """Fetch the cached row for one patent from the ``patents`` table.

    Args:
        patent_id: Primary key to look up.

    Returns:
        The full row as a plain dict, or None when no row matches.
    """
    lookup_sql = "SELECT * FROM patents WHERE patent_id = %s"
    with self.get_conn() as conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(lookup_sql, (patent_id,))
        record = cur.fetchone()

    if not record:
        return None
    # Copy into a plain dict so callers do not hold a cursor row type.
    return dict(record)
def store_patent(
    self,
    patent_id: str,
    company_name: str,
    pdf_link: str,
    raw_sections: Dict,
    minimized_content: str,
) -> None:
    """Insert or refresh one processed patent in the ``patents`` table.

    On a ``patent_id`` conflict only ``raw_sections`` and
    ``minimized_content`` are overwritten; the other columns keep their
    stored values. ``raw_sections`` is serialised to JSON before sending.
    """
    upsert_sql = """
        INSERT INTO patents (patent_id, company_name, pdf_link, raw_sections, minimized_content)
        VALUES (%s, %s, %s, %s, %s)
        ON CONFLICT (patent_id) DO UPDATE SET
            raw_sections = EXCLUDED.raw_sections,
            minimized_content = EXCLUDED.minimized_content
        """
    with self.get_conn() as conn:
        with conn.cursor() as cur:
            values = (
                patent_id,
                company_name,
                pdf_link,
                json.dumps(raw_sections),
                minimized_content,
            )
            cur.execute(upsert_sql, values)
        conn.commit()
def get_cached_serp_query(self, query_hash: str) -> Optional[List[str]]:
    """Return the cached patent IDs for a query hash, if still valid.

    Args:
        query_hash: Hash identifying the cached SERP query.

    Returns:
        The stored ``result_patent_ids`` when a row with this hash exists
        and its ``expires_at`` is later than the database's NOW();
        otherwise None.
    """
    lookup_sql = """
        SELECT result_patent_ids FROM serp_queries
        WHERE query_hash = %s AND expires_at > NOW()
        """
    with self.get_conn() as conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(lookup_sql, (query_hash,))
        hit = cur.fetchone()

    if not hit:
        return None
    return hit["result_patent_ids"]
def store_serp_query(
    self,
    company_name: str,
    query_hash: str,
    patent_ids: List[str],
    ttl_hours: int = 24,
) -> None:
    """Insert or refresh the cached result of a SERP query.

    The expiry is computed by the database as ``NOW() + ttl`` rather than
    from the application's local clock. The cache lookup compares
    ``expires_at`` against the database's ``NOW()``, so deriving both from
    the same clock keeps the TTL exact even when the application host and
    the database differ in clock or timezone.

    Args:
        company_name: Company the query was run for.
        query_hash: Unique hash identifying the query; conflict key.
        patent_ids: Patent IDs returned by the query.
        ttl_hours: Hours until the cached entry expires.
    """
    # psycopg2 adapts timedelta to a PostgreSQL interval.
    ttl = timedelta(hours=ttl_hours)
    upsert_sql = """
        INSERT INTO serp_queries (company_name, query_hash, result_patent_ids, expires_at)
        VALUES (%s, %s, %s, NOW() + %s)
        ON CONFLICT (query_hash) DO UPDATE SET
            result_patent_ids = EXCLUDED.result_patent_ids,
            expires_at = EXCLUDED.expires_at
        """
    with self.get_conn() as conn:
        with conn.cursor() as cursor:
            cursor.execute(upsert_sql, (company_name, query_hash, patent_ids, ttl))
        conn.commit()
# User Authentication Methods
@staticmethod
+22 -10
View File
@@ -1,17 +1,20 @@
import os
import serpapi
from SPARC import config
import re
import pdfplumber # pip install pdfplumber
import requests
from datetime import datetime, timedelta
from typing import Dict
from SPARC.types import Patents, Patent
class SERP:
def query(company: str) -> Patents:
def query(company: str, days_back: int = None) -> Patents:
"""Query Google Patents for a company's recent patents.
Args:
company: Name of the company to search for
days_back: Number of days to look back for patents (default from config)
Returns:
Patents object containing list of patents with PDF links
@@ -23,13 +26,19 @@ class SERP:
patents with restricted access). The returned count may be lower
than the requested number of results.
"""
if days_back is None:
days_back = config.patent_search_days
end_date = datetime.now()
start_date = end_date - timedelta(days=days_back)
date_filter = f"cdr:1,cd_min:{start_date.strftime('%-m/%-d/%Y')},cd_max:{end_date.strftime('%-m/%-d/%Y')}"
# Make API call
params = {
"engine": "google_patents",
"q": company,
"num": 10,
"filter": 1,
"tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025",
"tbs": date_filter,
"api_key": config.api_key,
}
search = serpapi.search(params)
@@ -46,20 +55,23 @@ class SERP:
def save_patents(patent: Patent) -> Patent:
    """
    Save the patent PDF to the patents folder, skipping download if already cached.

    A non-empty file at ``patents/<patent_id>.pdf`` counts as cached. The
    HTTP status is checked before anything is written, so an error response
    is never stored as a non-empty "PDF" that later calls would treat as a
    valid cached file.

    Args:
        patent: Patent object

    Returns:
        Patent object with updated PDF path

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    pdf_path = f"patents/{patent.patent_id}.pdf"
    os.makedirs("patents", exist_ok=True)

    already_cached = os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
    if not already_cached:
        response = requests.get(patent.pdf_link)
        # Fail before opening the file so no bogus cache entry is created.
        response.raise_for_status()
        with open(pdf_path, "wb") as f:
            f.write(response.content)

    patent.pdf_path = pdf_path
    return patent
def parse_patent_pdf(pdf_path: str) -> Dict:
+191 -3
View File
@@ -1,11 +1,22 @@
"""Tests for the high-level company analyzer orchestration."""
import pytest
from unittest.mock import Mock, patch, call
from unittest.mock import Mock, patch, call, MagicMock
from SPARC.analyzer import CompanyAnalyzer
from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
@pytest.fixture(autouse=True)
def mock_db(mocker):
    """Replace SPARC.analyzer.DatabaseClient with a stub for every test.

    The stub reports a cache miss for both patent and SERP-query lookups
    by default; individual tests override those return values as needed.
    """
    fake_client = MagicMock()
    fake_client.get_cached_serp_query.return_value = None
    fake_client.get_cached_patent.return_value = None
    # Constructing DatabaseClient(...) inside the analyzer yields the stub.
    mocker.patch("SPARC.analyzer.DatabaseClient", return_value=fake_client)
    return fake_client
class TestCompanyAnalyzer:
"""Test the CompanyAnalyzer orchestration logic."""
@@ -17,7 +28,7 @@ class TestCompanyAnalyzer:
mock_llm.assert_called_once_with(api_key="test-key")
def test_analyze_company_full_pipeline(self, mocker):
def test_analyze_company_full_pipeline(self, mocker, mock_db):
"""Test complete company analysis pipeline."""
# Mock all the dependencies
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
@@ -178,6 +189,180 @@ class TestCompanyAnalyzer:
assert "PDF not found" in result
class TestSingleQueryBugFix:
    """Regression tests: one company analysis issues at most one SERP query."""

    @staticmethod
    def _patch_pipeline(mocker):
        """Stub the SERP helpers and LLMAnalyzer; return the SERP.query mock."""
        query_mock = mocker.patch("SPARC.analyzer.SERP.query")
        save_mock = mocker.patch("SPARC.analyzer.SERP.save_patents")
        parse_mock = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        minimize_mock = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        llm_cls = mocker.patch("SPARC.analyzer.LLMAnalyzer")

        def fake_save(p):
            p.pdf_path = "patents/US123.pdf"
            return p

        save_mock.side_effect = fake_save
        parse_mock.return_value = {"abstract": "Test"}
        minimize_mock.return_value = "Content"

        llm_instance = Mock()
        llm_instance.analyze_patent_portfolio.return_value = "Analysis"
        llm_cls.return_value = llm_instance
        return query_mock

    def test_analyze_company_safe_calls_query_once(self, mocker, mock_db):
        """_analyze_company_safe should call SERP.query exactly once."""
        query_mock = self._patch_pipeline(mocker)
        query_mock.return_value = Patents(
            patents=[Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")]
        )

        CompanyAnalyzer()._analyze_company_safe("TestCorp")

        # The key assertion: SERP.query called exactly once, not twice
        query_mock.assert_called_once_with("TestCorp")

    def test_analyze_company_with_prefetched_patents_skips_query(self, mocker):
        """analyze_company should not call SERP.query when patents are provided."""
        query_mock = self._patch_pipeline(mocker)
        prefetched = Patents(
            patents=[Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")]
        )

        CompanyAnalyzer().analyze_company("TestCorp", patents=prefetched)

        # SERP.query should never be called
        query_mock.assert_not_called()
class TestPatentCaching:
    """Exercise the DB-backed patent cache and the SERP query cache."""

    PDF_LINK = "http://example.com/test.pdf"

    @staticmethod
    def _fake_save(p):
        """Stand-in for SERP.save_patents that only sets the PDF path."""
        p.pdf_path = "patents/US123.pdf"
        return p

    @staticmethod
    def _stub_llm(mocker):
        """Patch LLMAnalyzer so portfolio analysis returns "Analysis"."""
        llm_cls = mocker.patch("SPARC.analyzer.LLMAnalyzer")
        llm_instance = Mock()
        llm_instance.analyze_patent_portfolio.return_value = "Analysis"
        llm_cls.return_value = llm_instance

    def test_process_single_patent_uses_db_cache(self, mocker, mock_db):
        """_process_single_patent returns cached content when available."""
        save_mock = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mock_db.get_cached_patent.return_value = {
            "patent_id": "US123",
            "minimized_content": "Cached minimized content",
        }

        outcome = CompanyAnalyzer._process_single_patent(
            Patent(patent_id="US123", pdf_link=self.PDF_LINK), "TestCorp", mock_db
        )

        assert outcome == {"patent_id": "US123", "content": "Cached minimized content"}
        # Should NOT download since cache hit
        save_mock.assert_not_called()

    def test_process_single_patent_stores_to_db_cache(self, mocker, mock_db):
        """_process_single_patent stores result in DB after processing."""
        save_mock = mocker.patch("SPARC.analyzer.SERP.save_patents")
        parse_mock = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        minimize_mock = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        save_mock.side_effect = self._fake_save
        parse_mock.return_value = {"abstract": "Test abstract"}
        minimize_mock.return_value = "Minimized content"

        # No cache hit
        mock_db.get_cached_patent.return_value = None

        outcome = CompanyAnalyzer._process_single_patent(
            Patent(patent_id="US123", pdf_link=self.PDF_LINK), "TestCorp", mock_db
        )

        assert outcome == {"patent_id": "US123", "content": "Minimized content"}
        mock_db.store_patent.assert_called_once_with(
            patent_id="US123",
            company_name="TestCorp",
            pdf_link="http://example.com/test.pdf",
            raw_sections={"abstract": "Test abstract"},
            minimized_content="Minimized content",
        )

    def test_serp_query_cache_hit_skips_api(self, mocker, mock_db):
        """When SERP query is cached, API call is skipped."""
        query_mock = mocker.patch("SPARC.analyzer.SERP.query")
        save_mock = mocker.patch("SPARC.analyzer.SERP.save_patents")
        # Patched only so the real parser/minimizer can never run.
        mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        self._stub_llm(mocker)

        # Simulate SERP cache hit and patent cache hit
        mock_db.get_cached_serp_query.return_value = ["US123"]
        mock_db.get_cached_patent.return_value = {
            "patent_id": "US123",
            "minimized_content": "Cached content",
        }

        outcome = CompanyAnalyzer().analyze_company("TestCorp")

        assert outcome == "Analysis"
        # SERP.query should NOT be called
        query_mock.assert_not_called()
        # No downloads should happen
        save_mock.assert_not_called()

    def test_serp_query_cache_miss_stores_result(self, mocker, mock_db):
        """When SERP query cache misses, result is stored after API call."""
        query_mock = mocker.patch("SPARC.analyzer.SERP.query")
        save_mock = mocker.patch("SPARC.analyzer.SERP.save_patents")
        parse_mock = mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        minimize_mock = mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        self._stub_llm(mocker)

        mock_db.get_cached_serp_query.return_value = None
        query_mock.return_value = Patents(
            patents=[Patent(patent_id="US123", pdf_link=self.PDF_LINK)]
        )
        save_mock.side_effect = self._fake_save
        parse_mock.return_value = {"abstract": "Test"}
        minimize_mock.return_value = "Content"

        CompanyAnalyzer().analyze_company("TestCorp")

        mock_db.store_serp_query.assert_called_once()
        stored = mock_db.store_serp_query.call_args[1]
        assert stored["company_name"] == "TestCorp"
        assert stored["patent_ids"] == ["US123"]
class TestBatchProcessing:
"""Test multi-company batch processing functionality."""
@@ -316,7 +501,7 @@ class TestBatchProcessing:
assert callback.call_count == 2
def test_company_analysis_result_structure(self, mocker):
def test_company_analysis_result_structure(self, mocker, mock_db):
"""Test CompanyAnalysisResult has correct structure."""
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
@@ -327,6 +512,9 @@ class TestBatchProcessing:
patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
mock_query.return_value = Patents(patents=[patent])
# Simulate DB caching: after store, subsequent get returns the IDs
mock_db.get_cached_serp_query.side_effect = [None, ["US123"]]
def save_side_effect(p):
p.pdf_path = "patents/US123.pdf"
return p
+90
View File
@@ -1,7 +1,11 @@
"""Tests for SERP API patent retrieval and parsing functionality."""
import os
import pytest
from unittest.mock import patch, Mock
from datetime import datetime, timedelta
from SPARC.serp_api import SERP
from SPARC.types import Patent
class TestTextCleaning:
@@ -176,3 +180,89 @@ class TestPatentMinimization:
# Sections should be separated by double newlines
assert "\n\n" in result
class TestDynamicDateRange:
    """Test dynamic date range computation in SERP.query."""

    def test_query_uses_rolling_date_window(self, mocker):
        """Verify the date filter uses a rolling window, not hardcoded dates."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        SERP.query("TestCorp")

        tbs = mock_search.call_args[0][0]["tbs"]
        # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one
        assert "cdr:1,cd_min:" in tbs
        assert "10/28/2025" not in tbs  # old hardcoded date gone

    def test_query_respects_days_back_param(self, mocker):
        """Verify days_back parameter controls the start of the date window."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        # Sample the clock on both sides of the call: if midnight passes
        # in between, either day is a legitimate start date.
        before = datetime.now()
        SERP.query("TestCorp", days_back=30)
        after = datetime.now()

        tbs = mock_search.call_args[0][0]["tbs"]
        acceptable_starts = {
            (moment - timedelta(days=30)).strftime("%-m/%-d/%Y")
            for moment in (before, after)
        }
        # Anchor on "cd_min:" so the date cannot be satisfied by cd_max.
        assert any(f"cd_min:{start}," in tbs for start in acceptable_starts)
class TestFilesystemPDFCaching:
    """Test that save_patents skips download for existing files."""

    def test_save_patents_skips_download_when_cached(self, mocker):
        """Already-downloaded PDFs should not be re-downloaded."""
        mock_get = mocker.patch("SPARC.serp_api.requests.get")
        mocker.patch("SPARC.serp_api.os.makedirs")
        # The filesystem checks are mocked, so no real file is needed.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)

        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_not_called()
        assert result.pdf_path == "patents/US123.pdf"

    def test_save_patents_downloads_when_not_cached(self, mocker):
        """Missing PDFs should be downloaded and written to disk."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
        mock_open = mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US456", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_called_once_with("http://example.com/test.pdf")
        # The downloaded bytes must land in the expected file.
        mock_open.assert_called_once_with("patents/US456.pdf", "wb")
        mock_open.return_value.write.assert_called_once_with(b"%PDF-1.4 content")
        assert result.pdf_path == "patents/US456.pdf"

    def test_save_patents_redownloads_empty_files(self, mocker):
        """Empty/corrupt PDFs (0 bytes) should be re-downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
        mock_open = mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US789", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_called_once()
        # The empty file is overwritten with the fresh download.
        mock_open.return_value.write.assert_called_once_with(b"%PDF-1.4 content")
        assert result.pdf_path == "patents/US789.pdf"