Compare commits
25 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9c971dac72 | |||
| 6f0b448044 | |||
| 1a297eb60b | |||
| 3154f6b732 | |||
| b9bb3dc1cd | |||
| 90f9cfc826 | |||
| d387bbbdf3 | |||
| fa564e5e1e | |||
| 2815deb221 | |||
| 56e8287720 | |||
| b969423957 | |||
| 0dee4c5099 | |||
| 03105a2f87 | |||
| 28e2ded501 | |||
| f87572ab7e | |||
| 44b6c79713 | |||
| 13fe383116 | |||
| dee3cbefbd | |||
| 6acad4cff7 | |||
| 45ccd0b4e1 | |||
| d108d4c7ea | |||
| 068aecce61 | |||
| 8790abfbf7 | |||
| fe0c5ca280 | |||
| ed81ae4569 |
+82
-19
@@ -1,4 +1,4 @@
|
||||
name: Build and Push Docker Image
|
||||
name: Build and Push Docker Images
|
||||
|
||||
on:
|
||||
push:
|
||||
@@ -9,7 +9,7 @@ on:
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
build-api:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
@@ -20,43 +20,36 @@ jobs:
|
||||
- name: Checkout code
|
||||
shell: sh
|
||||
run: |
|
||||
git clone https://gitea.leeworks.dev/${{ gitea.repository }}.git .
|
||||
git clone http://gitea.gitea.svc.cluster.local/${{ gitea.repository }}.git .
|
||||
git checkout ${{ gitea.sha }}
|
||||
|
||||
- name: Determine image tags
|
||||
id: tags
|
||||
shell: sh
|
||||
run: |
|
||||
REGISTRY="gitea.leeworks.dev"
|
||||
REGISTRY="gitea.gitea.svc.cluster.local:80"
|
||||
REPO_OWNER="${{ gitea.repository_owner }}"
|
||||
REPO_NAME="${{ gitea.repository }}"
|
||||
|
||||
# Extract repository name without owner
|
||||
REPO_NAME_ONLY=$(echo "$REPO_NAME" | cut -d'/' -f2)
|
||||
|
||||
# Convert to lowercase for Docker registry compatibility
|
||||
REPO_OWNER_LOWER=$(echo "$REPO_OWNER" | tr '[:upper:]' '[:lower:]')
|
||||
REPO_NAME_LOWER=$(echo "$REPO_NAME_ONLY" | tr '[:upper:]' '[:lower:]')
|
||||
|
||||
# Base image path
|
||||
IMAGE_BASE="${REGISTRY}/${REPO_OWNER_LOWER}/${REPO_NAME_LOWER}"
|
||||
|
||||
# Determine tag based on ref
|
||||
case "${{ gitea.ref }}" in
|
||||
refs/tags/*)
|
||||
# Tag push - use the tag name
|
||||
TAG_NAME="${{ gitea.ref_name }}"
|
||||
echo "IMAGE_TAG=${IMAGE_BASE}:${TAG_NAME}" >> $GITHUB_OUTPUT
|
||||
echo "PUSH_LATEST=true" >> $GITHUB_OUTPUT
|
||||
;;
|
||||
refs/heads/main)
|
||||
# Main branch - tag with UTC timestamp + short commit SHA (7 chars), and also push latest
|
||||
TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
|
||||
SHORT_SHA=$(echo "${{ gitea.sha }}" | cut -c1-7)
|
||||
echo "IMAGE_TAG=${IMAGE_BASE}:${SHORT_SHA}" >> $GITHUB_OUTPUT
|
||||
echo "IMAGE_TAG=${IMAGE_BASE}:${TIMESTAMP}-${SHORT_SHA}" >> $GITHUB_OUTPUT
|
||||
echo "PUSH_LATEST=true" >> $GITHUB_OUTPUT
|
||||
;;
|
||||
*)
|
||||
# Other branches - use branch name
|
||||
BRANCH_TAG=$(echo "${{ gitea.ref_name }}" | sed 's/\//-/g')
|
||||
echo "IMAGE_TAG=${IMAGE_BASE}:${BRANCH_TAG}" >> $GITHUB_OUTPUT
|
||||
echo "PUSH_LATEST=false" >> $GITHUB_OUTPUT
|
||||
@@ -68,15 +61,15 @@ jobs:
|
||||
- name: Login to registry
|
||||
shell: sh
|
||||
run: |
|
||||
echo "${{ secrets.PERSONAL_TOKEN }}" | docker login gitea.leeworks.dev -u "${{ gitea.actor }}" --password-stdin
|
||||
echo "${{ secrets.PERSONAL_TOKEN }}" | docker login gitea.gitea.svc.cluster.local:80 -u "${{ gitea.actor }}" --password-stdin
|
||||
|
||||
- name: Build and push with Docker
|
||||
- name: Build and push API image
|
||||
shell: sh
|
||||
run: |
|
||||
echo "Building image..."
|
||||
echo "Building API image..."
|
||||
docker build -t ${{ steps.tags.outputs.IMAGE_TAG }} .
|
||||
|
||||
echo "Pushing image..."
|
||||
echo "Pushing API image..."
|
||||
docker push ${{ steps.tags.outputs.IMAGE_TAG }}
|
||||
|
||||
if [ "${{ steps.tags.outputs.PUSH_LATEST }}" = "true" ]; then
|
||||
@@ -85,5 +78,75 @@ jobs:
|
||||
docker push ${{ steps.tags.outputs.IMAGE_LATEST }}
|
||||
fi
|
||||
|
||||
echo "Build and push completed successfully!"
|
||||
echo "Image available at ${{ steps.tags.outputs.IMAGE_TAG }}"
|
||||
echo "API image available at ${{ steps.tags.outputs.IMAGE_TAG }}"
|
||||
|
||||
build-frontend:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
shell: sh
|
||||
run: |
|
||||
apk add --no-cache git docker-cli
|
||||
|
||||
- name: Checkout code
|
||||
shell: sh
|
||||
run: |
|
||||
git clone http://gitea.gitea.svc.cluster.local/${{ gitea.repository }}.git .
|
||||
git checkout ${{ gitea.sha }}
|
||||
|
||||
- name: Determine image tags
|
||||
id: tags
|
||||
shell: sh
|
||||
run: |
|
||||
REGISTRY="gitea.gitea.svc.cluster.local:80"
|
||||
REPO_OWNER="${{ gitea.repository_owner }}"
|
||||
REPO_NAME="${{ gitea.repository }}"
|
||||
|
||||
REPO_NAME_ONLY=$(echo "$REPO_NAME" | cut -d'/' -f2)
|
||||
REPO_OWNER_LOWER=$(echo "$REPO_OWNER" | tr '[:upper:]' '[:lower:]')
|
||||
REPO_NAME_LOWER=$(echo "$REPO_NAME_ONLY" | tr '[:upper:]' '[:lower:]')
|
||||
|
||||
IMAGE_BASE="${REGISTRY}/${REPO_OWNER_LOWER}/${REPO_NAME_LOWER}"
|
||||
|
||||
case "${{ gitea.ref }}" in
|
||||
refs/tags/*)
|
||||
TAG_NAME="${{ gitea.ref_name }}"
|
||||
echo "IMAGE_TAG=${IMAGE_BASE}:frontend-${TAG_NAME}" >> $GITHUB_OUTPUT
|
||||
echo "PUSH_LATEST=true" >> $GITHUB_OUTPUT
|
||||
;;
|
||||
refs/heads/main)
|
||||
TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
|
||||
SHORT_SHA=$(echo "${{ gitea.sha }}" | cut -c1-7)
|
||||
echo "IMAGE_TAG=${IMAGE_BASE}:frontend-${TIMESTAMP}-${SHORT_SHA}" >> $GITHUB_OUTPUT
|
||||
echo "PUSH_LATEST=true" >> $GITHUB_OUTPUT
|
||||
;;
|
||||
*)
|
||||
BRANCH_TAG=$(echo "${{ gitea.ref_name }}" | sed 's/\//-/g')
|
||||
echo "IMAGE_TAG=${IMAGE_BASE}:frontend-${BRANCH_TAG}" >> $GITHUB_OUTPUT
|
||||
echo "PUSH_LATEST=false" >> $GITHUB_OUTPUT
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "IMAGE_LATEST=${IMAGE_BASE}:frontend-latest" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Login to registry
|
||||
shell: sh
|
||||
run: |
|
||||
echo "${{ secrets.PERSONAL_TOKEN }}" | docker login gitea.gitea.svc.cluster.local:80 -u "${{ gitea.actor }}" --password-stdin
|
||||
|
||||
- name: Build and push frontend image
|
||||
shell: sh
|
||||
run: |
|
||||
echo "Building frontend image..."
|
||||
docker build -t ${{ steps.tags.outputs.IMAGE_TAG }} ./frontend
|
||||
|
||||
echo "Pushing frontend image..."
|
||||
docker push ${{ steps.tags.outputs.IMAGE_TAG }}
|
||||
|
||||
if [ "${{ steps.tags.outputs.PUSH_LATEST }}" = "true" ]; then
|
||||
echo "Tagging and pushing frontend-latest..."
|
||||
docker tag ${{ steps.tags.outputs.IMAGE_TAG }} ${{ steps.tags.outputs.IMAGE_LATEST }}
|
||||
docker push ${{ steps.tags.outputs.IMAGE_LATEST }}
|
||||
fi
|
||||
|
||||
echo "Frontend image available at ${{ steps.tags.outputs.IMAGE_TAG }}"
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
stages:
|
||||
- build
|
||||
|
||||
variables:
|
||||
DOCKER_DRIVER: overlay2
|
||||
DOCKER_TLS_CERTDIR: "/certs"
|
||||
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
|
||||
LATEST_TAG: $CI_REGISTRY_IMAGE:latest
|
||||
|
||||
build-and-push:
|
||||
stage: build
|
||||
image: docker:24-cli
|
||||
services:
|
||||
- docker:24-dind
|
||||
before_script:
|
||||
- echo "Logging into GitLab Container Registry..."
|
||||
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
|
||||
script:
|
||||
- echo "Building Docker image..."
|
||||
- docker build -t $IMAGE_TAG -t $LATEST_TAG .
|
||||
- echo "Pushing Docker image to registry..."
|
||||
- docker push $IMAGE_TAG
|
||||
- docker push $LATEST_TAG
|
||||
- echo "Build and push completed successfully!"
|
||||
- echo "Image available at $IMAGE_TAG"
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH == "main"
|
||||
when: always
|
||||
- if: $CI_COMMIT_TAG
|
||||
when: always
|
||||
- when: manual
|
||||
tags:
|
||||
- docker
|
||||
@@ -17,7 +17,7 @@ SPARC automatically collects, parses, and analyzes patents from companies to pro
|
||||
- **Portfolio Analysis**: Evaluates multiple patents holistically for comprehensive insights
|
||||
- **Batch Processing**: Analyze multiple companies concurrently with progress tracking
|
||||
- **REST API**: FastAPI web service with async job support
|
||||
- **Dashboard**: Interactive Streamlit visualization dashboard
|
||||
- **Dashboard**: React TypeScript web dashboard with authentication
|
||||
- **Robust Testing**: 40 tests covering all major functionality
|
||||
|
||||
## Architecture
|
||||
@@ -27,7 +27,9 @@ SPARC/
|
||||
├── serp_api.py # Patent retrieval and PDF parsing
|
||||
├── llm.py # Claude AI integration via OpenRouter
|
||||
├── analyzer.py # High-level orchestration
|
||||
├── api.py # FastAPI web service
|
||||
├── api.py # FastAPI web service with auth endpoints
|
||||
├── auth.py # JWT authentication module
|
||||
├── database.py # PostgreSQL storage with caching
|
||||
├── types.py # Data models
|
||||
└── config.py # Environment configuration
|
||||
```
|
||||
@@ -48,7 +50,7 @@ docker-compose up -d
|
||||
|
||||
# Access the services
|
||||
# - API: http://localhost:8000
|
||||
# - Dashboard: http://localhost:8501
|
||||
# - Dashboard: http://localhost:8080
|
||||
# - API Docs: http://localhost:8000/docs
|
||||
```
|
||||
|
||||
@@ -186,21 +188,22 @@ curl -X POST http://localhost:8000/analyze/batch/async \
|
||||
-d '{"companies": ["nvidia", "amd", "intel", "qualcomm"]}'
|
||||
```
|
||||
|
||||
### Visualization Dashboard
|
||||
### Web Dashboard
|
||||
|
||||
Launch the interactive Streamlit dashboard:
|
||||
The React dashboard is included in Docker Compose:
|
||||
|
||||
```bash
|
||||
streamlit run dashboard.py
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
Dashboard features:
|
||||
- **Authentication**: User registration, login, and JWT-based sessions
|
||||
- **Company Analysis**: Analyze individual companies with real-time results
|
||||
- **Batch Analysis**: Process multiple companies with progress tracking and charts
|
||||
- **Analytics**: View historical analysis data and trends (requires database mode)
|
||||
- **System Status**: Monitor database and analyzer health
|
||||
- **Batch Analysis**: Process multiple companies with progress tracking
|
||||
- **Analytics**: View historical analysis data and trends
|
||||
- **Admin Panel**: User management for administrators
|
||||
|
||||
The dashboard runs at `http://localhost:8501` by default.
|
||||
The dashboard runs at `http://localhost:8080` when using Docker Compose.
|
||||
|
||||
## Running Tests
|
||||
|
||||
@@ -280,4 +283,4 @@ For open source projects, say how it is licensed.
|
||||
|
||||
Core functionality complete. Ready for production use with API keys configured.
|
||||
|
||||
All major features implemented: REST API, Streamlit dashboard, Docker containerization, database storage, and multi-company batch processing.
|
||||
All major features implemented: REST API, React dashboard with authentication, Docker containerization, database storage with caching, and multi-company batch processing.
|
||||
|
||||
+91
-29
@@ -4,26 +4,33 @@ This module ties together patent retrieval, parsing, and LLM analysis
|
||||
to provide company performance estimation based on patent portfolios.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Callable
|
||||
|
||||
from SPARC import config
|
||||
from SPARC.database import DatabaseClient
|
||||
from SPARC.serp_api import SERP
|
||||
from SPARC.llm import LLMAnalyzer
|
||||
from SPARC.types import Patent, CompanyAnalysisResult, BatchAnalysisResult
|
||||
from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
|
||||
|
||||
|
||||
class CompanyAnalyzer:
|
||||
"""Orchestrates end-to-end company performance analysis via patents."""
|
||||
|
||||
def __init__(self, openrouter_api_key: str | None = None):
|
||||
def __init__(self, openrouter_api_key: str | None = None, db_client: DatabaseClient | None = None):
|
||||
"""Initialize the company analyzer.
|
||||
|
||||
Args:
|
||||
openrouter_api_key: Optional OpenRouter API key. If None, loads from config.
|
||||
db_client: Optional DatabaseClient for patent caching. Created automatically if None.
|
||||
"""
|
||||
self.llm_analyzer = LLMAnalyzer(api_key=openrouter_api_key)
|
||||
self.db = db_client or DatabaseClient(config.database_url)
|
||||
self.db.connect()
|
||||
self.db.initialize_schema()
|
||||
|
||||
def analyze_company(self, company_name: str) -> str:
|
||||
def analyze_company(self, company_name: str, patents: "Patents | None" = None) -> str:
|
||||
"""Analyze a company's performance based on their patent portfolio.
|
||||
|
||||
This is the main entry point that orchestrates the full pipeline:
|
||||
@@ -35,40 +42,52 @@ class CompanyAnalyzer:
|
||||
|
||||
Args:
|
||||
company_name: Name of the company to analyze
|
||||
patents: Optional pre-fetched Patents result to avoid duplicate API calls
|
||||
|
||||
Returns:
|
||||
Comprehensive analysis of company's innovation and performance outlook
|
||||
"""
|
||||
print(f"Retrieving patents for {company_name}...")
|
||||
patents = SERP.query(company_name)
|
||||
if patents is None:
|
||||
# Check SERP query cache first
|
||||
query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest()
|
||||
cached_ids = self.db.get_cached_serp_query(query_hash)
|
||||
if cached_ids is not None:
|
||||
print(f"Using cached SERP results for {company_name} ({len(cached_ids)} patents)")
|
||||
patents = Patents(patents=[
|
||||
Patent(patent_id=pid, pdf_link="")
|
||||
for pid in cached_ids
|
||||
])
|
||||
else:
|
||||
print(f"Retrieving patents for {company_name}...")
|
||||
patents = SERP.query(company_name)
|
||||
# Cache the SERP results
|
||||
if patents.patents:
|
||||
self.db.store_serp_query(
|
||||
company_name=company_name,
|
||||
query_hash=query_hash,
|
||||
patent_ids=[p.patent_id for p in patents.patents],
|
||||
)
|
||||
|
||||
if not patents.patents:
|
||||
return f"No patents found for {company_name}"
|
||||
|
||||
print(f"Found {len(patents.patents)} patents. Processing...")
|
||||
|
||||
# Download and parse each patent
|
||||
# Download, parse, and minimize patents in parallel
|
||||
processed_patents = []
|
||||
for idx, patent in enumerate(patents.patents, 1):
|
||||
print(f"Processing patent {idx}/{len(patents.patents)}: {patent.patent_id}")
|
||||
|
||||
try:
|
||||
# Download PDF
|
||||
patent = SERP.save_patents(patent)
|
||||
|
||||
# Parse sections from PDF
|
||||
sections = SERP.parse_patent_pdf(patent.pdf_path)
|
||||
|
||||
# Minimize for LLM (remove bloat)
|
||||
minimized_content = SERP.minimize_patent_for_llm(sections)
|
||||
|
||||
processed_patents.append(
|
||||
{"patent_id": patent.patent_id, "content": minimized_content}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to process {patent.patent_id}: {e}")
|
||||
continue
|
||||
with ThreadPoolExecutor(max_workers=config.patent_thread_workers) as executor:
|
||||
future_to_patent = {
|
||||
executor.submit(self._process_single_patent, patent, company_name, self.db): patent
|
||||
for patent in patents.patents
|
||||
}
|
||||
for future in as_completed(future_to_patent):
|
||||
patent = future_to_patent[future]
|
||||
try:
|
||||
result = future.result()
|
||||
if result:
|
||||
processed_patents.append(result)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to process {patent.patent_id}: {e}")
|
||||
|
||||
if not processed_patents:
|
||||
return f"Failed to process any patents for {company_name}"
|
||||
@@ -113,6 +132,46 @@ class CompanyAnalyzer:
|
||||
except Exception as e:
|
||||
return f"Failed to analyze patent {patent_id}: {e}"
|
||||
|
||||
@staticmethod
def _process_single_patent(
    patent: Patent,
    company_name: str = "",
    db: DatabaseClient | None = None,
) -> dict | None:
    """Download, parse, and minimize a single patent.

    Submitted to a thread pool by ``analyze_company``, so it only touches
    its own arguments. (NOTE(review): thread-safety also depends on
    ``SERP`` and ``DatabaseClient`` internals -- confirm.)

    The DB cache is treated as best-effort: a failed cache lookup falls
    through to full processing, and a failed cache write does not discard
    content that was already extracted successfully.

    Args:
        patent: Patent to process; ``patent_id`` is always read, and
            ``pdf_link`` is stored alongside the cached result.
        company_name: Company the patent belongs to, stored in the cache row.
        db: Optional database client used as a cache. Skipped when falsy.

    Returns:
        Dict with ``patent_id`` and minimized ``content``, or None when the
        patent could not be downloaded/parsed/minimized.
    """
    patent_id = patent.patent_id

    # Cache read: a DB hiccup must not prevent processing the patent.
    if db:
        try:
            cached = db.get_cached_patent(patent_id)
        except Exception as e:
            print(f"Warning: Cache lookup failed for {patent_id}: {e}")
            cached = None
        if cached and cached.get("minimized_content"):
            return {"patent_id": patent_id, "content": cached["minimized_content"]}

    # Full processing: download, parse, minimize.
    try:
        patent = SERP.save_patents(patent)
        sections = SERP.parse_patent_pdf(patent.pdf_path)
        minimized_content = SERP.minimize_patent_for_llm(sections)
    except Exception as e:
        print(f"Warning: Failed to process {patent_id}: {e}")
        return None

    # Cache write: losing the cache entry is acceptable, losing the result is not.
    if db:
        try:
            db.store_patent(
                patent_id=patent.patent_id,
                company_name=company_name,
                pdf_link=patent.pdf_link,
                raw_sections=sections,
                minimized_content=minimized_content,
            )
        except Exception as e:
            print(f"Warning: Failed to cache {patent.patent_id}: {e}")

    return {"patent_id": patent.patent_id, "content": minimized_content}
|
||||
|
||||
def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult:
|
||||
"""Internal wrapper that catches exceptions and returns structured result.
|
||||
|
||||
@@ -123,11 +182,14 @@ class CompanyAnalyzer:
|
||||
CompanyAnalysisResult with success/failure status
|
||||
"""
|
||||
try:
|
||||
patents = SERP.query(company_name)
|
||||
patent_count = len(patents.patents) if patents.patents else 0
|
||||
|
||||
# Delegate to analyze_company which handles SERP/patent caching
|
||||
analysis = self.analyze_company(company_name)
|
||||
|
||||
# Determine patent count from cached SERP query
|
||||
query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest()
|
||||
cached_ids = self.db.get_cached_serp_query(query_hash)
|
||||
patent_count = len(cached_ids) if cached_ids else 0
|
||||
|
||||
# Check if analysis indicates failure
|
||||
if analysis.startswith("No patents found") or analysis.startswith(
|
||||
"Failed to process"
|
||||
|
||||
@@ -161,6 +161,7 @@ app = FastAPI(
|
||||
description="Semiconductor Patent & Analytics Report Core - Patent portfolio analysis using AI",
|
||||
version="1.0.0",
|
||||
lifespan=lifespan,
|
||||
root_path=config.root_path,
|
||||
)
|
||||
|
||||
# Add CORS middleware for React frontend
|
||||
|
||||
@@ -25,3 +25,11 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes")
|
||||
# Legacy compatibility - USE_DATABASE is deprecated, database is always used
|
||||
# This variable is kept for backwards compatibility but has no effect
|
||||
use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")
|
||||
|
||||
# Patent search configuration
|
||||
patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90"))
|
||||
patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5"))
|
||||
|
||||
# Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
|
||||
# This ensures OpenAPI docs work correctly when accessed via the proxy
|
||||
root_path = os.getenv("ROOT_PATH", "")
|
||||
|
||||
+146
-4
@@ -1,9 +1,11 @@
|
||||
"""Database client for storing and retrieving LLM messages and user authentication."""
|
||||
|
||||
import contextlib
|
||||
import psycopg2
|
||||
from psycopg2.pool import ThreadedConnectionPool
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from typing import Dict, List, Optional
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
import hashlib
|
||||
import bcrypt
|
||||
@@ -12,24 +14,49 @@ import bcrypt
|
||||
class DatabaseClient:
|
||||
"""Handles database operations for message storage and retrieval."""
|
||||
|
||||
def __init__(self, database_url: str):
|
||||
def __init__(self, database_url: str, minconn: int = 2, maxconn: int = 10):
|
||||
"""Initialize the database client.
|
||||
|
||||
Args:
|
||||
database_url: PostgreSQL connection string
|
||||
minconn: Minimum connections in the pool
|
||||
maxconn: Maximum connections in the pool
|
||||
"""
|
||||
self.database_url = database_url
|
||||
self._pool: ThreadedConnectionPool | None = None
|
||||
self._minconn = minconn
|
||||
self._maxconn = maxconn
|
||||
# Legacy single connection kept for backwards compatibility
|
||||
self.conn = None
|
||||
|
||||
def _ensure_pool(self):
|
||||
"""Create the connection pool if it doesn't exist yet."""
|
||||
if self._pool is None or self._pool.closed:
|
||||
self._pool = ThreadedConnectionPool(
|
||||
self._minconn, self._maxconn, self.database_url
|
||||
)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def get_conn(self):
|
||||
"""Check out a connection from the pool. Returns it on exit."""
|
||||
self._ensure_pool()
|
||||
conn = self._pool.getconn()
|
||||
try:
|
||||
yield conn
|
||||
finally:
|
||||
self._pool.putconn(conn)
|
||||
|
||||
def connect(self):
|
||||
"""Establish database connection."""
|
||||
"""Establish database connection (legacy single-connection path)."""
|
||||
if not self.conn or self.conn.closed:
|
||||
self.conn = psycopg2.connect(self.database_url)
|
||||
|
||||
def close(self):
|
||||
"""Close database connection."""
|
||||
"""Close database connection and pool."""
|
||||
if self.conn and not self.conn.closed:
|
||||
self.conn.close()
|
||||
if self._pool and not self._pool.closed:
|
||||
self._pool.closeall()
|
||||
|
||||
def initialize_schema(self):
|
||||
"""Create database tables if they don't exist."""
|
||||
@@ -110,6 +137,40 @@ class DatabaseClient:
|
||||
ON users(email)
|
||||
""")
|
||||
|
||||
# Create patents cache table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS patents (
|
||||
patent_id VARCHAR(64) PRIMARY KEY,
|
||||
company_name VARCHAR(255),
|
||||
pdf_link TEXT,
|
||||
raw_sections JSONB,
|
||||
minimized_content TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
cursor.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_patents_company
|
||||
ON patents(company_name)
|
||||
""")
|
||||
|
||||
# Create SERP query cache table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS serp_queries (
|
||||
id SERIAL PRIMARY KEY,
|
||||
company_name VARCHAR(255),
|
||||
query_hash VARCHAR(64) UNIQUE,
|
||||
result_patent_ids TEXT[],
|
||||
expires_at TIMESTAMP NOT NULL,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
cursor.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_serp_queries_hash
|
||||
ON serp_queries(query_hash)
|
||||
""")
|
||||
|
||||
self.conn.commit()
|
||||
|
||||
@staticmethod
|
||||
@@ -320,6 +381,87 @@ class DatabaseClient:
|
||||
"period_days": days,
|
||||
}
|
||||
|
||||
# Patent Cache Methods
|
||||
|
||||
def get_cached_patent(self, patent_id: str) -> Optional[Dict]:
|
||||
"""Look up a cached patent by ID.
|
||||
|
||||
Returns:
|
||||
Dict with raw_sections and minimized_content, or None.
|
||||
"""
|
||||
with self.get_conn() as conn:
|
||||
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
|
||||
cursor.execute(
|
||||
"SELECT * FROM patents WHERE patent_id = %s",
|
||||
(patent_id,),
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
def store_patent(
|
||||
self,
|
||||
patent_id: str,
|
||||
company_name: str,
|
||||
pdf_link: str,
|
||||
raw_sections: Dict,
|
||||
minimized_content: str,
|
||||
) -> None:
|
||||
"""Store a processed patent in the cache."""
|
||||
with self.get_conn() as conn:
|
||||
with conn.cursor() as cursor:
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO patents (patent_id, company_name, pdf_link, raw_sections, minimized_content)
|
||||
VALUES (%s, %s, %s, %s, %s)
|
||||
ON CONFLICT (patent_id) DO UPDATE SET
|
||||
raw_sections = EXCLUDED.raw_sections,
|
||||
minimized_content = EXCLUDED.minimized_content
|
||||
""",
|
||||
(patent_id, company_name, pdf_link, json.dumps(raw_sections), minimized_content),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
def get_cached_serp_query(self, query_hash: str) -> Optional[List[str]]:
|
||||
"""Look up cached SERP query results.
|
||||
|
||||
Returns:
|
||||
List of patent IDs if cache hit and not expired, None otherwise.
|
||||
"""
|
||||
with self.get_conn() as conn:
|
||||
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT result_patent_ids FROM serp_queries
|
||||
WHERE query_hash = %s AND expires_at > NOW()
|
||||
""",
|
||||
(query_hash,),
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
return row["result_patent_ids"] if row else None
|
||||
|
||||
def store_serp_query(
    self,
    company_name: str,
    query_hash: str,
    patent_ids: List[str],
    ttl_hours: int = 24,
) -> None:
    """Store SERP query results in the cache.

    Inserts a row keyed by ``query_hash``; if the hash already exists the
    stored patent IDs and expiry are overwritten.

    The expiry is computed by the database (``NOW() + interval``) rather
    than from the application's clock, because ``get_cached_serp_query``
    checks ``expires_at > NOW()`` on the database side. Using one clock for
    both sides keeps the TTL correct when the app and DB differ in time
    zone or wall-clock time.

    Args:
        company_name: Company name the query was issued for.
        query_hash: Hash identifying the query (unique key in the table).
        patent_ids: Patent IDs returned by the query.
        ttl_hours: Hours until the cached entry expires.
    """
    # Passed as a timedelta so the driver sends it as a SQL interval.
    ttl = timedelta(hours=ttl_hours)
    with self.get_conn() as conn:
        with conn.cursor() as cursor:
            cursor.execute(
                """
                INSERT INTO serp_queries (company_name, query_hash, result_patent_ids, expires_at)
                VALUES (%s, %s, %s, NOW() + %s)
                ON CONFLICT (query_hash) DO UPDATE SET
                    result_patent_ids = EXCLUDED.result_patent_ids,
                    expires_at = EXCLUDED.expires_at
                """,
                (company_name, query_hash, patent_ids, ttl),
            )
        conn.commit()
|
||||
|
||||
# User Authentication Methods
|
||||
|
||||
@staticmethod
|
||||
|
||||
+22
-10
@@ -1,17 +1,20 @@
|
||||
import os
|
||||
import serpapi
|
||||
from SPARC import config
|
||||
import re
|
||||
import pdfplumber # pip install pdfplumber
|
||||
import requests
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict
|
||||
from SPARC.types import Patents, Patent
|
||||
|
||||
class SERP:
|
||||
def query(company: str) -> Patents:
|
||||
def query(company: str, days_back: int = None) -> Patents:
|
||||
"""Query Google Patents for a company's recent patents.
|
||||
|
||||
Args:
|
||||
company: Name of the company to search for
|
||||
days_back: Number of days to look back for patents (default from config)
|
||||
|
||||
Returns:
|
||||
Patents object containing list of patents with PDF links
|
||||
@@ -23,13 +26,19 @@ class SERP:
|
||||
patents with restricted access). The returned count may be lower
|
||||
than the requested number of results.
|
||||
"""
|
||||
if days_back is None:
|
||||
days_back = config.patent_search_days
|
||||
end_date = datetime.now()
|
||||
start_date = end_date - timedelta(days=days_back)
|
||||
date_filter = f"cdr:1,cd_min:{start_date.strftime('%-m/%-d/%Y')},cd_max:{end_date.strftime('%-m/%-d/%Y')}"
|
||||
|
||||
# Make API call
|
||||
params = {
|
||||
"engine": "google_patents",
|
||||
"q": company,
|
||||
"num": 10,
|
||||
"filter": 1,
|
||||
"tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025",
|
||||
"tbs": date_filter,
|
||||
"api_key": config.api_key,
|
||||
}
|
||||
search = serpapi.search(params)
|
||||
@@ -46,20 +55,23 @@ class SERP:
|
||||
|
||||
def save_patents(patent: Patent) -> Patent:
    """
    Save the patent PDF to the patents folder, skipping download if already cached.

    A non-empty file at ``patents/<patent_id>.pdf`` is treated as a cache
    hit. Because of that, the HTTP status is checked before anything is
    written: otherwise an error page would be stored once and then reused
    as if it were the PDF on every later call.

    Args:
        patent: Patent object

    Returns:
        Patent object with updated PDF path

    Raises:
        requests.RequestException: if the download fails, times out, or
            returns an HTTP error status.
    """
    pdf_path = f"patents/{patent.patent_id}.pdf"
    os.makedirs("patents", exist_ok=True)

    if not (os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0):
        # Timeout keeps a stalled download from blocking a worker forever.
        response = requests.get(patent.pdf_link, timeout=60)
        response.raise_for_status()
        with open(pdf_path, "wb") as f:
            f.write(response.content)

    patent.pdf_path = pdf_path
    return patent
|
||||
|
||||
def parse_patent_pdf(pdf_path: str) -> Dict:
|
||||
|
||||
@@ -38,6 +38,7 @@ services:
|
||||
DATABASE_URL: postgresql://postgres:postgres@postgres:5432/sparc
|
||||
USE_CACHE: "true"
|
||||
JWT_SECRET: ${JWT_SECRET:-sparc-secret-key-change-in-production}
|
||||
ROOT_PATH: /api
|
||||
ports:
|
||||
- "8000:8000"
|
||||
depends_on:
|
||||
|
||||
+58
-45
@@ -1,16 +1,19 @@
|
||||
# Database Mode for Testing and Analytics
|
||||
# Database Storage and Caching
|
||||
|
||||
This document explains how to use SPARC's database mode for storing LLM messages for testing and analytics purposes.
|
||||
This document explains how SPARC uses PostgreSQL for storing LLM messages, enabling response caching and analytics.
|
||||
|
||||
## Overview
|
||||
|
||||
SPARC supports two modes of operation:
|
||||
SPARC stores all LLM interactions in PostgreSQL, providing:
|
||||
|
||||
1. **API Mode** (default): Messages are sent to OpenRouter's API and you receive real LLM responses
|
||||
2. **Database Mode**: Messages are stored in a PostgreSQL database without making API calls, useful for:
|
||||
- Testing the application without consuming API credits
|
||||
- Collecting analytics on message patterns and usage
|
||||
- Development and debugging
|
||||
- **Response Caching**: Avoid redundant API calls for previously analyzed patents
|
||||
- **Analytics**: Track usage patterns, token consumption, and analysis history
|
||||
- **Persistence**: Maintain analysis history across sessions
|
||||
|
||||
SPARC supports two cache modes:
|
||||
|
||||
1. **Cache Mode** (default, `USE_CACHE=true`): Check database for cached responses before making API calls
|
||||
2. **Fresh Mode** (`USE_CACHE=false`): Always make fresh API calls (still stores results in database)
|
||||
|
||||
## Setup
|
||||
|
||||
@@ -45,43 +48,43 @@ cp .env.example .env
|
||||
Edit `.env` and set:
|
||||
|
||||
```env
|
||||
# For database mode (testing/analytics)
|
||||
USE_DATABASE=true
|
||||
# Database connection (required)
|
||||
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/sparc
|
||||
|
||||
# For API mode (production)
|
||||
USE_DATABASE=false
|
||||
# Cache mode: use cached responses when available
|
||||
USE_CACHE=true
|
||||
|
||||
# API key for fresh LLM calls
|
||||
OPENROUTER_API_KEY=your_openrouter_key_here
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Running in Database Mode
|
||||
### Running with Cache Mode (Default)
|
||||
|
||||
Set `USE_DATABASE=true` in your `.env` file, then run the application normally:
|
||||
Set `USE_CACHE=true` in your `.env` file, then run the application normally:
|
||||
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
|
||||
Instead of sending messages to OpenRouter, the application will:
|
||||
- Store all prompts in the database
|
||||
- Return a placeholder response
|
||||
- Log metadata (company name, analysis type, timestamps)
|
||||
The application will:
|
||||
- Check the database for cached responses matching the request
|
||||
- If found, return the cached response (no API call)
|
||||
- If not found, make an API call and store the response for future use
|
||||
|
||||
### Running in API Mode
|
||||
### Running with Fresh Mode
|
||||
|
||||
Set `USE_DATABASE=false` in your `.env` file, then run the application normally:
|
||||
Set `USE_CACHE=false` in your `.env` file to always get fresh responses:
|
||||
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
|
||||
The application will send messages to OpenRouter and return real LLM responses.
|
||||
|
||||
### Hybrid Mode (Optional)
|
||||
|
||||
You can also enable database logging while still using the API by initializing the database client in your code. The `LLMAnalyzer` will automatically log all API calls to the database if a database client is available.
|
||||
The application will:
|
||||
- Always send messages to OpenRouter for real LLM responses
|
||||
- Store all responses in the database
|
||||
- Useful when you need the latest analysis or want to refresh cached data
|
||||
|
||||
## Viewing Analytics
|
||||
|
||||
@@ -195,16 +198,16 @@ docker-compose down -v
|
||||
|
||||
## Toggling Between Modes
|
||||
|
||||
You can easily switch between modes by changing the `USE_DATABASE` environment variable:
|
||||
You can easily switch between modes by changing the `USE_CACHE` environment variable:
|
||||
|
||||
### Quick Toggle (temporary, for testing)
|
||||
### Quick Toggle (temporary)
|
||||
|
||||
```bash
|
||||
# Run in database mode
|
||||
USE_DATABASE=true python main.py
|
||||
# Run with caching enabled
|
||||
USE_CACHE=true python main.py
|
||||
|
||||
# Run in API mode
|
||||
USE_DATABASE=false python main.py
|
||||
# Run with fresh API calls
|
||||
USE_CACHE=false python main.py
|
||||
```
|
||||
|
||||
### Persistent Toggle
|
||||
@@ -212,38 +215,48 @@ USE_DATABASE=false python main.py
|
||||
Edit your `.env` file:
|
||||
|
||||
```env
|
||||
# For testing/analytics
|
||||
USE_DATABASE=true
|
||||
# Use cached responses when available (recommended for most use cases)
|
||||
USE_CACHE=true
|
||||
|
||||
# For production use
|
||||
USE_DATABASE=false
|
||||
# Always make fresh API calls
|
||||
USE_CACHE=false
|
||||
```
|
||||
|
||||
## Use Cases
|
||||
|
||||
### Testing Without API Costs
|
||||
### Cost Optimization with Caching
|
||||
|
||||
During development, enable database mode to test the full application flow without consuming API credits:
|
||||
Cache mode reduces API costs by reusing previous analysis results:
|
||||
|
||||
```bash
|
||||
USE_DATABASE=true python main.py
|
||||
USE_CACHE=true python main.py
|
||||
```
|
||||
|
||||
If the same company/patent combination was analyzed before, the cached response is returned instantly.
|
||||
|
||||
### Fresh Analysis
|
||||
|
||||
When you need the latest LLM analysis (e.g., after model updates):
|
||||
|
||||
```bash
|
||||
USE_CACHE=false python main.py
|
||||
```
|
||||
|
||||
### Collecting Usage Analytics
|
||||
|
||||
Enable database mode in a test environment to collect analytics on:
|
||||
The database stores all interactions, enabling analytics on:
|
||||
- Which companies are analyzed most frequently
|
||||
- Types of analyses performed
|
||||
- Prompt patterns and lengths
|
||||
- Usage over time
|
||||
- Token usage and costs over time
|
||||
- Response caching hit rates
|
||||
|
||||
### Development and Debugging
|
||||
|
||||
Database mode is useful for:
|
||||
- Testing patent parsing logic without API calls
|
||||
Database storage is useful for:
|
||||
- Reviewing actual prompts sent to the LLM
|
||||
- Analyzing response patterns
|
||||
- Debugging the full pipeline end-to-end
|
||||
- Collecting sample prompts for optimization
|
||||
- Understanding token usage patterns (when in API mode with logging)
|
||||
- Understanding token usage patterns
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
|
||||
+24
-22
@@ -64,7 +64,7 @@ docker-compose ps
|
||||
# You should see:
|
||||
# - sparc-postgres (healthy)
|
||||
# - sparc-api (running on port 8000)
|
||||
# - sparc-dashboard (running on port 8501)
|
||||
# - sparc-dashboard (running on port 8080)
|
||||
```
|
||||
|
||||
The database is automatically initialized by the `init-db` service.
|
||||
@@ -116,11 +116,13 @@ docker-compose up -d postgres
|
||||
# Wait for database to be healthy, then initialize
|
||||
python scripts/init_database.py
|
||||
|
||||
# Terminal 1: Start FastAPI backend
|
||||
# Start FastAPI backend
|
||||
uvicorn SPARC.api:app --host 0.0.0.0 --port 8000 --reload
|
||||
|
||||
# Terminal 2: Start Streamlit dashboard
|
||||
streamlit run dashboard.py --server.port 8501 --server.address 0.0.0.0
|
||||
# For the React frontend (separate terminal)
|
||||
cd frontend
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
|
||||
---
|
||||
@@ -141,7 +143,7 @@ Access the services:
|
||||
|---------|-----|
|
||||
| REST API | http://localhost:8000 |
|
||||
| API Documentation (Swagger) | http://localhost:8000/docs |
|
||||
| Dashboard (Web UI) | http://localhost:8501 |
|
||||
| Dashboard (Web UI) | http://localhost:8080 |
|
||||
|
||||
---
|
||||
|
||||
@@ -149,16 +151,17 @@ Access the services:
|
||||
|
||||
### Via Dashboard (Web UI)
|
||||
|
||||
1. Open http://localhost:8501
|
||||
2. Select **"Company Analysis"** from the sidebar
|
||||
3. Enter a company name (e.g., "Intel")
|
||||
4. Click **"Analyze"**
|
||||
1. Open http://localhost:8080
|
||||
2. Register a new account or log in (default admin: `admin` / `admin` — change this password after the first login)
|
||||
3. Navigate to **"Analysis"** from the sidebar
|
||||
4. Enter a company name (e.g., "Intel")
|
||||
5. Click **"Analyze"**
|
||||
|
||||
This will:
|
||||
- Query SerpAPI for recent patents
|
||||
- Download and parse patent PDFs
|
||||
- Send patent content to Claude for analysis
|
||||
- Store prompt/response in PostgreSQL
|
||||
- Store prompt/response in PostgreSQL (with caching)
|
||||
- Display results in the dashboard
|
||||
|
||||
### Via REST API
|
||||
@@ -233,12 +236,12 @@ docker exec -it sparc-postgres psql -U postgres -d sparc -c \
|
||||
|
||||
| Component | Purpose |
|
||||
|-----------|---------|
|
||||
| **Dashboard** | Streamlit web UI for interactive analysis |
|
||||
| **FastAPI** | REST API for programmatic access |
|
||||
| **Dashboard** | React TypeScript web UI with authentication |
|
||||
| **FastAPI** | REST API with JWT authentication |
|
||||
| **Analyzer** | Orchestrates patent retrieval and LLM analysis |
|
||||
| **SerpAPI** | Retrieves patent data from Google Patents |
|
||||
| **OpenRouter** | Routes requests to Claude for AI analysis |
|
||||
| **PostgreSQL** | Stores prompts, responses, and analytics |
|
||||
| **PostgreSQL** | Stores prompts, responses, users, and cached results |
|
||||
|
||||
---
|
||||
|
||||
@@ -248,10 +251,9 @@ docker exec -it sparc-postgres psql -U postgres -d sparc -c \
|
||||
|----------|----------|---------|-------------|
|
||||
| `API_KEY` | Yes | - | SerpAPI key for patent search |
|
||||
| `OPENROUTER_API_KEY` | Yes | - | OpenRouter API key for Claude access |
|
||||
| `DATABASE_URL` | Yes* | - | PostgreSQL connection string |
|
||||
| `USE_DATABASE` | No | `false` | Set to `true` to enable database storage |
|
||||
|
||||
*Required when `USE_DATABASE=true`
|
||||
| `DATABASE_URL` | Yes | - | PostgreSQL connection string |
|
||||
| `USE_CACHE` | No | `true` | Check database for cached responses before API calls |
|
||||
| `JWT_SECRET` | Yes | - | Secret key for JWT authentication (change in production!) |
|
||||
|
||||
### Database URL Format
|
||||
|
||||
@@ -273,9 +275,9 @@ The `docker-compose.yml` includes all services needed for production:
|
||||
| Service | Container | Port | Description |
|
||||
|---------|-----------|------|-------------|
|
||||
| `postgres` | sparc-postgres | 5432 | PostgreSQL database |
|
||||
| `init-db` | sparc-init-db | - | One-time database initialization |
|
||||
| `api` | sparc-api | 8000 | FastAPI REST API |
|
||||
| `dashboard` | sparc-dashboard | 8501 | Streamlit web UI |
|
||||
| `init-db` | sparc-init-db | - | One-time database initialization (seeds admin user) |
|
||||
| `api` | sparc-api | 8000 | FastAPI REST API with JWT auth |
|
||||
| `dashboard` | sparc-dashboard | 8080 | React TypeScript web UI |
|
||||
|
||||
### Common Docker Compose Commands
|
||||
|
||||
@@ -382,11 +384,11 @@ cp .env.example .env
|
||||
docker-compose up -d postgres
|
||||
python scripts/init_database.py
|
||||
uvicorn SPARC.api:app --reload &
|
||||
streamlit run dashboard.py
|
||||
cd frontend && npm install && npm run dev &
|
||||
|
||||
# Check status
|
||||
curl http://localhost:8000/health
|
||||
open http://localhost:8501
|
||||
open http://localhost:8080
|
||||
|
||||
# View data
|
||||
python scripts/view_analytics.py
|
||||
|
||||
+5
-2
@@ -21,8 +21,11 @@ FROM nginx:alpine
|
||||
# Copy built files
|
||||
COPY --from=build /app/dist /usr/share/nginx/html
|
||||
|
||||
# Copy nginx config
|
||||
COPY nginx.conf /etc/nginx/conf.d/default.conf
|
||||
# Copy nginx template (processed at startup with envsubst)
|
||||
COPY nginx.conf.template /etc/nginx/templates/default.conf.template
|
||||
|
||||
# Default API base URL (override with -e API_URL=...).
# No trailing slash here: the nginx template appends one itself
# (`proxy_pass ${API_URL}/;`), and a doubled slash would be forwarded
# to the backend as `//path`.
ENV API_URL=http://api:8000
|
||||
|
||||
EXPOSE 80
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ server {
|
||||
|
||||
# Proxy API requests to backend
|
||||
location /api/ {
|
||||
proxy_pass http://api:8000/;
|
||||
proxy_pass ${API_URL}/;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection 'upgrade';
|
||||
+191
-3
@@ -1,11 +1,22 @@
|
||||
"""Tests for the high-level company analyzer orchestration."""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch, call
|
||||
from unittest.mock import Mock, patch, call, MagicMock
|
||||
from SPARC.analyzer import CompanyAnalyzer
|
||||
from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_db(mocker):
    """Swap ``SPARC.analyzer.DatabaseClient`` for a mock in every test.

    The patched class hands back one shared ``MagicMock`` client, so no real
    DB connection is needed. Its ``get_cached_patent`` and
    ``get_cached_serp_query`` lookups return ``None`` by default; individual
    tests override those return values as needed.

    Returns:
        The ``MagicMock`` that stands in for the database client instance.
    """
    fake_client = MagicMock()
    for lookup in (fake_client.get_cached_patent, fake_client.get_cached_serp_query):
        lookup.return_value = None
    mocker.patch("SPARC.analyzer.DatabaseClient", return_value=fake_client)
    return fake_client
|
||||
|
||||
|
||||
class TestCompanyAnalyzer:
|
||||
"""Test the CompanyAnalyzer orchestration logic."""
|
||||
|
||||
@@ -17,7 +28,7 @@ class TestCompanyAnalyzer:
|
||||
|
||||
mock_llm.assert_called_once_with(api_key="test-key")
|
||||
|
||||
def test_analyze_company_full_pipeline(self, mocker):
|
||||
def test_analyze_company_full_pipeline(self, mocker, mock_db):
|
||||
"""Test complete company analysis pipeline."""
|
||||
# Mock all the dependencies
|
||||
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
|
||||
@@ -178,6 +189,180 @@ class TestCompanyAnalyzer:
|
||||
assert "PDF not found" in result
|
||||
|
||||
|
||||
class TestSingleQueryBugFix:
    """Check how often ``SERP.query`` is invoked during a company analysis."""

    @staticmethod
    def _fake_download(patent):
        """Stand in for ``SERP.save_patents``: set ``pdf_path`` and return the patent."""
        patent.pdf_path = "patents/US123.pdf"
        return patent

    @classmethod
    def _stub_pipeline(cls, mocker):
        """Patch the SERP helpers and ``LLMAnalyzer``; return the ``SERP.query`` mock."""
        query_stub = mocker.patch("SPARC.analyzer.SERP.query")
        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=cls._fake_download)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Content",
        )

        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)
        return query_stub

    def test_analyze_company_safe_calls_query_once(self, mocker, mock_db):
        """_analyze_company_safe should call SERP.query exactly once."""
        query_stub = self._stub_pipeline(mocker)
        query_stub.return_value = Patents(
            patents=[Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")]
        )

        CompanyAnalyzer()._analyze_company_safe("TestCorp")

        # The point of the test: one query per company, never a second one.
        query_stub.assert_called_once_with("TestCorp")

    def test_analyze_company_with_prefetched_patents_skips_query(self, mocker):
        """analyze_company should not call SERP.query when patents are provided."""
        query_stub = self._stub_pipeline(mocker)
        prefetched = Patents(
            patents=[Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")]
        )

        CompanyAnalyzer().analyze_company("TestCorp", patents=prefetched)

        # Patents were supplied by the caller, so no search may be issued.
        query_stub.assert_not_called()
|
||||
|
||||
|
||||
class TestPatentCaching:
    """Exercise patent-level and SERP-query DB caching in the pipeline."""

    @staticmethod
    def _fake_download(patent):
        """Stand in for ``SERP.save_patents``: set ``pdf_path`` and return the patent."""
        patent.pdf_path = "patents/US123.pdf"
        return patent

    @staticmethod
    def _stub_llm(mocker):
        """Patch ``LLMAnalyzer`` so portfolio analysis returns ``"Analysis"``."""
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)

    def test_process_single_patent_uses_db_cache(self, mocker, mock_db):
        """_process_single_patent returns cached content when available."""
        download_stub = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mock_db.get_cached_patent.return_value = {
            "patent_id": "US123",
            "minimized_content": "Cached minimized content",
        }

        outcome = CompanyAnalyzer._process_single_patent(
            Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            "TestCorp",
            mock_db,
        )

        assert outcome == {"patent_id": "US123", "content": "Cached minimized content"}
        # A cache hit must not trigger a download.
        download_stub.assert_not_called()

    def test_process_single_patent_stores_to_db_cache(self, mocker, mock_db):
        """_process_single_patent stores result in DB after processing."""
        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=self._fake_download)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test abstract"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Minimized content",
        )
        # Force the cache-miss path explicitly.
        mock_db.get_cached_patent.return_value = None

        outcome = CompanyAnalyzer._process_single_patent(
            Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            "TestCorp",
            mock_db,
        )

        assert outcome == {"patent_id": "US123", "content": "Minimized content"}
        mock_db.store_patent.assert_called_once_with(
            patent_id="US123",
            company_name="TestCorp",
            pdf_link="http://example.com/test.pdf",
            raw_sections={"abstract": "Test abstract"},
            minimized_content="Minimized content",
        )

    def test_serp_query_cache_hit_skips_api(self, mocker, mock_db):
        """When SERP query is cached, API call is skipped."""
        query_stub = mocker.patch("SPARC.analyzer.SERP.query")
        download_stub = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        self._stub_llm(mocker)

        # Both cache layers report a hit: the SERP query and the patent itself.
        mock_db.get_cached_serp_query.return_value = ["US123"]
        mock_db.get_cached_patent.return_value = {
            "patent_id": "US123",
            "minimized_content": "Cached content",
        }

        outcome = CompanyAnalyzer().analyze_company("TestCorp")

        assert outcome == "Analysis"
        # Neither the search API nor the PDF download may be touched.
        query_stub.assert_not_called()
        download_stub.assert_not_called()

    def test_serp_query_cache_miss_stores_result(self, mocker, mock_db):
        """When SERP query cache misses, result is stored after API call."""
        query_stub = mocker.patch("SPARC.analyzer.SERP.query")
        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=self._fake_download)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Content",
        )
        self._stub_llm(mocker)

        mock_db.get_cached_serp_query.return_value = None
        query_stub.return_value = Patents(
            patents=[Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")]
        )

        CompanyAnalyzer().analyze_company("TestCorp")

        mock_db.store_serp_query.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _, stored_kwargs = mock_db.store_serp_query.call_args
        assert stored_kwargs["company_name"] == "TestCorp"
        assert stored_kwargs["patent_ids"] == ["US123"]
|
||||
|
||||
|
||||
class TestBatchProcessing:
|
||||
"""Test multi-company batch processing functionality."""
|
||||
|
||||
@@ -316,7 +501,7 @@ class TestBatchProcessing:
|
||||
|
||||
assert callback.call_count == 2
|
||||
|
||||
def test_company_analysis_result_structure(self, mocker):
|
||||
def test_company_analysis_result_structure(self, mocker, mock_db):
|
||||
"""Test CompanyAnalysisResult has correct structure."""
|
||||
mock_query = mocker.patch("SPARC.analyzer.SERP.query")
|
||||
mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
|
||||
@@ -327,6 +512,9 @@ class TestBatchProcessing:
|
||||
patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
|
||||
mock_query.return_value = Patents(patents=[patent])
|
||||
|
||||
# Simulate DB caching: after store, subsequent get returns the IDs
|
||||
mock_db.get_cached_serp_query.side_effect = [None, ["US123"]]
|
||||
|
||||
def save_side_effect(p):
|
||||
p.pdf_path = "patents/US123.pdf"
|
||||
return p
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
"""Tests for SERP API patent retrieval and parsing functionality."""
|
||||
|
||||
import os
|
||||
import pytest
|
||||
from unittest.mock import patch, Mock
|
||||
from datetime import datetime, timedelta
|
||||
from SPARC.serp_api import SERP
|
||||
from SPARC.types import Patent
|
||||
|
||||
|
||||
class TestTextCleaning:
|
||||
@@ -176,3 +180,89 @@ class TestPatentMinimization:
|
||||
|
||||
# Sections should be separated by double newlines
|
||||
assert "\n\n" in result
|
||||
|
||||
|
||||
class TestDynamicDateRange:
    """Test dynamic date range computation in SERP.query."""

    # Fixed start date the date filter is expected to no longer contain.
    LEGACY_START = "10/28/2025"

    @staticmethod
    def _window_starts(before, after, days_back):
        """Return the acceptable ``M/D/YYYY`` window-start strings.

        Args:
            before: ``datetime`` sampled just before ``SERP.query`` ran.
            after: ``datetime`` sampled just after ``SERP.query`` returned.
            days_back: Size of the rolling window in days.

        Returns:
            A set with the formatted start date for each sample. It holds two
            entries only when the call straddled midnight.

        The fields are formatted by hand instead of ``strftime("%-m/%-d/%Y")``
        because the ``%-m`` / ``%-d`` flags are a platform extension and are
        rejected on some platforms (e.g. Windows).
        """
        window = timedelta(days=days_back)
        return {
            f"{start.month}/{start.day}/{start.year}"
            for start in (before - window, after - window)
        }

    def test_query_uses_rolling_date_window(self, mocker):
        """Verify the date filter uses a rolling window, not hardcoded dates."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        before = datetime.now()
        SERP.query("TestCorp")
        after = datetime.now()

        tbs = mock_search.call_args[0][0]["tbs"]
        # Should contain "cdr:1,cd_min:" followed by a date.
        assert "cdr:1,cd_min:" in tbs
        # The old hardcoded date must be gone. Skip the check on the one day
        # when the rolling window itself starts on that date, where it would
        # fail spuriously.
        # NOTE(review): presumes the default window is the patched
        # config.patent_search_days (90) — confirm against SERP.query.
        if self.LEGACY_START not in self._window_starts(before, after, 90):
            assert self.LEGACY_START not in tbs

    def test_query_respects_days_back_param(self, mocker):
        """Verify days_back parameter controls the date window."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        # Sample the clock on both sides of the call so a midnight rollover
        # between sampling and querying cannot fail the test.
        before = datetime.now()
        SERP.query("TestCorp", days_back=30)
        after = datetime.now()

        tbs = mock_search.call_args[0][0]["tbs"]
        expected_starts = self._window_starts(before, after, 30)
        assert any(start in tbs for start in expected_starts)
|
||||
|
||||
|
||||
class TestFilesystemPDFCaching:
    """Test that save_patents skips download for existing files."""

    def test_save_patents_skips_download_when_cached(self, mocker):
        """Already-downloaded PDFs should not be re-downloaded."""
        mock_get = mocker.patch("SPARC.serp_api.requests.get")
        mocker.patch("SPARC.serp_api.os.makedirs")

        # The filesystem is simulated entirely through these two patches: the
        # target file "exists" and is non-empty. No real file is created,
        # because one would never be consulted once os.path is mocked.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)

        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_not_called()
        assert result.pdf_path == "patents/US123.pdf"

    def test_save_patents_downloads_when_not_cached(self, mocker):
        """Missing PDFs should be downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
        # Keep the test from writing a real file to disk.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US456", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        mock_get.assert_called_once_with("http://example.com/test.pdf")
        assert result.pdf_path == "patents/US456.pdf"

    def test_save_patents_redownloads_empty_files(self, mocker):
        """Empty/corrupt PDFs (0 bytes) should be re-downloaded."""
        mock_response = Mock()
        mock_response.content = b"%PDF-1.4 content"
        mock_get = mocker.patch("SPARC.serp_api.requests.get", return_value=mock_response)
        mocker.patch("SPARC.serp_api.os.makedirs")
        # The file is present but zero bytes long.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
        # Keep the test from writing a real file to disk.
        mocker.patch("builtins.open", mocker.mock_open())

        patent = Patent(patent_id="US789", pdf_link="http://example.com/test.pdf")
        result = SERP.save_patents(patent)

        # Same strictness as the not-cached test: exactly one fetch of the PDF link.
        mock_get.assert_called_once_with("http://example.com/test.pdf")
        assert result.pdf_path == "patents/US789.pdf"
|
||||
|
||||
Reference in New Issue
Block a user