fix(analyzer): route _analyze_company_safe through cache-aware path

_analyze_company_safe was calling SERP.query directly, bypassing the SERP query cache in analyze_company. Now delegates fully to analyze_company() and reads patent_count from the serp_queries cache.
test(analyzer,serp): add tests for caching, single query, and parallel processing
2026-03-24 15:02:19 -04:00 · 2026-03-24 14:39:09 -04:00 · 2026-03-24 14:35:24 -04:00 · 2026-03-24 14:34:33 -04:00 · 2026-03-24 14:32:23 -04:00 · 2026-03-24 14:31:43 -04:00
16 changed files with 734 additions and 179 deletions
@@ -1,4 +1,4 @@
-name: Build and Push Docker Image
+name: Build and Push Docker Images
 on:
  push:
@@ -9,7 +9,7 @@ on:
  workflow_dispatch:
 jobs:
-  build-and-push:
+  build-api:
    runs-on: ubuntu-latest
    steps:
      - name: Install dependencies
@@ -20,43 +20,36 @@ jobs:
      - name: Checkout code
        shell: sh
        run: |
-          git clone https://gitea.leeworks.dev/${{ gitea.repository }}.git .
+          git clone http://gitea.gitea.svc.cluster.local/${{ gitea.repository }}.git .
          git checkout ${{ gitea.sha }}
      - name: Determine image tags
        id: tags
        shell: sh
        run: |
-          REGISTRY="gitea.leeworks.dev"
+          REGISTRY="gitea.gitea.svc.cluster.local:80"
          REPO_OWNER="${{ gitea.repository_owner }}"
          REPO_NAME="${{ gitea.repository }}"
          # Extract repository name without owner
          REPO_NAME_ONLY=$(echo "$REPO_NAME" | cut -d'/' -f2)
          # Convert to lowercase for Docker registry compatibility
          REPO_OWNER_LOWER=$(echo "$REPO_OWNER" | tr '[:upper:]' '[:lower:]')
          REPO_NAME_LOWER=$(echo "$REPO_NAME_ONLY" | tr '[:upper:]' '[:lower:]')
          # Base image path
          IMAGE_BASE="${REGISTRY}/${REPO_OWNER_LOWER}/${REPO_NAME_LOWER}"
          # Determine tag based on ref
          case "${{ gitea.ref }}" in
            refs/tags/*)
              # Tag push - use the tag name
              TAG_NAME="${{ gitea.ref_name }}"
              echo "IMAGE_TAG=${IMAGE_BASE}:${TAG_NAME}" >> $GITHUB_OUTPUT
              echo "PUSH_LATEST=true" >> $GITHUB_OUTPUT
              ;;
            refs/heads/main)
-              # Main branch - use commit SHA (shortened to 7 chars) and latest
+              TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
              SHORT_SHA=$(echo "${{ gitea.sha }}" | cut -c1-7)
-              echo "IMAGE_TAG=${IMAGE_BASE}:${SHORT_SHA}" >> $GITHUB_OUTPUT
+              echo "IMAGE_TAG=${IMAGE_BASE}:${TIMESTAMP}-${SHORT_SHA}" >> $GITHUB_OUTPUT
              echo "PUSH_LATEST=true" >> $GITHUB_OUTPUT
              ;;
            *)
              # Other branches - use branch name
              BRANCH_TAG=$(echo "${{ gitea.ref_name }}" | sed 's/\//-/g')
              echo "IMAGE_TAG=${IMAGE_BASE}:${BRANCH_TAG}" >> $GITHUB_OUTPUT
              echo "PUSH_LATEST=false" >> $GITHUB_OUTPUT
@@ -68,15 +61,15 @@ jobs:
      - name: Login to registry
        shell: sh
        run: |
-          echo "${{ secrets.PERSONAL_TOKEN }}" | docker login gitea.leeworks.dev -u "${{ gitea.actor }}" --password-stdin
+          echo "${{ secrets.PERSONAL_TOKEN }}" | docker login gitea.gitea.svc.cluster.local:80 -u "${{ gitea.actor }}" --password-stdin
-      - name: Build and push with Docker
+      - name: Build and push API image
        shell: sh
        run: |
-          echo "Building image..."
+          echo "Building API image..."
          docker build -t ${{ steps.tags.outputs.IMAGE_TAG }} .
-          echo "Pushing image..."
+          echo "Pushing API image..."
          docker push ${{ steps.tags.outputs.IMAGE_TAG }}
          if [ "${{ steps.tags.outputs.PUSH_LATEST }}" = "true" ]; then
@@ -85,5 +78,75 @@ jobs:
            docker push ${{ steps.tags.outputs.IMAGE_LATEST }}
          fi
-          echo "Build and push completed successfully!"
+          echo "API image available at ${{ steps.tags.outputs.IMAGE_TAG }}"
-          echo "Image available at ${{ steps.tags.outputs.IMAGE_TAG }}"
+
  build-frontend:
    runs-on: ubuntu-latest
    steps:
      - name: Install dependencies
        shell: sh
        run: |
          apk add --no-cache git docker-cli
      - name: Checkout code
        shell: sh
        run: |
          git clone http://gitea.gitea.svc.cluster.local/${{ gitea.repository }}.git .
          git checkout ${{ gitea.sha }}
      - name: Determine image tags
        id: tags
        shell: sh
        run: |
          REGISTRY="gitea.gitea.svc.cluster.local:80"
          REPO_OWNER="${{ gitea.repository_owner }}"
          REPO_NAME="${{ gitea.repository }}"
          REPO_NAME_ONLY=$(echo "$REPO_NAME" | cut -d'/' -f2)
          REPO_OWNER_LOWER=$(echo "$REPO_OWNER" | tr '[:upper:]' '[:lower:]')
          REPO_NAME_LOWER=$(echo "$REPO_NAME_ONLY" | tr '[:upper:]' '[:lower:]')
          IMAGE_BASE="${REGISTRY}/${REPO_OWNER_LOWER}/${REPO_NAME_LOWER}"
          case "${{ gitea.ref }}" in
            refs/tags/*)
              TAG_NAME="${{ gitea.ref_name }}"
              echo "IMAGE_TAG=${IMAGE_BASE}:frontend-${TAG_NAME}" >> $GITHUB_OUTPUT
              echo "PUSH_LATEST=true" >> $GITHUB_OUTPUT
              ;;
            refs/heads/main)
              TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
              SHORT_SHA=$(echo "${{ gitea.sha }}" | cut -c1-7)
              echo "IMAGE_TAG=${IMAGE_BASE}:frontend-${TIMESTAMP}-${SHORT_SHA}" >> $GITHUB_OUTPUT
              echo "PUSH_LATEST=true" >> $GITHUB_OUTPUT
              ;;
            *)
              BRANCH_TAG=$(echo "${{ gitea.ref_name }}" | sed 's/\//-/g')
              echo "IMAGE_TAG=${IMAGE_BASE}:frontend-${BRANCH_TAG}" >> $GITHUB_OUTPUT
              echo "PUSH_LATEST=false" >> $GITHUB_OUTPUT
              ;;
          esac
          echo "IMAGE_LATEST=${IMAGE_BASE}:frontend-latest" >> $GITHUB_OUTPUT
      - name: Login to registry
        shell: sh
        run: |
          echo "${{ secrets.PERSONAL_TOKEN }}" | docker login gitea.gitea.svc.cluster.local:80 -u "${{ gitea.actor }}" --password-stdin
      - name: Build and push frontend image
        shell: sh
        run: |
          echo "Building frontend image..."
          docker build -t ${{ steps.tags.outputs.IMAGE_TAG }} ./frontend
          echo "Pushing frontend image..."
          docker push ${{ steps.tags.outputs.IMAGE_TAG }}
          if [ "${{ steps.tags.outputs.PUSH_LATEST }}" = "true" ]; then
            echo "Tagging and pushing frontend-latest..."
            docker tag ${{ steps.tags.outputs.IMAGE_TAG }} ${{ steps.tags.outputs.IMAGE_LATEST }}
            docker push ${{ steps.tags.outputs.IMAGE_LATEST }}
          fi
          echo "Frontend image available at ${{ steps.tags.outputs.IMAGE_TAG }}"
@@ -1,33 +0,0 @@
 stages:
  - build
 variables:
  DOCKER_DRIVER: overlay2
  DOCKER_TLS_CERTDIR: "/certs"
  IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
  LATEST_TAG: $CI_REGISTRY_IMAGE:latest
 build-and-push:
  stage: build
  image: docker:24-cli
  services:
    - docker:24-dind
  before_script:
    - echo "Logging into GitLab Container Registry..."
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
  script:
    - echo "Building Docker image..."
    - docker build -t $IMAGE_TAG -t $LATEST_TAG .
    - echo "Pushing Docker image to registry..."
    - docker push $IMAGE_TAG
    - docker push $LATEST_TAG
    - echo "Build and push completed successfully!"
    - echo "Image available at $IMAGE_TAG"
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
      when: always
    - if: $CI_COMMIT_TAG
      when: always
    - when: manual
  tags:
    - docker
@@ -17,7 +17,7 @@ SPARC automatically collects, parses, and analyzes patents from companies to pro
 - **Portfolio Analysis**: Evaluates multiple patents holistically for comprehensive insights
 - **Batch Processing**: Analyze multiple companies concurrently with progress tracking
 - **REST API**: FastAPI web service with async job support
- **Dashboard**: Interactive Streamlit visualization dashboard
+- **Dashboard**: React TypeScript web dashboard with authentication
 - **Robust Testing**: 40 tests covering all major functionality
 ## Architecture
@@ -27,7 +27,9 @@ SPARC/
 ├── serp_api.py       # Patent retrieval and PDF parsing
 ├── llm.py            # Claude AI integration via OpenRouter
 ├── analyzer.py       # High-level orchestration
-├── api.py            # FastAPI web service
+├── api.py            # FastAPI web service with auth endpoints
 ├── auth.py           # JWT authentication module
 ├── database.py       # PostgreSQL storage with caching
 ├── types.py          # Data models
 └── config.py         # Environment configuration
 ```
@@ -48,7 +50,7 @@ docker-compose up -d
 # Access the services
 # - API: http://localhost:8000
-# - Dashboard: http://localhost:8501
+# - Dashboard: http://localhost:8080
 # - API Docs: http://localhost:8000/docs
 ```
@@ -186,21 +188,22 @@ curl -X POST http://localhost:8000/analyze/batch/async \
  -d '{"companies": ["nvidia", "amd", "intel", "qualcomm"]}'
 ```
-### Visualization Dashboard
+### Web Dashboard
-Launch the interactive Streamlit dashboard:
+The React dashboard is included in Docker Compose:
 ```bash
-streamlit run dashboard.py
+docker-compose up -d
 ```
 Dashboard features:
 - **Authentication**: User registration, login, and JWT-based sessions
 - **Company Analysis**: Analyze individual companies with real-time results
- **Batch Analysis**: Process multiple companies with progress tracking and charts
+- **Batch Analysis**: Process multiple companies with progress tracking
- **Analytics**: View historical analysis data and trends (requires database mode)
+- **Analytics**: View historical analysis data and trends
- **System Status**: Monitor database and analyzer health
+- **Admin Panel**: User management for administrators
-The dashboard runs at `http://localhost:8501` by default.
+The dashboard runs at `http://localhost:8080` when using Docker Compose.
 ## Running Tests
@@ -280,4 +283,4 @@ For open source projects, say how it is licensed.
 Core functionality complete. Ready for production use with API keys configured.
-All major features implemented: REST API, Streamlit dashboard, Docker containerization, database storage, and multi-company batch processing.
+All major features implemented: REST API, React dashboard with authentication, Docker containerization, database storage with caching, and multi-company batch processing.
@@ -4,26 +4,33 @@ This module ties together patent retrieval, parsing, and LLM analysis
 to provide company performance estimation based on patent portfolios.
 """
 import hashlib
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Callable
 from SPARC import config
 from SPARC.database import DatabaseClient
 from SPARC.serp_api import SERP
 from SPARC.llm import LLMAnalyzer
-from SPARC.types import Patent, CompanyAnalysisResult, BatchAnalysisResult
+from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
 class CompanyAnalyzer:
    """Orchestrates end-to-end company performance analysis via patents."""
-    def __init__(self, openrouter_api_key: str | None = None):
+    def __init__(self, openrouter_api_key: str | None = None, db_client: DatabaseClient | None = None):
        """Initialize the company analyzer.
        Args:
          openrouter_api_key: Optional OpenRouter API key. If None, loads from config.
          db_client: Optional DatabaseClient for patent caching. Created automatically if None.
        """
        self.llm_analyzer = LLMAnalyzer(api_key=openrouter_api_key)
        self.db = db_client or DatabaseClient(config.database_url)
        self.db.connect()
        self.db.initialize_schema()
-    def analyze_company(self, company_name: str) -> str:
+    def analyze_company(self, company_name: str, patents: "Patents | None" = None) -> str:
        """Analyze a company's performance based on their patent portfolio.
        This is the main entry point that orchestrates the full pipeline:
@@ -35,40 +42,52 @@ class CompanyAnalyzer:
        Args:
          company_name: Name of the company to analyze
          patents: Optional pre-fetched Patents result to avoid duplicate API calls
        Returns:
          Comprehensive analysis of company's innovation and performance outlook
        """
-        print(f"Retrieving patents for {company_name}...")
+        if patents is None:
-        patents = SERP.query(company_name)
+            # Check SERP query cache first
            query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest()
            cached_ids = self.db.get_cached_serp_query(query_hash)
            if cached_ids is not None:
                print(f"Using cached SERP results for {company_name} ({len(cached_ids)} patents)")
                patents = Patents(patents=[
                    Patent(patent_id=pid, pdf_link="")
                    for pid in cached_ids
                ])
            else:
                print(f"Retrieving patents for {company_name}...")
                patents = SERP.query(company_name)
                # Cache the SERP results
                if patents.patents:
                    self.db.store_serp_query(
                        company_name=company_name,
                        query_hash=query_hash,
                        patent_ids=[p.patent_id for p in patents.patents],
                    )
        if not patents.patents:
            return f"No patents found for {company_name}"
        print(f"Found {len(patents.patents)} patents. Processing...")
-        # Download and parse each patent
+        # Download, parse, and minimize patents in parallel
        processed_patents = []
-        for idx, patent in enumerate(patents.patents, 1):
+        with ThreadPoolExecutor(max_workers=config.patent_thread_workers) as executor:
-            print(f"Processing patent {idx}/{len(patents.patents)}: {patent.patent_id}")
+            future_to_patent = {
-
+                executor.submit(self._process_single_patent, patent, company_name, self.db): patent
-            try:
+                for patent in patents.patents
-                # Download PDF
+            }
-                patent = SERP.save_patents(patent)
+            for future in as_completed(future_to_patent):
-
+                patent = future_to_patent[future]
-                # Parse sections from PDF
+                try:
-                sections = SERP.parse_patent_pdf(patent.pdf_path)
+                    result = future.result()
-
+                    if result:
-                # Minimize for LLM (remove bloat)
+                        processed_patents.append(result)
-                minimized_content = SERP.minimize_patent_for_llm(sections)
+                except Exception as e:
-
+                    print(f"Warning: Failed to process {patent.patent_id}: {e}")
                processed_patents.append(
                    {"patent_id": patent.patent_id, "content": minimized_content}
                )
            except Exception as e:
                print(f"Warning: Failed to process {patent.patent_id}: {e}")
                continue
        if not processed_patents:
            return f"Failed to process any patents for {company_name}"
@@ -113,6 +132,46 @@ class CompanyAnalyzer:
        except Exception as e:
            return f"Failed to analyze patent {patent_id}: {e}"
    @staticmethod
    def _process_single_patent(
        patent: Patent,
        company_name: str = "",
        db: DatabaseClient | None = None,
    ) -> dict | None:
        """Download, parse, and minimize a single patent.

        Keeps no state of its own, so it can be submitted to worker threads.
        NOTE(review): this still relies on ``db`` and the ``SERP`` helpers
        being safe to call concurrently - confirm.

        The DB cache is treated as best-effort: a failed lookup falls through
        to full processing, and a failed store does not discard content that
        was already processed successfully.

        Args:
            patent: Patent to process; ``patent_id`` is read here and the
                object is handed to ``SERP.save_patents``.
            company_name: Company name recorded alongside the cached patent.
            db: Optional database client used as a patent cache. When falsy,
                caching is skipped entirely.

        Returns:
            Dict with ``patent_id`` and minimized ``content``, or None when
            the download/parse/minimize step fails.
        """
        # Cache lookup: a broken cache must not prevent processing.
        if db:
            try:
                cached = db.get_cached_patent(patent.patent_id)
            except Exception as e:
                print(f"Warning: Patent cache lookup failed for {patent.patent_id}: {e}")
                cached = None
            if cached and cached.get("minimized_content"):
                return {"patent_id": patent.patent_id, "content": cached["minimized_content"]}

        # Full processing: download, parse, minimize
        try:
            patent = SERP.save_patents(patent)
            sections = SERP.parse_patent_pdf(patent.pdf_path)
            minimized_content = SERP.minimize_patent_for_llm(sections)
        except Exception as e:
            print(f"Warning: Failed to process {patent.patent_id}: {e}")
            return None

        # Cache store: losing the cache write should not lose the result.
        if db:
            try:
                db.store_patent(
                    patent_id=patent.patent_id,
                    company_name=company_name,
                    pdf_link=patent.pdf_link,
                    raw_sections=sections,
                    minimized_content=minimized_content,
                )
            except Exception as e:
                print(f"Warning: Failed to cache {patent.patent_id}: {e}")

        return {"patent_id": patent.patent_id, "content": minimized_content}
    def _analyze_company_safe(self, company_name: str) -> CompanyAnalysisResult:
        """Internal wrapper that catches exceptions and returns structured result.
@@ -123,11 +182,14 @@ class CompanyAnalyzer:
            CompanyAnalysisResult with success/failure status
        """
        try:
-            patents = SERP.query(company_name)
+            # Delegate to analyze_company which handles SERP/patent caching
            patent_count = len(patents.patents) if patents.patents else 0
            analysis = self.analyze_company(company_name)
            # Determine patent count from cached SERP query
            query_hash = hashlib.sha256(company_name.lower().encode()).hexdigest()
            cached_ids = self.db.get_cached_serp_query(query_hash)
            patent_count = len(cached_ids) if cached_ids else 0
            # Check if analysis indicates failure
            if analysis.startswith("No patents found") or analysis.startswith(
                "Failed to process"
@@ -161,6 +161,7 @@ app = FastAPI(
    description="Semiconductor Patent & Analytics Report Core - Patent portfolio analysis using AI",
    version="1.0.0",
    lifespan=lifespan,
    root_path=config.root_path,
 )
 # Add CORS middleware for React frontend
@@ -25,3 +25,11 @@ use_cache = os.getenv("USE_CACHE", "true").lower() in ("true", "1", "yes")
 # Legacy compatibility - USE_DATABASE is deprecated, database is always used
 # This variable is kept for backwards compatibility but has no effect
 use_database = os.getenv("USE_DATABASE", "false").lower() in ("true", "1", "yes")
 # Patent search configuration
 patent_search_days = int(os.getenv("PATENT_SEARCH_DAYS", "90"))
 patent_thread_workers = int(os.getenv("PATENT_THREAD_WORKERS", "5"))
 # Root path for running behind a reverse proxy (e.g., "/api" when served at /api/)
 # This ensures OpenAPI docs work correctly when accessed via the proxy
 root_path = os.getenv("ROOT_PATH", "")
@@ -1,9 +1,11 @@
 """Database client for storing and retrieving LLM messages and user authentication."""
 import contextlib
 import psycopg2
 from psycopg2.pool import ThreadedConnectionPool
 from psycopg2.extras import RealDictCursor
 from typing import Dict, List, Optional
-from datetime import datetime
+from datetime import datetime, timedelta
 import json
 import hashlib
 import bcrypt
@@ -12,24 +14,49 @@ import bcrypt
 class DatabaseClient:
    """Handles database operations for message storage and retrieval."""
-    def __init__(self, database_url: str):
+    def __init__(self, database_url: str, minconn: int = 2, maxconn: int = 10):
        """Initialize the database client.
        Args:
            database_url: PostgreSQL connection string
            minconn: Minimum connections in the pool
            maxconn: Maximum connections in the pool
        """
        self.database_url = database_url
        self._pool: ThreadedConnectionPool | None = None
        self._minconn = minconn
        self._maxconn = maxconn
        # Legacy single connection kept for backwards compatibility
        self.conn = None
    def _ensure_pool(self):
        """Create the connection pool if it doesn't exist yet."""
        if self._pool is None or self._pool.closed:
            self._pool = ThreadedConnectionPool(
                self._minconn, self._maxconn, self.database_url
            )
    @contextlib.contextmanager
    def get_conn(self):
        """Check out a connection from the pool. Returns it on exit."""
        self._ensure_pool()
        conn = self._pool.getconn()
        try:
            yield conn
        finally:
            self._pool.putconn(conn)
    def connect(self):
-        """Establish database connection."""
+        """Establish database connection (legacy single-connection path)."""
        if not self.conn or self.conn.closed:
            self.conn = psycopg2.connect(self.database_url)
    def close(self):
-        """Close database connection."""
+        """Close database connection and pool."""
        if self.conn and not self.conn.closed:
            self.conn.close()
        if self._pool and not self._pool.closed:
            self._pool.closeall()
    def initialize_schema(self):
        """Create database tables if they don't exist."""
@@ -110,6 +137,40 @@ class DatabaseClient:
                ON users(email)
            """)
            # Create patents cache table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS patents (
                    patent_id VARCHAR(64) PRIMARY KEY,
                    company_name VARCHAR(255),
                    pdf_link TEXT,
                    raw_sections JSONB,
                    minimized_content TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_patents_company
                ON patents(company_name)
            """)
            # Create SERP query cache table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS serp_queries (
                    id SERIAL PRIMARY KEY,
                    company_name VARCHAR(255),
                    query_hash VARCHAR(64) UNIQUE,
                    result_patent_ids TEXT[],
                    expires_at TIMESTAMP NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_serp_queries_hash
                ON serp_queries(query_hash)
            """)
            self.conn.commit()
    @staticmethod
@@ -320,6 +381,87 @@ class DatabaseClient:
                "period_days": days,
            }
    # Patent Cache Methods
    def get_cached_patent(self, patent_id: str) -> Optional[Dict]:
        """Fetch the cached row for one patent from the ``patents`` table.

        Args:
            patent_id: Value matched against the ``patent_id`` column.

        Returns:
            All columns of the matching row as a plain dict, or None when no
            row matches.
        """
        with self.get_conn() as conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(
                "SELECT * FROM patents WHERE patent_id = %s",
                (patent_id,),
            )
            record = cur.fetchone()
            if not record:
                return None
            return dict(record)
    def store_patent(
        self,
        patent_id: str,
        company_name: str,
        pdf_link: str,
        raw_sections: Dict,
        minimized_content: str,
    ) -> None:
        """Store a processed patent in the cache."""
        with self.get_conn() as conn:
            with conn.cursor() as cursor:
                cursor.execute(
                    """
                    INSERT INTO patents (patent_id, company_name, pdf_link, raw_sections, minimized_content)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (patent_id) DO UPDATE SET
                        raw_sections = EXCLUDED.raw_sections,
                        minimized_content = EXCLUDED.minimized_content
                    """,
                    (patent_id, company_name, pdf_link, json.dumps(raw_sections), minimized_content),
                )
            conn.commit()
    def get_cached_serp_query(self, query_hash: str) -> Optional[List[str]]:
        """Return the cached patent IDs for a SERP query, if still valid.

        Args:
            query_hash: Value matched against ``serp_queries.query_hash``.

        Returns:
            The ``result_patent_ids`` of the matching row whose ``expires_at``
            is later than the database's ``NOW()``; None when there is no
            such row.
        """
        with self.get_conn() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute(
                    """
                    SELECT result_patent_ids FROM serp_queries
                    WHERE query_hash = %s AND expires_at > NOW()
                    """,
                    (query_hash,),
                )
                hit = cur.fetchone()
                if not hit:
                    return None
                return hit["result_patent_ids"]
    def store_serp_query(
        self,
        company_name: str,
        query_hash: str,
        patent_ids: List[str],
        ttl_hours: int = 24,
    ) -> None:
        """Store SERP query results in the cache."""
        expires_at = datetime.now() + timedelta(hours=ttl_hours)
        with self.get_conn() as conn:
            with conn.cursor() as cursor:
                cursor.execute(
                    """
                    INSERT INTO serp_queries (company_name, query_hash, result_patent_ids, expires_at)
                    VALUES (%s, %s, %s, %s)
                    ON CONFLICT (query_hash) DO UPDATE SET
                        result_patent_ids = EXCLUDED.result_patent_ids,
                        expires_at = EXCLUDED.expires_at
                    """,
                    (company_name, query_hash, patent_ids, expires_at),
                )
            conn.commit()
    # User Authentication Methods
    @staticmethod
@@ -1,17 +1,20 @@
 import os
 import serpapi
 from SPARC import config
 import re
 import pdfplumber  # pip install pdfplumber
 import requests
 from datetime import datetime, timedelta
 from typing import Dict
 from SPARC.types import Patents, Patent
 class SERP:
-  def query(company: str) -> Patents:
+  def query(company: str, days_back: int = None) -> Patents:
    """Query Google Patents for a company's recent patents.
    Args:
        company: Name of the company to search for
        days_back: Number of days to look back for patents (default from config)
    Returns:
        Patents object containing list of patents with PDF links
@@ -23,13 +26,19 @@ class SERP:
        patents with restricted access). The returned count may be lower
        than the requested number of results.
    """
    if days_back is None:
        days_back = config.patent_search_days
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    date_filter = f"cdr:1,cd_min:{start_date.strftime('%-m/%-d/%Y')},cd_max:{end_date.strftime('%-m/%-d/%Y')}"
    # Make API call
    params = {
      "engine": "google_patents",
      "q": company,
      "num": 10,
      "filter": 1,
-      "tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025",
+      "tbs": date_filter,
      "api_key": config.api_key,
    }
    search = serpapi.search(params)
@@ -46,7 +55,7 @@ class SERP:
  def save_patents(patent: Patent) -> Patent:
    """
-    Save the patent PDF to the patents folder
+    Save the patent PDF to the patents folder, skipping download if already cached.
    Args:
      patent: Patent object
@@ -54,12 +63,15 @@ class SERP:
    Returns:
      Patent object with updated PDF path
    """
-    response = requests.get(patent.pdf_link)
+    pdf_path = f"patents/{patent.patent_id}.pdf"
-    print(patent.pdf_link)
+    os.makedirs("patents", exist_ok=True)
    with open(f"patents/{patent.patent_id}.pdf", "wb") as f:
      f.write(response.content)
-    patent.pdf_path = f"patents/{patent.patent_id}.pdf"
+    if not (os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0):
      response = requests.get(patent.pdf_link)
      with open(pdf_path, "wb") as f:
        f.write(response.content)
    patent.pdf_path = pdf_path
    return patent
  def parse_patent_pdf(pdf_path: str) -> Dict:
@@ -38,6 +38,7 @@ services:
      DATABASE_URL: postgresql://postgres:postgres@postgres:5432/sparc
      USE_CACHE: "true"
      JWT_SECRET: ${JWT_SECRET:-sparc-secret-key-change-in-production}
      ROOT_PATH: /api
    ports:
      - "8000:8000"
    depends_on:
@@ -1,16 +1,19 @@
-# Database Mode for Testing and Analytics
+# Database Storage and Caching
-This document explains how to use SPARC's database mode for storing LLM messages for testing and analytics purposes.
+This document explains how SPARC uses PostgreSQL for storing LLM messages, enabling response caching and analytics.
 ## Overview
-SPARC supports two modes of operation:
+SPARC stores all LLM interactions in PostgreSQL, providing:
-1. **API Mode** (default): Messages are sent to OpenRouter's API and you receive real LLM responses
+- **Response Caching**: Avoid redundant API calls for previously analyzed patents
-2. **Database Mode**: Messages are stored in a PostgreSQL database without making API calls, useful for:
+- **Analytics**: Track usage patterns, token consumption, and analysis history
-   - Testing the application without consuming API credits
+- **Persistence**: Maintain analysis history across sessions
-   - Collecting analytics on message patterns and usage
+
-   - Development and debugging
+SPARC supports two cache modes:
 1. **Cache Mode** (default, `USE_CACHE=true`): Check database for cached responses before making API calls
 2. **Fresh Mode** (`USE_CACHE=false`): Always make fresh API calls (still stores results in database)
 ## Setup
@@ -45,43 +48,43 @@ cp .env.example .env
 Edit `.env` and set:
 ```env
-# For database mode (testing/analytics)
+# Database connection (required)
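 # NOTE: USE_DATABASE is deprecated and has no effect (the database is always used); kept only for backwards compatibility
 USE_DATABASE=true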
 DATABASE_URL=postgresql://postgres:postgres@localhost:5432/sparc
-# For API mode (production)
+# Cache mode: use cached responses when available
-USE_DATABASE=false
+USE_CACHE=true
 # API key for fresh LLM calls
 OPENROUTER_API_KEY=your_openrouter_key_here
 ```
 ## Usage
-### Running in Database Mode
+### Running with Cache Mode (Default)
-Set `USE_DATABASE=true` in your `.env` file, then run the application normally:
+Set `USE_CACHE=true` in your `.env` file, then run the application normally:
 ```bash
 python main.py
 ```
-Instead of sending messages to OpenRouter, the application will:
+The application will:
- Store all prompts in the database
+- Check the database for cached responses matching the request
- Return a placeholder response
+- If found, return the cached response (no API call)
- Log metadata (company name, analysis type, timestamps)
+- If not found, make an API call and store the response for future use
-### Running in API Mode
+### Running with Fresh Mode
-Set `USE_DATABASE=false` in your `.env` file, then run the application normally:
+Set `USE_CACHE=false` in your `.env` file to always get fresh responses:
 ```bash
 python main.py
 ```
-The application will send messages to OpenRouter and return real LLM responses.
+The application will:
-
+- Always send messages to OpenRouter for real LLM responses
-### Hybrid Mode (Optional)
+- Store all responses in the database
-
+- Useful when you need the latest analysis or want to refresh cached data
 You can also enable database logging while still using the API by initializing the database client in your code. The `LLMAnalyzer` will automatically log all API calls to the database if a database client is available.
 ## Viewing Analytics
@@ -195,16 +198,16 @@ docker-compose down -v
 ## Toggling Between Modes
-You can easily switch between modes by changing the `USE_DATABASE` environment variable:
+You can easily switch between modes by changing the `USE_CACHE` environment variable:
-### Quick Toggle (temporary, for testing)
+### Quick Toggle (temporary)
 ```bash
-# Run in database mode
+# Run with caching enabled
-USE_DATABASE=true python main.py
+USE_CACHE=true python main.py
-# Run in API mode
+# Run with fresh API calls
-USE_DATABASE=false python main.py
+USE_CACHE=false python main.py
 ```
 ### Persistent Toggle
@@ -212,38 +215,48 @@ USE_DATABASE=false python main.py
 Edit your `.env` file:
 ```env
-# For testing/analytics
+# Use cached responses when available (recommended for most use)
-USE_DATABASE=true
+USE_CACHE=true
-# For production use
+# Always make fresh API calls
-USE_DATABASE=false
+USE_CACHE=false
 ```
 ## Use Cases
-### Testing Without API Costs
+### Cost Optimization with Caching
-During development, enable database mode to test the full application flow without consuming API credits:
+Cache mode reduces API costs by reusing previous analysis results:
 ```bash
-USE_DATABASE=true python main.py
+USE_CACHE=true python main.py
 ```
 If the same company/patent combination was analyzed before, the cached response is returned instantly.
 ### Fresh Analysis
 When you need the latest LLM analysis (e.g., after model updates):
 ```bash
 USE_CACHE=false python main.py
 ```
 ### Collecting Usage Analytics
-Enable database mode in a test environment to collect analytics on:
+The database stores all interactions, enabling analytics on:
 - Which companies are analyzed most frequently
 - Types of analyses performed
- Prompt patterns and lengths
+- Token usage and costs over time
- Usage over time
+- Response caching hit rates
 ### Development and Debugging
-Database mode is useful for:
+Database storage is useful for:
- Testing patent parsing logic without API calls
+- Reviewing actual prompts sent to the LLM
 - Analyzing response patterns
 - Debugging the full pipeline end-to-end
- Collecting sample prompts for optimization
+- Understanding token usage patterns
 - Understanding token usage patterns (when in API mode with logging)
 ## Troubleshooting
@@ -64,7 +64,7 @@ docker-compose ps
 # You should see:
 # - sparc-postgres (healthy)
 # - sparc-api (running on port 8000)
-# - sparc-dashboard (running on port 8501)
+# - sparc-dashboard (running on port 8080)
 ```
 The database is automatically initialized by the `init-db` service.
@@ -116,11 +116,13 @@ docker-compose up -d postgres
 # Wait for database to be healthy, then initialize
 python scripts/init_database.py
-# Terminal 1: Start FastAPI backend
+# Start FastAPI backend
 uvicorn SPARC.api:app --host 0.0.0.0 --port 8000 --reload
-# Terminal 2: Start Streamlit dashboard
+# For the React frontend (separate terminal)
-streamlit run dashboard.py --server.port 8501 --server.address 0.0.0.0
+cd frontend
 npm install
 npm run dev
 ```
 ---
@@ -141,7 +143,7 @@ Access the services:
 |---------|-----|
 | REST API | http://localhost:8000 |
 | API Documentation (Swagger) | http://localhost:8000/docs |
-| Dashboard (Web UI) | http://localhost:8501 |
+| Dashboard (Web UI) | http://localhost:8080 |
 ---
@@ -149,16 +151,17 @@ Access the services:
 ### Via Dashboard (Web UI)
-1. Open http://localhost:8501
+1. Open http://localhost:8080
-2. Select **"Company Analysis"** from the sidebar
+2. Register a new account or log in (default admin: `admin` / `admin`)
-3. Enter a company name (e.g., "Intel")
+3. Navigate to **"Analysis"** from the sidebar
-4. Click **"Analyze"**
+4. Enter a company name (e.g., "Intel")
 5. Click **"Analyze"**
 This will:
 - Query SerpAPI for recent patents
 - Download and parse patent PDFs
 - Send patent content to Claude for analysis
- Store prompt/response in PostgreSQL
+- Store prompt/response in PostgreSQL (with caching)
 - Display results in the dashboard
 ### Via REST API
@@ -233,12 +236,12 @@ docker exec -it sparc-postgres psql -U postgres -d sparc -c \
 | Component | Purpose |
 |-----------|---------|
-| **Dashboard** | Streamlit web UI for interactive analysis |
+| **Dashboard** | React TypeScript web UI with authentication |
-| **FastAPI** | REST API for programmatic access |
+| **FastAPI** | REST API with JWT authentication |
 | **Analyzer** | Orchestrates patent retrieval and LLM analysis |
 | **SerpAPI** | Retrieves patent data from Google Patents |
 | **OpenRouter** | Routes requests to Claude for AI analysis |
-| **PostgreSQL** | Stores prompts, responses, and analytics |
+| **PostgreSQL** | Stores prompts, responses, users, and cached results |
 ---
@@ -248,10 +251,9 @@ docker exec -it sparc-postgres psql -U postgres -d sparc -c \
 |----------|----------|---------|-------------|
 | `API_KEY` | Yes | - | SerpAPI key for patent search |
 | `OPENROUTER_API_KEY` | Yes | - | OpenRouter API key for Claude access |
-| `DATABASE_URL` | Yes* | - | PostgreSQL connection string |
+| `DATABASE_URL` | Yes | - | PostgreSQL connection string |
-| `USE_DATABASE` | No | `false` | Set to `true` to enable database storage |
+| `USE_CACHE` | No | `true` | Check database for cached responses before API calls |
-
+| `JWT_SECRET` | Yes | - | Secret key for JWT authentication (change in production!) |
 **Note:** `DATABASE_URL` is always required; `USE_CACHE` only controls whether cached responses are reused before making API calls.
 ### Database URL Format
@@ -273,9 +275,9 @@ The `docker-compose.yml` includes all services needed for production:
 | Service | Container | Port | Description |
 |---------|-----------|------|-------------|
 | `postgres` | sparc-postgres | 5432 | PostgreSQL database |
-| `init-db` | sparc-init-db | - | One-time database initialization |
+| `init-db` | sparc-init-db | - | One-time database initialization (seeds admin user) |
-| `api` | sparc-api | 8000 | FastAPI REST API |
+| `api` | sparc-api | 8000 | FastAPI REST API with JWT auth |
-| `dashboard` | sparc-dashboard | 8501 | Streamlit web UI |
+| `dashboard` | sparc-dashboard | 8080 | React TypeScript web UI |
 ### Common Docker Compose Commands
@@ -382,11 +384,11 @@ cp .env.example .env
 docker-compose up -d postgres
 python scripts/init_database.py
 uvicorn SPARC.api:app --reload &
-streamlit run dashboard.py
+cd frontend && npm install && npm run dev &
 # Check status
 curl http://localhost:8000/health
-open http://localhost:8501
+open http://localhost:8080
 # View data
 python scripts/view_analytics.py
@@ -21,8 +21,11 @@ FROM nginx:alpine
 # Copy built files
 COPY --from=build /app/dist /usr/share/nginx/html
-# Copy nginx config
+# Copy nginx template (processed at startup with envsubst)
-COPY nginx.conf /etc/nginx/conf.d/default.conf
+COPY nginx.conf.template /etc/nginx/templates/default.conf.template
 # Default API URL (override with -e API_URL=...).
 # No trailing slash here: nginx.conf.template appends "/" itself
 # (proxy_pass ${API_URL}/;), so a trailing slash would render
 # "http://api:8000//" and forward /api/foo to the backend as //foo.
 ENV API_URL=http://api:8000
 EXPOSE 80
@@ -15,7 +15,7 @@ server {
    # Proxy API requests to backend
    location /api/ {
-        proxy_pass http://api:8000/;
+        proxy_pass ${API_URL}/;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
@@ -1,11 +1,22 @@
 """Tests for the high-level company analyzer orchestration."""
 import pytest
-from unittest.mock import Mock, patch, call
+from unittest.mock import Mock, patch, call, MagicMock
 from SPARC.analyzer import CompanyAnalyzer
 from SPARC.types import Patent, Patents, CompanyAnalysisResult, BatchAnalysisResult
@pytest.fixture(autouse=True)
def mock_db(mocker):
    """Swap ``SPARC.analyzer.DatabaseClient`` for a stub in every test.

    The fixture is autouse, so no test in this module opens a real database
    connection. The returned stub is the object the patched class hands out
    when instantiated; tests can request ``mock_db`` to tweak its behaviour.
    """
    db_stub = MagicMock()
    # Both cache lookups report a miss unless a test overrides them.
    db_stub.get_cached_patent.return_value = None
    db_stub.get_cached_serp_query.return_value = None
    mocker.patch("SPARC.analyzer.DatabaseClient", return_value=db_stub)
    return db_stub
 class TestCompanyAnalyzer:
    """Test the CompanyAnalyzer orchestration logic."""
@@ -17,7 +28,7 @@ class TestCompanyAnalyzer:
        mock_llm.assert_called_once_with(api_key="test-key")
-    def test_analyze_company_full_pipeline(self, mocker):
+    def test_analyze_company_full_pipeline(self, mocker, mock_db):
        """Test complete company analysis pipeline."""
        # Mock all the dependencies
        mock_query = mocker.patch("SPARC.analyzer.SERP.query")
@@ -178,6 +189,180 @@ class TestCompanyAnalyzer:
        assert "PDF not found" in result
 class TestSingleQueryBugFix:
    """Guard against SERP.query being invoked more than necessary."""

    def test_analyze_company_safe_calls_query_once(self, mocker, mock_db):
        """_analyze_company_safe must hit SERP.query exactly one time."""

        def fake_download(item):
            # Stand-in for SERP.save_patents: set the local path, return the patent.
            item.pdf_path = "patents/US123.pdf"
            return item

        search_hits = Patents(
            patents=[Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")]
        )
        query_spy = mocker.patch("SPARC.analyzer.SERP.query", return_value=search_hits)
        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=fake_download)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf", return_value={"abstract": "Test"}
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm", return_value="Content"
        )
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)

        CompanyAnalyzer()._analyze_company_safe("TestCorp")

        # One call only: a second call would mean the double-query bug is back.
        query_spy.assert_called_once_with("TestCorp")

    def test_analyze_company_with_prefetched_patents_skips_query(self, mocker):
        """Supplying patents up front means SERP.query is never consulted."""

        def fake_download(item):
            # Stand-in for SERP.save_patents: set the local path, return the patent.
            item.pdf_path = "patents/US123.pdf"
            return item

        query_spy = mocker.patch("SPARC.analyzer.SERP.query")
        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=fake_download)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf", return_value={"abstract": "Test"}
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm", return_value="Content"
        )
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)

        already_fetched = Patents(
            patents=[Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")]
        )
        CompanyAnalyzer().analyze_company("TestCorp", patents=already_fetched)

        # The caller already did the search, so no further query is allowed.
        query_spy.assert_not_called()
 class TestPatentCaching:
    """Database caching of individual patents and of SERP query results."""

    def test_process_single_patent_uses_db_cache(self, mocker, mock_db):
        """A cached patent row is returned directly and nothing is downloaded."""
        download_spy = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mock_db.get_cached_patent.return_value = dict(
            patent_id="US123",
            minimized_content="Cached minimized content",
        )

        outcome = CompanyAnalyzer._process_single_patent(
            Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            "TestCorp",
            mock_db,
        )

        assert outcome == {"patent_id": "US123", "content": "Cached minimized content"}
        # A cache hit must short-circuit before any PDF download.
        download_spy.assert_not_called()

    def test_process_single_patent_stores_to_db_cache(self, mocker, mock_db):
        """On a cache miss the freshly processed patent is written to the DB."""

        def fake_download(item):
            # Stand-in for SERP.save_patents: set the local path, return the patent.
            item.pdf_path = "patents/US123.pdf"
            return item

        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=fake_download)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf",
            return_value={"abstract": "Test abstract"},
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm",
            return_value="Minimized content",
        )
        # Force the miss explicitly so the store path is exercised.
        mock_db.get_cached_patent.return_value = None

        outcome = CompanyAnalyzer._process_single_patent(
            Patent(patent_id="US123", pdf_link="http://example.com/test.pdf"),
            "TestCorp",
            mock_db,
        )

        assert outcome == {"patent_id": "US123", "content": "Minimized content"}
        mock_db.store_patent.assert_called_once_with(
            patent_id="US123",
            company_name="TestCorp",
            pdf_link="http://example.com/test.pdf",
            raw_sections={"abstract": "Test abstract"},
            minimized_content="Minimized content",
        )

    def test_serp_query_cache_hit_skips_api(self, mocker, mock_db):
        """With both caches warm, neither the SERP API nor a download is used."""
        query_spy = mocker.patch("SPARC.analyzer.SERP.query")
        download_spy = mocker.patch("SPARC.analyzer.SERP.save_patents")
        mocker.patch("SPARC.analyzer.SERP.parse_patent_pdf")
        mocker.patch("SPARC.analyzer.SERP.minimize_patent_for_llm")
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)

        # SERP query cache already knows the patent IDs for this company...
        mock_db.get_cached_serp_query.return_value = ["US123"]
        # ...and the patent cache already holds the minimized text.
        mock_db.get_cached_patent.return_value = dict(
            patent_id="US123",
            minimized_content="Cached content",
        )

        outcome = CompanyAnalyzer().analyze_company("TestCorp")

        assert outcome == "Analysis"
        query_spy.assert_not_called()
        download_spy.assert_not_called()

    def test_serp_query_cache_miss_stores_result(self, mocker, mock_db):
        """After a SERP cache miss the API result is persisted for next time."""

        def fake_download(item):
            # Stand-in for SERP.save_patents: set the local path, return the patent.
            item.pdf_path = "patents/US123.pdf"
            return item

        search_hits = Patents(
            patents=[Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")]
        )
        mocker.patch("SPARC.analyzer.SERP.query", return_value=search_hits)
        mocker.patch("SPARC.analyzer.SERP.save_patents", side_effect=fake_download)
        mocker.patch(
            "SPARC.analyzer.SERP.parse_patent_pdf", return_value={"abstract": "Test"}
        )
        mocker.patch(
            "SPARC.analyzer.SERP.minimize_patent_for_llm", return_value="Content"
        )
        llm_stub = Mock()
        llm_stub.analyze_patent_portfolio.return_value = "Analysis"
        mocker.patch("SPARC.analyzer.LLMAnalyzer", return_value=llm_stub)
        mock_db.get_cached_serp_query.return_value = None

        CompanyAnalyzer().analyze_company("TestCorp")

        mock_db.store_serp_query.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _, stored = mock_db.store_serp_query.call_args
        assert stored["company_name"] == "TestCorp"
        assert stored["patent_ids"] == ["US123"]
 class TestBatchProcessing:
    """Test multi-company batch processing functionality."""
@@ -316,7 +501,7 @@ class TestBatchProcessing:
        assert callback.call_count == 2
-    def test_company_analysis_result_structure(self, mocker):
+    def test_company_analysis_result_structure(self, mocker, mock_db):
        """Test CompanyAnalysisResult has correct structure."""
        mock_query = mocker.patch("SPARC.analyzer.SERP.query")
        mock_save = mocker.patch("SPARC.analyzer.SERP.save_patents")
@@ -327,6 +512,9 @@ class TestBatchProcessing:
        patent = Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        mock_query.return_value = Patents(patents=[patent])
        # Simulate DB caching: after store, subsequent get returns the IDs
        mock_db.get_cached_serp_query.side_effect = [None, ["US123"]]
        def save_side_effect(p):
            p.pdf_path = "patents/US123.pdf"
            return p
@@ -1,7 +1,11 @@
 """Tests for SERP API patent retrieval and parsing functionality."""
 import os
 import pytest
 from unittest.mock import patch, Mock
 from datetime import datetime, timedelta
 from SPARC.serp_api import SERP
 from SPARC.types import Patent
 class TestTextCleaning:
@@ -176,3 +180,89 @@ class TestPatentMinimization:
        # Sections should be separated by double newlines
        assert "\n\n" in result
 class TestDynamicDateRange:
    """Test dynamic date range computation in SERP.query."""

    def test_query_uses_rolling_date_window(self, mocker):
        """Verify the date filter uses a rolling window, not hardcoded dates."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        SERP.query("TestCorp")

        # The search parameters are passed as the first positional argument.
        tbs = mock_search.call_args[0][0]["tbs"]
        # Should contain "cdr:1,cd_min:" with a date, not the old hardcoded one
        assert "cdr:1,cd_min:" in tbs
        assert "10/28/2025" not in tbs  # old hardcoded date gone

    def test_query_respects_days_back_param(self, mocker):
        """Verify days_back parameter controls the date window."""
        mock_search = mocker.patch("SPARC.serp_api.serpapi.search")
        mock_search.return_value = {"organic_results": []}
        mocker.patch("SPARC.serp_api.config.api_key", "fake-key")
        mocker.patch("SPARC.serp_api.config.patent_search_days", 90)

        window = timedelta(days=30)
        # Sample the clock on both sides of the call so a run that straddles
        # midnight does not fail spuriously.
        before = datetime.now()
        SERP.query("TestCorp", days_back=30)
        after = datetime.now()

        tbs = mock_search.call_args[0][0]["tbs"]
        # Build the unpadded M/D/YYYY string by hand: the "%-m"/"%-d" strftime
        # flags are a glibc extension and raise ValueError on Windows.
        acceptable_starts = {
            f"{day.month}/{day.day}/{day.year}"
            for day in (before - window, after - window)
        }
        assert any(start in tbs for start in acceptable_starts)
 class TestFilesystemPDFCaching:
    """save_patents should reuse PDFs already on disk and fetch the rest."""

    def test_save_patents_skips_download_when_cached(self, mocker, tmp_path):
        """A non-empty PDF that already exists is not fetched again."""
        http_spy = mocker.patch("SPARC.serp_api.requests.get")
        mocker.patch("SPARC.serp_api.os.makedirs")
        existing_pdf = tmp_path / "US123.pdf"
        existing_pdf.write_bytes(b"%PDF-1.4 fake content")
        # The filesystem probes are stubbed to report a present, non-empty file.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=100)

        saved = SERP.save_patents(
            Patent(patent_id="US123", pdf_link="http://example.com/test.pdf")
        )

        http_spy.assert_not_called()
        assert saved.pdf_path == "patents/US123.pdf"

    def test_save_patents_downloads_when_not_cached(self, mocker):
        """A PDF that is absent from disk triggers a download."""
        fake_response = Mock()
        fake_response.content = b"%PDF-1.4 content"
        http_spy = mocker.patch(
            "SPARC.serp_api.requests.get", return_value=fake_response
        )
        mocker.patch("SPARC.serp_api.os.makedirs")
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=False)
        # Intercept file writes so nothing touches the real filesystem.
        mocker.patch("builtins.open", mocker.mock_open())

        saved = SERP.save_patents(
            Patent(patent_id="US456", pdf_link="http://example.com/test.pdf")
        )

        http_spy.assert_called_once_with("http://example.com/test.pdf")
        assert saved.pdf_path == "patents/US456.pdf"

    def test_save_patents_redownloads_empty_files(self, mocker):
        """A zero-byte file counts as missing and is downloaded again."""
        fake_response = Mock()
        fake_response.content = b"%PDF-1.4 content"
        http_spy = mocker.patch(
            "SPARC.serp_api.requests.get", return_value=fake_response
        )
        mocker.patch("SPARC.serp_api.os.makedirs")
        # The file exists but is reported as 0 bytes long.
        mocker.patch("SPARC.serp_api.os.path.exists", return_value=True)
        mocker.patch("SPARC.serp_api.os.path.getsize", return_value=0)
        mocker.patch("builtins.open", mocker.mock_open())

        saved = SERP.save_patents(
            Patent(patent_id="US789", pdf_link="http://example.com/test.pdf")
        )

        http_spy.assert_called_once()
        assert saved.pdf_path == "patents/US789.pdf"
Author	SHA1	Message	Date
0xWheatyz	9c971dac72	fix(analyzer): route _analyze_company_safe through cache-aware path Build and Push Docker Images / build-api (push) Successful in 2m19s Details Build and Push Docker Images / build-frontend (push) Successful in 1m49s Details _analyze_company_safe was calling SERP.query directly, bypassing the SERP query cache in analyze_company. Now delegates fully to analyze_company() and reads patent_count from the serp_queries cache.	2026-03-24 15:02:19 -04:00
0xWheatyz	6f0b448044	test(analyzer,serp): add tests for caching, single query, and parallel processing - Add TestSingleQueryBugFix: verify SERP.query called once per analysis - Add TestPatentCaching: DB cache hit/miss, SERP query cache hit/miss - Add TestDynamicDateRange: rolling window, days_back param - Add TestFilesystemPDFCaching: skip download, redownload empty files - Add autouse mock_db fixture to prevent real DB connections in all tests	2026-03-24 14:39:09 -04:00
0xWheatyz	1a297eb60b	feat(analyzer): integrate DB patent and SERP query caching Before querying SERP API, check serp_queries cache (24h TTL). Before downloading/parsing each patent, check patents table for cached minimized_content. Store results after processing so repeated analyses skip all network I/O and PDF parsing entirely.	2026-03-24 14:35:24 -04:00
0xWheatyz	3154f6b732	feat(database): add patent/serp caching tables and connection pooling - Add patents table (patent_id PK, raw_sections JSONB, minimized_content) - Add serp_queries table (query_hash unique, result_patent_ids, expires_at) - Add cache methods: get/store_patent, get/store_serp_query - Replace single connection with ThreadedConnectionPool (min=2, max=10) - Add get_conn() context manager for thread-safe connection checkout - Legacy single-connection path preserved for backwards compatibility	2026-03-24 14:34:33 -04:00
0xWheatyz	b9bb3dc1cd	perf(analyzer): parallelize patent download/parse/minimize with threads Replace the sequential per-patent loop with a ThreadPoolExecutor (workers controlled by PATENT_THREAD_WORKERS config). Each patent is processed independently in _process_single_patent, which is thread-safe since SERP methods are stateless and operate on separate files.	2026-03-24 14:32:23 -04:00
0xWheatyz	90f9cfc826	fix(serp): replace hardcoded date range with rolling window The SERP query had a frozen date range (Oct-Nov 2025) that returned stale patents. Now computes a rolling window from config (PATENT_SEARCH_DAYS, default 90 days). Also adds filesystem-level PDF caching to skip re-downloading existing patent PDFs, and adds PATENT_THREAD_WORKERS config for upcoming parallel processing.	2026-03-24 14:31:43 -04:00
0xWheatyz	d387bbbdf3	fix(analyzer): eliminate double SERP.query() call per company analysis _analyze_company_safe called SERP.query() then passed the company name to analyze_company() which called SERP.query() again — doubling API usage. Now analyze_company() accepts an optional patents param so callers can pass pre-fetched results through.	2026-03-24 14:16:49 -04:00
0xWheatyz	fa564e5e1e	chore: forcing new git commit Build and Push Docker Images / build-frontend (push) Successful in 1m39s Details Build and Push Docker Images / build-api (push) Successful in 3m22s Details	2026-03-23 17:45:42 -04:00
0xWheatyz	2815deb221	fix(api): configure root_path for OpenAPI docs behind reverse proxy Build and Push Docker Images / build-api (push) Successful in 11s Details Build and Push Docker Images / build-frontend (push) Successful in 29s Details Add ROOT_PATH environment variable support so FastAPI generates correct URLs for Swagger UI when served behind nginx at /api. This fixes the "invalid version field" error when accessing /api/docs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-15 11:48:11 -04:00
0xWheatyz	56e8287720	fix(nginx): strip /api/ prefix when proxying to backend Build and Push Docker Images / build-frontend (push) Successful in 21s Details Build and Push Docker Images / build-api (push) Successful in 45s Details Add trailing slash to proxy_pass directive so nginx strips the /api/ prefix before forwarding requests to the API container. This fixes routes like /api/docs being passed as /api/docs instead of /docs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-15 02:51:28 -04:00
0xWheatyz	b969423957	chore(gitlab): cleaned up old git ci files Build and Push Docker Images / build-api (push) Successful in 9s Details Build and Push Docker Images / build-frontend (push) Successful in 31s Details	2026-03-15 02:40:28 -04:00
0xWheatyz	0dee4c5099	feat(ci): add timestamp-based image tags with commit hash Build and Push Docker Images / build-frontend (push) Successful in 5s Details Build and Push Docker Images / build-api (push) Successful in 18s Details Push images with versioned tags in format TIMESTAMP-COMMIT and frontend-TIMESTAMP-COMMIT for better traceability and rollback support. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-15 02:39:38 -04:00
0xWheatyz	03105a2f87	feat(ci): add timestamp-based image tags with commit hash Build and Push Docker Images / build-frontend (push) Successful in 6s Details Build and Push Docker Images / build-api (push) Successful in 18s Details Push images with versioned tags in format TIMESTAMP-COMMIT and frontend-TIMESTAMP-COMMIT for better traceability and rollback support. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-15 02:35:33 -04:00
0xWheatyz	28e2ded501	feat(frontend): make API endpoint configurable via environment variable Build and Push Docker Images / build-api (push) Successful in 17s Details Build and Push Docker Images / build-frontend (push) Successful in 23s Details Use nginx template support to allow API_URL to be passed at container runtime, enabling the same image to be deployed to different environments. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-15 01:09:15 -04:00
0xWheatyz	f87572ab7e	fix(ci): changed port to 80 as 3000 does not seem to work Build and Push Docker Images / build-frontend (push) Successful in 1m38s Details Build and Push Docker Images / build-api (push) Successful in 1m51s Details	2026-03-15 00:43:00 -04:00
0xWheatyz	44b6c79713	fix(ci): changed port to 3000 as 80 does not seem to work Build and Push Docker Images / build-frontend (push) Failing after 5s Details Build and Push Docker Images / build-api (push) Failing after 7s Details	2026-03-15 00:24:13 -04:00
0xWheatyz	13fe383116	fix(ci): use explicit port 80 for insecure registry Build and Push Docker Images / build-api (push) Successful in 2m50s Details Build and Push Docker Images / build-frontend (push) Successful in 1m20s Details - Remove http:// prefix from docker login (Docker ignores it) - Add :80 to registry address so Docker uses HTTP - Remove redundant daemon.json config (configured at daemon level) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-15 00:13:46 -04:00
0xWheatyz	dee3cbefbd	fix(ci): change internal dns name to specify http and the port Build and Push Docker Images / build-api (push) Failing after 5s Details Build and Push Docker Images / build-frontend (push) Failing after 8s Details	2026-03-15 00:06:42 -04:00
0xWheatyz	6acad4cff7	fix(ci): configure docker to use HTTP for internal registry Build and Push Docker Images / build-frontend (push) Failing after 8s Details Build and Push Docker Images / build-api (push) Failing after 10s Details Add insecure-registries configuration to allow HTTP connections to gitea.gitea.svc.cluster.local instead of HTTPS. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-14 19:37:14 -04:00
0xWheatyz	45ccd0b4e1	fix(ci): docker internal dns name does not support https Build and Push Docker Images / build-frontend (push) Failing after 5s Details Build and Push Docker Images / build-api (push) Failing after 7s Details	2026-03-14 19:19:20 -04:00
0xWheatyz	d108d4c7ea	fix(ci): internal dns name does not support https Build and Push Docker Images / build-api (push) Failing after 6s Details Build and Push Docker Images / build-frontend (push) Failing after 6s Details	2026-03-14 19:16:45 -04:00
0xWheatyz	068aecce61	fix(ci): moved domain to internal dns name, hopefully runner respects that and this negates the 502 error when too many requests are sent to vps Build and Push Docker Images / build-frontend (push) Failing after 4s Details Build and Push Docker Images / build-api (push) Failing after 6s Details	2026-03-14 19:15:15 -04:00
0xWheatyz	8790abfbf7	Merge pull request 'rewrite/frontend' (#2 ) from rewrite/frontend into main Build and Push Docker Images / build-api (push) Has been cancelled Details Build and Push Docker Images / build-frontend (push) Has been cancelled Details Reviewed-on: http://10.0.1.10/0xWheatyz/SPARC/pulls/2	2026-03-14 22:02:12 +00:00
0xWheatyz	fe0c5ca280	ci: add parallel frontend build job to workflow Split the single build job into two parallel jobs (build-api and build-frontend) to enable simultaneous container builds when multiple runners are available. Frontend images are tagged with frontend- prefix. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-14 17:37:04 -04:00
0xWheatyz	ed81ae4569	docs: update documentation for React frontend and cache mode Update all documentation to reflect recent changes: - Replace Streamlit dashboard references with React TypeScript dashboard - Update dashboard port from 8501 to 8080 - Add auth.py and database.py to architecture section - Change USE_DATABASE terminology to USE_CACHE - Add JWT_SECRET to environment variables reference - Document default admin credentials and user seeding 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2026-03-14 14:30:21 -04:00