# SPARC/SPARC/serp_api.py

import os
import re
from datetime import datetime, timedelta
from typing import Dict

import pdfplumber  # pip install pdfplumber
import requests
import serpapi

from SPARC import config
from SPARC.types import Patent, Patents


class SERP:
  """Stateless helpers for finding, downloading and parsing patents.

  Every method is a static method, so it can be called on the class
  (``SERP.query(...)``) as well as on an instance.
  """

  # Upper bound (seconds) on a single PDF download so a stalled server
  # cannot hang the caller indefinitely.
  _DOWNLOAD_TIMEOUT_SECONDS = 30

  @staticmethod
  def _format_date(moment: datetime) -> str:
    """Format a date as M/D/YYYY without zero padding.

    Built from the numeric fields instead of ``strftime('%-m/%-d/%Y')``
    because the ``%-m`` / ``%-d`` directives are a glibc extension and
    fail on Windows.
    """
    return f"{moment.month}/{moment.day}/{moment.year}"

  @staticmethod
  def query(company: str, days_back: int = None) -> Patents:
    """Query Google Patents for a company's recent patents.

    Args:
        company: Name of the company to search for
        days_back: Number of days to look back for patents (default from config)

    Returns:
        Patents object containing list of patents with PDF links. The list
        is empty when the search response has no ``organic_results``.

    Note:
        Patents without PDF download links are skipped. This occurs when
        Google Patents doesn't have a PDF available for a particular patent
        (e.g., recently filed patents, certain international patents, or
        patents with restricted access). The returned count may be lower
        than the requested number of results.
    """
    if days_back is None:
      days_back = config.patent_search_days
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    date_filter = (
      f"cdr:1,cd_min:{SERP._format_date(start_date)}"
      f",cd_max:{SERP._format_date(end_date)}"
    )

    # NOTE(review): `tbs` is the Google Search style date filter; confirm the
    # google_patents engine honors it (its docs describe `before`/`after`).
    params = {
      "engine": "google_patents",
      "q": company,
      "num": 10,
      "filter": 1,
      "tbs": date_filter,
      "api_key": config.api_key,
    }
    search = serpapi.search(params)

    # A search with no hits has no "organic_results" key; treat it as empty
    # rather than raising KeyError.
    results = search.get("organic_results") or []

    # Convert results to Patent objects, skipping any without PDF links
    # (see docstring for details).
    patents = [
      Patent(
        patent_id=result["publication_number"],
        pdf_link=result["pdf"],
        summary=None,
      )
      for result in results
      if result.get("pdf")
    ]
    return Patents(patents=patents)

  @staticmethod
  def save_patents(patent: Patent) -> Patent:
    """
    Save the patent PDF to the patents folder, skipping download if already cached.

    Args:
      patent: Patent object

    Returns:
      Patent object with updated PDF path

    Raises:
      requests.RequestException: If the download times out, fails, or the
        server answers with an HTTP error status. Nothing is written to
        disk in that case, so an error page is never cached as a PDF.
    """
    pdf_path = f"patents/{patent.patent_id}.pdf"
    os.makedirs("patents", exist_ok=True)

    # An empty file is treated as a failed earlier download, not a cache hit.
    cached = os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
    if not cached:
      response = requests.get(
        patent.pdf_link, timeout=SERP._DOWNLOAD_TIMEOUT_SECONDS
      )
      # Fail loudly on 4xx/5xx instead of saving the error body as a PDF.
      response.raise_for_status()
      with open(pdf_path, "wb") as f:
        f.write(response.content)

    patent.pdf_path = pdf_path
    return patent

  @staticmethod
  def parse_patent_pdf(pdf_path: str) -> Dict:
    """Extract structured sections from patent PDF.

    Extracts all major sections from a patent PDF including abstract,
    claims, summary, and detailed description.

    Args:
      pdf_path: Path to the patent PDF file

    Returns:
      Dictionary with the keys 'abstract', 'claims', 'summary' and
      'description'; a section that could not be located maps to "".
    """
    with pdfplumber.open(pdf_path) as pdf:
      # A page without a text layer may yield None; treat it as empty text
      # so one image-only page does not abort the whole parse.
      full_text = "".join(
        (page.extract_text() or "") + "\n" for page in pdf.pages
      )

    # Define section patterns (common in patents)
    sections = {
      'abstract': SERP.extract_section(
        full_text,
        start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
        end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION']
      ),
      'claims': SERP.extract_section(
        full_text,
        start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
        end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$']  # Often at end
      ),
      'summary': SERP.extract_section(
        full_text,
        start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
        end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION']
      ),
      'description': SERP.extract_section(
        full_text,
        start_patterns=[r'DETAILED DESCRIPTION', r'DESCRIPTION OF THE PREFERRED EMBODIMENT'],
        end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:']
      )
    }

    return sections

  @staticmethod
  def minimize_patent_for_llm(sections: Dict) -> str:
    """Minimize patent content for LLM consumption.

    Removes bloated sections (detailed description) and keeps only
    essential information: abstract, claims, and summary. This reduces
    token usage while preserving the core innovation details.

    Args:
      sections: Dictionary of parsed patent sections from parse_patent_pdf()

    Returns:
      Concatenated string of essential patent sections ready for LLM
      analysis; missing or empty sections are omitted.
    """
    # (section key, heading) in output order. 'description' is deliberately
    # left out: it is too verbose and contains implementation details not
    # needed for high-level analysis.
    essential = (
      ('abstract', "ABSTRACT:\n"),
      ('claims', "CLAIMS:\n"),
      ('summary', "SUMMARY:\n"),
    )
    essential_parts = [
      heading + sections[key]
      for key, heading in essential
      if sections.get(key)
    ]
    return "\n\n".join(essential_parts)

  @staticmethod
  def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
    """Extract text between start and end patterns.

    Patterns are regular expressions matched case-insensitively. Both lists
    are in priority order: the first pattern in the list that matches
    anywhere wins, even if a later pattern would match earlier in the text.

    Args:
      text: Full text to search
      start_patterns: Candidate regexes marking the start of the section;
        the section begins right after the match
      end_patterns: Candidate regexes marking the end of the section,
        searched only after the start; if none match, the section runs to
        the end of the text

    Returns:
      The cleaned section text, or "" if no start pattern matches.
    """
    # Find start position
    start_pos = None
    for pattern in start_patterns:
      match = re.search(pattern, text, re.IGNORECASE)
      if match:
        start_pos = match.end()
        break

    if start_pos is None:
      return ""

    # Find end position, looking only at the text after the start marker
    remainder = text[start_pos:]
    end_pos = len(text)
    for pattern in end_patterns:
      match = re.search(pattern, remainder, re.IGNORECASE)
      if match:
        end_pos = start_pos + match.start()
        break

    # Extract and clean
    section_text = text[start_pos:end_pos].strip()
    return SERP.clean_patent_text(section_text)

  @staticmethod
  def clean_patent_text(text: str) -> str:
    """Remove noise from extracted text.

    Collapses runs of blank lines, strips "FIG. N" figure references and
    blanks out lines that consist only of digits.

    Args:
      text: Raw section text

    Returns:
      The cleaned text with leading and trailing whitespace removed.
    """
    # Remove excessive whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text)
    # Remove figure references (parenthesised form first so no empty
    # "(see )" shell is left behind)
    text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
    text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)
    # Remove line numbers (common in PDFs)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    return text.strip()