import io
import logging
import re
from datetime import datetime, timedelta
from typing import Dict

import pdfplumber  # pip install pdfplumber
import requests
import serpapi

from SPARC import config
from SPARC.storage import StorageBackend, get_storage_backend
from SPARC.types import Patent, Patents

logger = logging.getLogger(__name__)

# Module-level storage instance (lazy-initialized)
_storage: StorageBackend | None = None

# Seconds to wait on a PDF download before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
_PDF_DOWNLOAD_TIMEOUT = 30


def _get_storage() -> StorageBackend:
    """Return the shared storage backend, creating it on first use."""
    global _storage
    if _storage is None:
        _storage = get_storage_backend()
    return _storage


def _format_filter_date(value: datetime) -> str:
    """Format a date as M/D/YYYY without zero padding.

    Built by hand instead of ``strftime('%-m/%-d/%Y')`` because the ``%-m``
    and ``%-d`` directives are platform-specific and fail on Windows.
    """
    return f"{value.month}/{value.day}/{value.year}"


class SERP:
    """Namespace of helpers for fetching and parsing Google Patents results.

    All methods are static: they hold no instance state and are invoked as
    ``SERP.method(...)``.
    """

    @staticmethod
    def query(company: str, days_back: int | None = None) -> Patents:
        """Query Google Patents for a company's recent patents.

        Args:
            company: Name of the company to search for
            days_back: Number of days to look back for patents
                (default from config)

        Returns:
            Patents object containing list of patents with PDF links.
            Empty when the response carries no "organic_results" section.

        Raises:
            KeyError: If a result has a PDF link but no "publication_number".

        Note:
            Patents without PDF download links are skipped. This occurs when
            Google Patents doesn't have a PDF available for a particular
            patent (e.g., recently filed patents, certain international
            patents, or patents with restricted access). The returned count
            may be lower than the requested number of results.
        """
        if days_back is None:
            days_back = config.patent_search_days

        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        date_filter = (
            f"cdr:1,cd_min:{_format_filter_date(start_date)},"
            f"cd_max:{_format_filter_date(end_date)}"
        )

        # Make API call
        # NOTE(review): `tbs`/`filter` look like Google Search parameters; the
        # google_patents engine appears to document `before`/`after` for date
        # ranges - confirm that this date filter is actually honoured.
        params = {
            "engine": "google_patents",
            "q": company,
            "num": 10,
            "filter": 1,
            "tbs": date_filter,
            "api_key": config.api_key,
        }
        logger.info(
            "Querying Google Patents for '%s' (last %d days)", company, days_back
        )
        search = serpapi.search(params)

        # A response without an "organic_results" section is treated as
        # "nothing found" rather than letting the KeyError escape.
        try:
            list_of_patents = search["organic_results"]
        except KeyError:
            logger.info("No organic results returned for '%s'", company)
            list_of_patents = []

        # Convert results to Patent objects, skipping any without PDF links
        patent_ids = []
        for patent in list_of_patents:
            pdf_link = patent.get("pdf")
            if pdf_link:
                patent_ids.append(
                    Patent(
                        patent_id=patent["publication_number"],
                        pdf_link=pdf_link,
                        summary=None,
                    )
                )
            else:
                logger.debug(
                    "Skipping patent %s (no PDF link)",
                    patent.get("publication_number", "unknown"),
                )

        logger.info(
            "Found %d patents with PDF links for '%s'", len(patent_ids), company
        )
        return Patents(patents=patent_ids)

    @staticmethod
    def save_patents(patent: Patent) -> Patent:
        """Save the patent PDF to storage, skipping download if already cached.

        Uses the configured storage backend (local filesystem or S3).

        Args:
            patent: Patent object

        Returns:
            Patent object with updated PDF path

        Raises:
            requests.exceptions.RequestException: If the download times out,
                fails, or the server answers with an HTTP error status.
        """
        storage = _get_storage()
        key = f"{patent.patent_id}.pdf"

        if storage.exists(key):
            logger.debug("Using cached PDF for %s", patent.patent_id)
        else:
            logger.info("Downloading PDF for %s", patent.patent_id)
            response = requests.get(patent.pdf_link, timeout=_PDF_DOWNLOAD_TIMEOUT)
            # Fail before writing: otherwise an HTTP error page would be
            # stored under the PDF key and served from cache on later calls.
            response.raise_for_status()
            storage.write(key, response.content)
            logger.debug(
                "Saved %d bytes for %s", len(response.content), patent.patent_id
            )

        patent.pdf_path = storage.path_for(key)
        return patent

    @staticmethod
    def parse_patent_pdf(pdf_path: str) -> Dict[str, str]:
        """Extract structured sections from patent PDF.

        Extracts all major sections from a patent PDF including abstract,
        claims, summary, and detailed description. Supports both local file
        paths and S3 URIs (s3://bucket/key).

        Args:
            pdf_path: Local path or S3 URI to the patent PDF file

        Returns:
            Dictionary with the keys 'abstract', 'claims', 'summary' and
            'description'; a section that was not found maps to "".
        """
        logger.debug("Parsing patent PDF: %s", pdf_path)

        if pdf_path.startswith("s3://"):
            # Read from S3 via storage backend
            storage = _get_storage()
            # Extract key from "s3://bucket/key"
            key = pdf_path.split("/", 3)[-1]
            data = storage.read(key)
            pdf_file: io.BytesIO | str = io.BytesIO(data)
        else:
            pdf_file = pdf_path

        with pdfplumber.open(pdf_file) as pdf:
            # Fall back to "" per page so that a None result from
            # extract_text() cannot break the concatenation.
            page_texts = [page.extract_text() or "" for page in pdf.pages]

        full_text = "".join(text + "\n" for text in page_texts)
        logger.debug(
            "Extracted text from %d pages (%d chars)",
            len(page_texts),
            len(full_text),
        )

        # Define section patterns (common in patents)
        sections = {
            'abstract': SERP.extract_section(
                full_text,
                start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
                end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION'],
            ),
            'claims': SERP.extract_section(
                full_text,
                start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
                end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$'],  # Often at end
            ),
            'summary': SERP.extract_section(
                full_text,
                start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
                end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION'],
            ),
            'description': SERP.extract_section(
                full_text,
                start_patterns=[
                    r'DETAILED DESCRIPTION',
                    r'DESCRIPTION OF THE PREFERRED EMBODIMENT',
                ],
                end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:'],
            ),
        }
        return sections

    @staticmethod
    def minimize_patent_for_llm(sections: Dict) -> str:
        """Minimize patent content for LLM consumption.

        Removes bloated sections (detailed description) and keeps only
        essential information: abstract, claims, and summary. This reduces
        token usage while preserving the core innovation details.

        Args:
            sections: Dictionary of parsed patent sections from
                parse_patent_pdf()

        Returns:
            Concatenated string of essential patent sections ready for LLM
            analysis; missing or empty sections are omitted.
        """
        # Output order is fixed: abstract, then claims, then summary.
        # 'description' is deliberately left out - it's too verbose and
        # contains implementation details not needed for high-level analysis.
        essential_keys = (
            ('abstract', "ABSTRACT:\n"),
            ('claims', "CLAIMS:\n"),
            ('summary', "SUMMARY:\n"),
        )
        essential_parts = [
            heading + sections[key]
            for key, heading in essential_keys
            if sections.get(key)
        ]
        return "\n\n".join(essential_parts)

    @staticmethod
    def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
        """Extract text between start and end patterns.

        Patterns are regular expressions matched case-insensitively. In each
        list the first pattern (in list order) that matches anywhere wins,
        not the pattern whose match occurs earliest in the text.

        Args:
            text: Full text to search
            start_patterns: Candidate patterns marking the section start
            end_patterns: Candidate patterns marking the section end, searched
                only after the start match

        Returns:
            The cleaned section text, or "" when no start pattern matches.
            When no end pattern matches, the section runs to the end of text.
        """
        # Find start position
        start_pos = None
        for pattern in start_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                start_pos = match.end()
                break

        if start_pos is None:
            return ""

        # Find end position
        end_pos = len(text)
        remainder = text[start_pos:]
        for pattern in end_patterns:
            match = re.search(pattern, remainder, re.IGNORECASE)
            if match:
                end_pos = start_pos + match.start()
                break

        # Extract and clean
        section_text = text[start_pos:end_pos].strip()
        return SERP.clean_patent_text(section_text)

    @staticmethod
    def clean_patent_text(text: str) -> str:
        """Remove noise from extracted text.

        Args:
            text: Raw section text

        Returns:
            Text with blank-line runs collapsed, figure references removed and
            digit-only lines emptied, stripped of surrounding whitespace.
        """
        # Remove excessive whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)

        # Remove figure references
        text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
        text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)

        # Remove line numbers (common in PDFs)
        text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

        return text.strip()