"""Google Patents search (through SerpApi) and patent-PDF parsing helpers."""

import os
import re
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import pdfplumber  # pip install pdfplumber
import requests
import serpapi

from SPARC import config
from SPARC.types import Patent, Patents

# Upper bound (seconds) on a single PDF download, so a stalled server cannot
# block the caller indefinitely.
_PDF_DOWNLOAD_TIMEOUT = 30


def _format_search_date(moment: datetime) -> str:
    """Format *moment* as ``M/D/YYYY`` without zero padding.

    Built from the integer fields instead of ``strftime('%-m/%-d/%Y')``
    because the ``%-m`` / ``%-d`` flags are a platform extension and are
    rejected on Windows.
    """
    return f"{moment.month}/{moment.day}/{moment.year}"


class SERP:
    """Stateless helpers for fetching patents and extracting their text.

    Every method is a ``staticmethod``: none of them reads instance or class
    state, and they are invoked as ``SERP.method(...)``.
    """

    @staticmethod
    def query(company: str, days_back: Optional[int] = None) -> Patents:
        """Query Google Patents for a company's recent patents.

        Args:
            company: Name of the company to search for
            days_back: Number of days to look back for patents
                (defaults to ``config.patent_search_days``)

        Returns:
            Patents object containing list of patents with PDF links

        Note:
            Patents without PDF download links are skipped. This occurs when
            Google Patents doesn't have a PDF available for a particular patent
            (e.g., recently filed patents, certain international patents, or
            patents with restricted access). The returned count may therefore
            be lower than the 10 results requested from the API. A response
            with no ``organic_results`` entry yields an empty Patents object.
        """
        if days_back is None:
            days_back = config.patent_search_days

        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        date_filter = (
            f"cdr:1,cd_min:{_format_search_date(start_date)},"
            f"cd_max:{_format_search_date(end_date)}"
        )

        # Make API call
        params = {
            "engine": "google_patents",
            "q": company,
            "num": 10,
            "filter": 1,
            "tbs": date_filter,
            "api_key": config.api_key,
        }
        search = serpapi.search(params)

        # A search with no hits has no "organic_results" entry; treat that as
        # an empty result set rather than letting the KeyError escape.
        try:
            results = search["organic_results"]
        except KeyError:
            results = []

        # Convert results to Patent objects, skipping any without PDF links
        patents = [
            Patent(
                patent_id=result["publication_number"],
                pdf_link=result["pdf"],
                summary=None,
            )
            for result in results
            if result.get("pdf")
        ]
        return Patents(patents=patents)

    @staticmethod
    def save_patents(patent: Patent) -> Patent:
        """Save the patent PDF to the patents folder, skipping download if cached.

        A file counts as cached when ``patents/<patent_id>.pdf`` already exists
        and is non-empty.

        Args:
            patent: Patent object; its ``patent_id`` and ``pdf_link`` are read

        Returns:
            The same Patent object with ``pdf_path`` set to the local file path

        Raises:
            requests.RequestException: If the download fails, times out, or the
                server answers with an HTTP error status. Nothing is written to
                disk in that case, so an error page is never cached as a PDF.
        """
        pdf_path = f"patents/{patent.patent_id}.pdf"
        os.makedirs("patents", exist_ok=True)

        already_cached = os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
        if not already_cached:
            response = requests.get(patent.pdf_link, timeout=_PDF_DOWNLOAD_TIMEOUT)
            # Fail before opening the file: a 4xx/5xx body must not be stored
            # under the .pdf name, where it would later pass the cache check.
            response.raise_for_status()
            with open(pdf_path, "wb") as pdf_file:
                pdf_file.write(response.content)

        patent.pdf_path = pdf_path
        return patent

    @staticmethod
    def parse_patent_pdf(pdf_path: str) -> Dict[str, str]:
        """Extract structured sections from patent PDF.

        Extracts the abstract, claims, summary, and detailed description by
        searching the concatenated page text for common section headings.

        Args:
            pdf_path: Path to the patent PDF file

        Returns:
            Dictionary with keys ``abstract``, ``claims``, ``summary`` and
            ``description``; a section that is not found maps to ``""``
        """
        with pdfplumber.open(pdf_path) as pdf:
            # Guard with `or ""` in case extract_text() yields None for a page
            # without a text layer, so such pages contribute just a newline.
            full_text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

        # Define section patterns (common in patents)
        return {
            'abstract': SERP.extract_section(
                full_text,
                start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
                end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION'],
            ),
            'claims': SERP.extract_section(
                full_text,
                start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
                end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$'],  # Often at end
            ),
            'summary': SERP.extract_section(
                full_text,
                start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
                end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION'],
            ),
            'description': SERP.extract_section(
                full_text,
                start_patterns=[
                    r'DETAILED DESCRIPTION',
                    r'DESCRIPTION OF THE PREFERRED EMBODIMENT',
                ],
                end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:'],
            ),
        }

    @staticmethod
    def minimize_patent_for_llm(sections: Dict) -> str:
        """Minimize patent content for LLM consumption.

        Keeps only the abstract, claims, and summary, in that order, each
        prefixed with an upper-case heading. The detailed description is
        deliberately left out: it is verbose and not needed for high-level
        analysis, so dropping it reduces token usage.

        Args:
            sections: Dictionary of parsed patent sections from
                parse_patent_pdf()

        Returns:
            The non-empty essential sections joined by blank lines; an empty
            string when none of them is present
        """
        essential_keys = (
            ('abstract', "ABSTRACT:\n"),  # concise overview of the invention
            ('claims', "CLAIMS:\n"),      # legal claims defining the invention
            ('summary', "SUMMARY:\n"),    # high-level description
        )
        essential_parts = [
            heading + sections[key]
            for key, heading in essential_keys
            if sections.get(key)
        ]
        return "\n\n".join(essential_parts)

    @staticmethod
    def extract_section(
        text: str, start_patterns: List[str], end_patterns: List[str]
    ) -> str:
        """Extract text between start and end patterns.

        Patterns are regular expressions matched case-insensitively. Both lists
        are tried in order and the first pattern that matches wins, so earlier
        entries take priority over later ones regardless of where they match.

        Args:
            text: Full text to search
            start_patterns: Candidate regexes marking the section heading; the
                section begins right after the match
            end_patterns: Candidate regexes marking the end of the section,
                searched only in the text after the start match

        Returns:
            The cleaned section text, or ``""`` when no start pattern matches.
            If no end pattern matches, the section runs to the end of *text*.
        """
        # Find start position
        start_pos = None
        for pattern in start_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                start_pos = match.end()
                break

        if start_pos is None:
            return ""

        # Find end position, searching only the text that follows the heading
        remainder = text[start_pos:]
        end_pos = len(text)
        for pattern in end_patterns:
            match = re.search(pattern, remainder, re.IGNORECASE)
            if match:
                end_pos = start_pos + match.start()
                break

        # Extract and clean
        section_text = text[start_pos:end_pos].strip()
        return SERP.clean_patent_text(section_text)

    @staticmethod
    def clean_patent_text(text: str) -> str:
        """Remove noise from extracted text.

        Applies, in order: collapsing of whitespace-only gaps between lines to
        a single blank line, removal of figure references such as
        ``(see FIG. 3A)`` and ``FIG. 3A``, and removal of lines that contain
        only digits. The result is stripped of leading/trailing whitespace.

        Args:
            text: Raw section text

        Returns:
            The cleaned text
        """
        # Remove excessive whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)

        # Remove figure references (parenthesised form first, so the
        # surrounding "(see ...)" wrapper goes with it)
        text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
        text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)

        # Remove line numbers (common in PDFs)
        text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

        return text.strip()