Files
SPARC/SPARC/serp_api.py
T
agent-company fbb72fe2a5 ci: add pytest and ruff linting to CI, fix all lint errors
- Add test job to build.yaml that runs pytest and ruff before building images
- Add standalone test.yaml workflow for PRs
- Add ruff.toml with E/F/I rules configured
- Fix all ruff lint errors: sort imports, remove unused imports, fix re-exports
- Build jobs now depend on test job passing (needs: test)

Closes leeworks-agents/SPARC#18
Closes leeworks-agents/SPARC#19

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 07:04:00 +00:00

194 lines
6.1 KiB
Python

import os
import re
from datetime import datetime, timedelta
from typing import Dict
import pdfplumber # pip install pdfplumber
import requests
import serpapi
from SPARC import config
from SPARC.types import Patent, Patents
class SERP:
    """Namespace of helpers for finding, downloading and parsing patents.

    Every method is a static method: none of them read instance state, and
    the class is used purely as a namespace (``SERP.query(...)``). Declaring
    them ``@staticmethod`` keeps ``SERP.method(...)`` working and also makes
    ``SERP().method(...)`` behave correctly.
    """

    # Upper bound (seconds) on waiting for the PDF host, so a stalled
    # connection cannot hang the caller indefinitely.
    _DOWNLOAD_TIMEOUT_SECONDS = 30

    @staticmethod
    def _format_filter_date(moment: datetime) -> str:
        """Format a date as ``M/D/YYYY`` without zero padding.

        Built from the integer fields instead of ``strftime('%-m/%-d/%Y')``
        because the ``%-m`` / ``%-d`` flags are a platform extension and
        raise ``ValueError`` on platforms that lack them (e.g. Windows).
        """
        return f"{moment.month}/{moment.day}/{moment.year}"

    @staticmethod
    def query(company: str, days_back: int = None) -> "Patents":
        """Query Google Patents for a company's recent patents.

        Args:
            company: Name of the company to search for
            days_back: Number of days to look back for patents (default from config)

        Returns:
            Patents object containing list of patents with PDF links

        Note:
            Patents without PDF download links are skipped. This occurs when
            Google Patents doesn't have a PDF available for a particular patent
            (e.g., recently filed patents, certain international patents, or
            patents with restricted access). The returned count may be lower
            than the requested number of results. A response that carries no
            ``organic_results`` key at all yields an empty Patents list.
        """
        if days_back is None:
            days_back = config.patent_search_days

        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        # NOTE(review): confirm the google_patents engine honours ``tbs``;
        # the SerpApi docs for that engine describe ``before``/``after``.
        date_filter = (
            f"cdr:1,cd_min:{SERP._format_filter_date(start_date)}"
            f",cd_max:{SERP._format_filter_date(end_date)}"
        )

        # Make API call
        params = {
            "engine": "google_patents",
            "q": company,
            "num": 10,
            "filter": 1,
            "tbs": date_filter,
            "api_key": config.api_key,
        }
        search = serpapi.search(params)

        # A search with no hits may omit "organic_results" entirely; treat
        # that as "no patents" rather than letting a KeyError escape.
        try:
            organic_results = search["organic_results"]
        except KeyError:
            organic_results = []

        # Convert results to Patent objects, skipping any without PDF links
        # (see the Note in the docstring for why a link can be missing).
        patents = []
        for result in organic_results:
            pdf_link = result.get("pdf")
            if pdf_link:
                patents.append(
                    Patent(
                        patent_id=result["publication_number"],
                        pdf_link=pdf_link,
                        summary=None,
                    )
                )
        return Patents(patents=patents)

    @staticmethod
    def save_patents(patent: "Patent") -> "Patent":
        """Save the patent PDF to the patents folder, skipping download if already cached.

        Args:
            patent: Patent object

        Returns:
            Patent object with updated PDF path

        Raises:
            requests.RequestException: If the download times out, fails, or
                the server answers with an HTTP error status. Nothing is
                written to the cache path in that case.
        """
        pdf_path = f"patents/{patent.patent_id}.pdf"
        os.makedirs("patents", exist_ok=True)

        already_cached = os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
        if not already_cached:
            response = requests.get(
                patent.pdf_link, timeout=SERP._DOWNLOAD_TIMEOUT_SECONDS
            )
            # Without this check an HTML error page would be stored as the
            # "PDF" and then be treated as a valid cache hit on later calls.
            response.raise_for_status()

            # Write to a sibling file and rename, so an interrupted write
            # never leaves a truncated, non-empty file at the cache path.
            partial_path = pdf_path + ".part"
            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, pdf_path)

        patent.pdf_path = pdf_path
        return patent

    @staticmethod
    def parse_patent_pdf(pdf_path: str) -> Dict:
        """Extract structured sections from patent PDF.

        Extracts all major sections from a patent PDF including abstract,
        claims, summary, and detailed description.

        Args:
            pdf_path: Path to the patent PDF file

        Returns:
            Dictionary with the keys 'abstract', 'claims', 'summary' and
            'description'; a section that cannot be located maps to "".
        """
        with pdfplumber.open(pdf_path) as pdf:
            # Extract all text, one trailing newline per page. A page with
            # no extractable text is treated as empty (guards against a
            # None result from extract_text()).
            full_text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

        # Define section patterns (common in patents)
        sections = {
            'abstract': SERP.extract_section(
                full_text,
                start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
                end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION']
            ),
            'claims': SERP.extract_section(
                full_text,
                start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
                end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$']  # Often at end
            ),
            'summary': SERP.extract_section(
                full_text,
                start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
                end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION']
            ),
            'description': SERP.extract_section(
                full_text,
                start_patterns=[r'DETAILED DESCRIPTION', r'DESCRIPTION OF THE PREFERRED EMBODIMENT'],
                end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:']
            )
        }
        return sections

    @staticmethod
    def minimize_patent_for_llm(sections: Dict) -> str:
        """Minimize patent content for LLM consumption.

        Removes bloated sections (detailed description) and keeps only
        essential information: abstract, claims, and summary. This reduces
        token usage while preserving the core innovation details.

        Args:
            sections: Dictionary of parsed patent sections from parse_patent_pdf()

        Returns:
            Concatenated string of essential patent sections ready for LLM
            analysis; "" when none of the essential sections is present.
        """
        # (heading, key) pairs in output order. 'description' is deliberately
        # absent: it is too verbose and holds implementation details that are
        # not needed for high-level analysis.
        essential_sections = (
            ("ABSTRACT:\n", 'abstract'),  # Concise overview of the invention
            ("CLAIMS:\n", 'claims'),      # The legal claims (most important)
            ("SUMMARY:\n", 'summary'),    # High-level description
        )
        essential_parts = [
            heading + sections[key]
            for heading, key in essential_sections
            if sections.get(key)
        ]
        return "\n\n".join(essential_parts)

    @staticmethod
    def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
        """Extract text between start and end patterns.

        Patterns are regular expressions matched case-insensitively. Both
        lists are priority-ordered: the first pattern in the list that
        matches anywhere wins, even if a later pattern would match earlier
        in the text.

        Args:
            text: Full text to search
            start_patterns: Candidate patterns marking where the section begins
            end_patterns: Candidate patterns marking where the section ends,
                searched only in the text after the start match

        Returns:
            The cleaned section text, or "" when no start pattern matches.
            When no end pattern matches, the section runs to the end of text.
        """
        # Find start position
        start_pos = None
        for pattern in start_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                start_pos = match.end()
                break
        if start_pos is None:
            return ""

        # Find end position
        end_pos = len(text)
        remainder = text[start_pos:]
        for pattern in end_patterns:
            match = re.search(pattern, remainder, re.IGNORECASE)
            if match:
                end_pos = start_pos + match.start()
                break

        # Extract and clean
        section_text = text[start_pos:end_pos].strip()
        return SERP.clean_patent_text(section_text)

    @staticmethod
    def clean_patent_text(text: str) -> str:
        """Remove noise from extracted text.

        Args:
            text: Raw section text

        Returns:
            Text with blank-line runs collapsed, figure references and
            number-only lines removed, and surrounding whitespace stripped.
        """
        # Remove excessive whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)
        # Remove figure references (parenthesised form first, so the
        # surrounding "(see ...)" goes with it)
        text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
        text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)
        # Remove line numbers (common in PDFs)
        text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
        return text.strip()