Implemented minimize_patent_for_llm() function that reduces patent content by keeping only essential sections (abstract, claims, summary) and explicitly excludes the verbose detailed description section. This reduces token usage while preserving core innovation details needed for company performance estimation. Added comprehensive test coverage (5 new tests) for: - Essential section inclusion - Description section exclusion - Missing section handling - Empty section handling - Section separator formatting All 13 tests passing. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
161 lines
4.9 KiB
Python
161 lines
4.9 KiB
Python
import serpapi
|
|
from SPARC import config
|
|
import re
|
|
import pdfplumber # pip install pdfplumber
|
|
import requests
|
|
from typing import Dict
|
|
from SPARC.types import Patents, Patent
|
|
|
|
class SERP:
    """Helpers for finding, downloading, parsing, and condensing patents.

    Wraps the SerpApi Google Patents engine for search, ``requests`` for
    PDF download, and ``pdfplumber`` for text extraction.  Every method is
    stateless, so all are exposed as static methods and should be called
    directly on the class, e.g. ``SERP.query("Acme Corp")``.
    """

    @staticmethod
    def query(company: str) -> Patents:
        """Search Google Patents (via SerpApi) for patents matching *company*.

        Args:
            company: Free-text search query, typically a company name.

        Returns:
            A ``Patents`` container with one ``Patent`` per organic result,
            holding the publication number and PDF link (``summary`` unset).

        Raises:
            KeyError: if the SerpApi response lacks ``organic_results`` or a
                result lacks ``publication_number``/``pdf``.
        """
        params = {
            "engine": "google_patents",
            "q": company,
            "num": 10,
            # presumably enables SerpApi duplicate filtering — confirm
            "filter": 1,
            # NOTE(review): hard-coded one-week creation-date window looks
            # stale (Oct 28 – Nov 4 2025); confirm whether this should be
            # parameterized or removed.
            "tbs": "cdr:1,cd_min:10/28/2025,cd_max:11/4/2025",
            "api_key": config.api_key,
        }
        search = serpapi.search(params)

        # Convert raw organic results into typed Patent records.
        # NOTE(review): assumes every result carries "publication_number"
        # and "pdf" keys — a missing key raises KeyError here.
        found = [
            Patent(
                patent_id=result["publication_number"],
                pdf_link=result["pdf"],
                summary=None,
            )
            for result in search["organic_results"]
        ]
        return Patents(patents=found)

    @staticmethod
    def save_patents(patent: Patent) -> Patent:
        """Download a patent's PDF into the local ``patents/`` folder.

        Args:
            patent: Patent whose ``pdf_link`` points at the PDF to fetch.

        Returns:
            The same ``Patent`` with ``pdf_path`` set to the saved file.

        Raises:
            requests.HTTPError: if the download returns an error status
                (prevents silently saving an HTML error page as a ``.pdf``).
        """
        import os  # function-scope import keeps the module import block untouched

        # Bounded timeout so a dead server cannot hang the pipeline forever.
        response = requests.get(patent.pdf_link, timeout=30)
        response.raise_for_status()

        os.makedirs("patents", exist_ok=True)
        pdf_path = f"patents/{patent.patent_id}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        patent.pdf_path = pdf_path
        return patent

    @staticmethod
    def parse_patent_pdf(pdf_path: str) -> Dict:
        """Extract structured sections from a patent PDF.

        Extracts the abstract, claims, summary, and detailed description
        by locating conventional patent section headings in the PDF text.

        Args:
            pdf_path: Path to the patent PDF file.

        Returns:
            Dict mapping section name ('abstract', 'claims', 'summary',
            'description') to the cleaned section text; a section that
            cannot be located maps to an empty string.
        """
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() returns None for image-only pages — guard with
            # "or ''" so a scanned page cannot raise TypeError.
            full_text = "\n".join(
                (page.extract_text() or "") for page in pdf.pages
            ) + "\n"

        # Heading patterns conventional in US patent documents.
        sections = {
            'abstract': SERP.extract_section(
                full_text,
                start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
                end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION']
            ),
            'claims': SERP.extract_section(
                full_text,
                start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
                end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$']  # often at end
            ),
            'summary': SERP.extract_section(
                full_text,
                start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
                end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION']
            ),
            'description': SERP.extract_section(
                full_text,
                start_patterns=[r'DETAILED DESCRIPTION', r'DESCRIPTION OF THE PREFERRED EMBODIMENT'],
                end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:']
            ),
        }
        return sections

    @staticmethod
    def minimize_patent_for_llm(sections: Dict) -> str:
        """Minimize patent content for LLM consumption.

        Keeps only the essential sections — abstract, claims, and summary —
        and deliberately excludes the verbose 'description' section, which
        contains implementation detail not needed for high-level analysis.
        This keeps token usage low while preserving the core innovation.

        Args:
            sections: Dict of parsed patent sections from parse_patent_pdf().

        Returns:
            Labeled sections joined by blank lines, ready for LLM analysis;
            empty string if no essential section is present.
        """
        # Labels paired with their section keys, in priority order
        # (claims are the legally binding definition of the invention).
        wanted = [
            ("ABSTRACT", "abstract"),
            ("CLAIMS", "claims"),
            ("SUMMARY", "summary"),
        ]
        essential_parts = [
            f"{label}:\n{sections[key]}"
            for label, key in wanted
            if sections.get(key)
        ]
        return "\n\n".join(essential_parts)

    @staticmethod
    def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
        """Extract the text between the first matching start and end pattern.

        Args:
            text: Full document text to search.
            start_patterns: Regex alternatives marking the section start;
                the first that matches (case-insensitive) wins.
            end_patterns: Regex alternatives marking the section end,
                searched after the start match; falls back to end-of-text.

        Returns:
            The cleaned section text, or "" if no start pattern matches.
        """
        # Locate section start: first pattern that matches wins.
        start_pos = None
        for pattern in start_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                start_pos = match.end()
                break

        if start_pos is None:
            return ""

        # Locate section end relative to the start; default to end-of-text.
        end_pos = len(text)
        for pattern in end_patterns:
            match = re.search(pattern, text[start_pos:], re.IGNORECASE)
            if match:
                end_pos = start_pos + match.start()
                break

        section_text = text[start_pos:end_pos].strip()
        return SERP.clean_patent_text(section_text)

    @staticmethod
    def clean_patent_text(text: str) -> str:
        """Remove PDF-extraction noise from a section's text.

        Collapses runs of blank lines, strips figure references, and drops
        lines that contain only a number (page/line numbers from the PDF).

        Args:
            text: Raw extracted section text.

        Returns:
            The cleaned, stripped text.
        """
        # Collapse blank-line runs to a single blank line.
        text = re.sub(r'\n\s*\n', '\n\n', text)
        # Remove parenthesized and bare figure references.
        text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
        text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)
        # Remove lines holding only a number (common PDF line numbering).
        text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
        return text.strip()
|
|