feat: add patent content minimization for LLM consumption

Implemented the minimize_patent_for_llm() function, which reduces patent
content by keeping only the essential sections (abstract, claims, summary)
and explicitly excluding the verbose detailed description section.

This reduces token usage while preserving core innovation details
needed for company performance estimation.

Added comprehensive test coverage (5 new tests) for:
- Essential section inclusion
- Description section exclusion
- Missing section handling
- Empty section handling
- Section separator formatting

All 13 tests passing.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
0xWheatyz 2026-02-19 18:54:07 -05:00
parent 6533cef56f
commit 26a23c02ae
2 changed files with 121 additions and 5 deletions

View File

@ -45,18 +45,28 @@ class SERP:
return patent
def parse_patent_pdf(pdf_path: str) -> Dict:
"""Extract structured sections from patent PDF"""
"""Extract structured sections from patent PDF.
Extracts all major sections from a patent PDF including abstract,
claims, summary, and detailed description.
Args:
pdf_path: Path to the patent PDF file
Returns:
Dictionary containing all extracted sections
"""
with pdfplumber.open(pdf_path) as pdf:
# Extract all text
full_text = ""
for page in pdf.pages:
full_text += page.extract_text() + "\n"
# Define section patterns (common in patents)
sections = {
'abstract': SERP.extract_section(
full_text,
full_text,
start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION']
),
@ -76,9 +86,41 @@ class SERP:
end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:']
)
}
return sections
def minimize_patent_for_llm(sections: Dict) -> str:
    """Minimize patent content for LLM consumption.

    Keeps only the abstract, claims, and summary sections (in that order)
    and leaves out everything else, notably the detailed description.
    This reduces token usage while preserving the core innovation details.

    Args:
        sections: Dictionary of parsed patent sections from
            parse_patent_pdf(), mapping section names to their text.

    Returns:
        The retained sections, each prefixed with an upper-case header
        line (e.g. "ABSTRACT:") and separated by a blank line. Sections
        that are missing, empty, or whitespace-only are skipped, so the
        result is an empty string when no essential section has content.
    """
    # (section key, header) pairs in output order. 'description' is
    # deliberately not listed: it is too verbose and contains
    # implementation details not needed for high-level analysis.
    essential_sections = (
        ("abstract", "ABSTRACT:"),
        ("claims", "CLAIMS:"),
        ("summary", "SUMMARY:"),
    )

    essential_parts = []
    for key, header in essential_sections:
        text = sections.get(key)
        # A whitespace-only section would otherwise emit a bare header
        # with no content, spending tokens on nothing.
        if not text or not text.strip():
            continue
        # The section text itself is passed through unmodified.
        essential_parts.append(header + "\n" + text)

    return "\n\n".join(essential_parts)
def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
"""Extract text between start and end patterns"""

View File

@ -102,3 +102,77 @@ class TestSectionExtraction:
end_patterns=[r"BACKGROUND"],
)
assert "This is the abstract in lowercase" in result
class TestPatentMinimization:
    """Test patent content minimization for LLM consumption."""

    def test_minimize_includes_all_essential_sections(self):
        """Test that all essential sections are included in minimized output."""
        sections = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description...",
        }

        result = SERP.minimize_patent_for_llm(sections)

        assert "ABSTRACT:" in result
        assert "This is the abstract." in result
        assert "CLAIMS:" in result
        assert "1. A method for doing X." in result
        assert "SUMMARY:" in result
        assert "This invention relates to X." in result

    def test_minimize_excludes_description(self):
        """Test that detailed description is excluded from minimized output."""
        sections = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description that should be excluded.",
        }

        result = SERP.minimize_patent_for_llm(sections)

        assert "Very long detailed description" not in result
        assert "DESCRIPTION:" not in result

    def test_minimize_handles_missing_sections(self):
        """Test that minimization handles missing sections gracefully."""
        sections = {
            "abstract": "This is the abstract.",
            # claims missing
            # summary missing
            "description": "Description text.",
        }

        result = SERP.minimize_patent_for_llm(sections)

        # Only the abstract is available, so the output is exactly that
        # section: no headers for the missing ones and no description.
        assert isinstance(result, str)
        assert result == "ABSTRACT:\nThis is the abstract."
        assert "CLAIMS:" not in result
        assert "SUMMARY:" not in result

    def test_minimize_with_empty_sections(self):
        """Test that empty sections are handled properly."""
        sections = {
            "abstract": "",
            "claims": "1. A method.",
            "summary": "",
        }

        result = SERP.minimize_patent_for_llm(sections)

        # Empty sections should not appear, not even as a bare header.
        assert "ABSTRACT:" not in result
        assert "SUMMARY:" not in result
        assert result.count("CLAIMS:") == 1
        assert "1. A method." in result

    def test_minimize_separates_sections_with_double_newline(self):
        """Test that sections are properly separated."""
        sections = {
            "abstract": "Abstract text.",
            "claims": "Claims text.",
            "summary": "Summary text.",
        }

        result = SERP.minimize_patent_for_llm(sections)

        # Pin the full layout: each header on its own line, and a blank
        # line between every pair of adjacent sections. A plain
        # '"\n\n" in result' check would pass with only one separator.
        assert result == (
            "ABSTRACT:\nAbstract text.\n\n"
            "CLAIMS:\nClaims text.\n\n"
            "SUMMARY:\nSummary text."
        )