def minimize_patent_for_llm(sections: Dict) -> str:
    """Build a compact, LLM-ready text from parsed patent sections.

    Keeps only the abstract, claims and summary (in that order), each under
    an upper-case heading, and joins them with a blank line. Every other
    key, including the verbose 'description', is ignored to reduce token
    usage.

    Sections that are missing, ``None``, empty, or made up solely of
    whitespace are skipped, so a heading is never emitted without content.
    The text of a kept section is included unchanged.

    The function takes no ``self``/``cls`` parameter; the tests invoke it on
    the class as ``SERP.minimize_patent_for_llm(sections)``.

    Args:
        sections: Mapping of section name to extracted text, as produced by
            parse_patent_pdf(). ``None`` or an empty mapping is accepted.

    Returns:
        The retained sections, each as a heading line followed by its text,
        separated by one blank line. An empty string when nothing usable is
        present.
    """
    if not sections:
        return ""

    # (key, heading) pairs in output order. 'description' is deliberately
    # absent: it is too verbose for high-level analysis.
    essential = (
        ('abstract', "ABSTRACT:\n"),
        ('claims', "CLAIMS:\n"),
        ('summary', "SUMMARY:\n"),
    )

    essential_parts = []
    for key, heading in essential:
        text = sections.get(key)
        if not text:
            continue
        # A whitespace-only string is truthy but carries no content; skipping
        # it avoids a bare heading in the output.
        if isinstance(text, str) and not text.strip():
            continue
        essential_parts.append(heading + text)

    return "\n\n".join(essential_parts)
class TestPatentMinimization:
    """Test patent content minimization for LLM consumption."""

    def test_minimize_includes_all_essential_sections(self):
        """All essential sections appear, each under its heading, in order."""
        sections = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description...",
        }
        result = SERP.minimize_patent_for_llm(sections)

        assert "ABSTRACT:" in result
        assert "This is the abstract." in result
        assert "CLAIMS:" in result
        assert "1. A method for doing X." in result
        assert "SUMMARY:" in result
        assert "This invention relates to X." in result
        # Headings are emitted in the order abstract, claims, summary.
        assert (
            result.index("ABSTRACT:")
            < result.index("CLAIMS:")
            < result.index("SUMMARY:")
        )

    def test_minimize_excludes_description(self):
        """The detailed description never reaches the minimized output."""
        sections = {
            "abstract": "This is the abstract.",
            "claims": "1. A method for doing X.",
            "summary": "This invention relates to X.",
            "description": "Very long detailed description that should be excluded.",
        }
        result = SERP.minimize_patent_for_llm(sections)

        assert "Very long detailed description" not in result
        assert "DESCRIPTION:" not in result

    def test_minimize_handles_missing_sections(self):
        """Missing sections are skipped without error and leave no heading."""
        sections = {
            "abstract": "This is the abstract.",
            # claims missing
            # summary missing
            "description": "Description text.",
        }
        result = SERP.minimize_patent_for_llm(sections)

        assert isinstance(result, str)
        assert "ABSTRACT:" in result
        assert "This is the abstract." in result
        # Absent sections must not leave a heading behind.
        assert "CLAIMS:" not in result
        assert "SUMMARY:" not in result
        assert result == "ABSTRACT:\nThis is the abstract."

    def test_minimize_with_empty_sections(self):
        """Empty-string sections are dropped together with their heading."""
        sections = {
            "abstract": "",
            "claims": "1. A method.",
            "summary": "",
        }
        result = SERP.minimize_patent_for_llm(sections)

        # Empty sections should not appear
        assert "ABSTRACT:" not in result
        assert "SUMMARY:" not in result
        assert result.count("CLAIMS:") == 1
        assert "1. A method." in result
        assert result == "CLAIMS:\n1. A method."

    def test_minimize_separates_sections_with_double_newline(self):
        """Sections are joined by exactly one blank line."""
        sections = {
            "abstract": "Abstract text.",
            "claims": "Claims text.",
            "summary": "Summary text.",
        }
        result = SERP.minimize_patent_for_llm(sections)

        # Pin the full layout rather than just the presence of "\n\n".
        assert result == (
            "ABSTRACT:\nAbstract text."
            "\n\n"
            "CLAIMS:\nClaims text."
            "\n\n"
            "SUMMARY:\nSummary text."
        )
        assert len(result.split("\n\n")) == 3