feat: patent retrieval and semi-processing

2025-12-08 19:33:02 -05:00
parent b51f0596a3
commit 63a9889e5b
21 changed files with 301 additions and 54 deletions
@@ -1 +0,0 @@
-Get us some oreos!
@@ -1,6 +1,3 @@
 from .types import Patents, Patent

-all = [
-        "Patents",
-        "Patent"
-]
+# Public API of the package: the names exported by a star-import.
+# Must be spelled ``__all__``; a plain ``all`` only shadows the builtin.
+__all__ = ["Patents", "Patent"]
@@ -4,4 +4,3 @@ import os

 load_dotenv()
 api_key = os.getenv("API_KEY")
-
@@ -0,0 +1,81 @@
+import re
+import pdfplumber  # pip install pdfplumber
+from typing import Dict
+
+
+def parse_patent_pdf(pdf_path: str) -> Dict:
+    """Extract the main textual sections from a patent PDF.
+
+    Args:
+        pdf_path: Path of the PDF file to open with pdfplumber.
+
+    Returns:
+        Dict with the keys "abstract", "claims", "summary" and
+        "description". Each value is the cleaned text of that section, or
+        "" when none of the section's start patterns is found.
+    """
+    with pdfplumber.open(pdf_path) as pdf:
+        # A page without extractable text (e.g. a scanned drawing sheet) may
+        # yield None/"" depending on the pdfplumber version; treat it as
+        # empty instead of failing on ``None + "\n"``.
+        full_text = "".join(
+            (page.extract_text() or "") + "\n" for page in pdf.pages
+        )
+
+    # Heading patterns commonly seen in patents. Start patterns are listed
+    # from most to least specific because the first one that matches wins.
+    return {
+        "abstract": extract_section(
+            full_text,
+            start_patterns=[r"ABSTRACT", r"Abstract of the Disclosure"],
+            end_patterns=[r"BACKGROUND", r"FIELD OF", r"BRIEF DESCRIPTION"],
+        ),
+        "claims": extract_section(
+            full_text,
+            start_patterns=[r"What is claimed is:", r"CLAIMS?:", r"I claim:"],
+            end_patterns=[r"ABSTRACT", r"\*\s*\*\s*\*", r"$"],  # Often at end
+        ),
+        "summary": extract_section(
+            full_text,
+            start_patterns=[r"SUMMARY OF THE INVENTION", r"SUMMARY"],
+            end_patterns=[r"BRIEF DESCRIPTION", r"DETAILED DESCRIPTION"],
+        ),
+        "description": extract_section(
+            full_text,
+            start_patterns=[
+                r"DETAILED DESCRIPTION",
+                r"DESCRIPTION OF THE PREFERRED EMBODIMENT",
+            ],
+            end_patterns=[r"What is claimed", r"CLAIMS?:", r"I claim:"],
+        ),
+    }
+
+
+def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
+    """Return the cleaned text between a start heading and the next end heading.
+
+    Args:
+        text: Full document text to search.
+        start_patterns: Regexes tried in list order (case-insensitive); the
+            first one that matches anywhere in ``text`` marks the start, and
+            the section begins right after that match.
+        end_patterns: Regexes searched (case-insensitive) in the text that
+            follows the start match; the section ends at the earliest match.
+
+    Returns:
+        The section text passed through ``clean_patent_text``, or "" when no
+        start pattern matches. If no end pattern matches, the section runs
+        to the end of ``text``.
+    """
+    # Find start position
+    start_pos = None
+    for pattern in start_patterns:
+        match = re.search(pattern, text, re.IGNORECASE)
+        if match:
+            start_pos = match.end()
+            break
+
+    if start_pos is None:
+        return ""
+
+    remainder = text[start_pos:]
+
+    # Cut at the end heading that occurs first in the document. Taking the
+    # first pattern in *list* order would overrun the section whenever a
+    # later-listed heading appears sooner in the text.
+    end_matches = (re.search(p, remainder, re.IGNORECASE) for p in end_patterns)
+    end_offset = min(
+        (m.start() for m in end_matches if m),
+        default=len(remainder),
+    )
+
+    # Extract and clean
+    return clean_patent_text(remainder[:end_offset].strip())
+
+
+def clean_patent_text(text: str) -> str:
+    """Remove figure references and bare line numbers from extracted text.
+
+    Args:
+        text: Raw section text.
+
+    Returns:
+        ``text`` without "(see FIG. N)" / "FIG. N" references and without
+        lines that contain only a number, with runs of blank lines collapsed
+        to a single blank line and surrounding whitespace stripped.
+    """
+    # Remove figure references; the parenthesised form goes first so that no
+    # empty "(see )" is left behind by the bare pattern.
+    text = re.sub(r"\(see FIG\.\s*\d+[A-Z]?\)", "", text)
+    text = re.sub(r"FIG\.\s*\d+[A-Z]?", "", text)
+    # Remove line numbers (common in PDFs)
+    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)
+    # Collapse blank-line runs last: the removals above can empty whole
+    # lines, which would otherwise leave three or more consecutive newlines.
+    text = re.sub(r"\n\s*\n", "\n\n", text)
+    return text.strip()
@@ -1,8 +1,13 @@
 import serpapi
 from SPARC import config
+import re
+import pdfplumber  # pip install pdfplumber
+import requests
+from typing import Dict
+from SPARC.types import Patents, Patent

 class SERP:
-  def query(company: str):
+  def query(company: str) -> Patents:
    # Make API call
    params = {
      "engine": "google_patents",
@@ -17,6 +22,97 @@ class SERP:
    patent_ids = []
    list_of_patents = search["organic_results"]
    for patent in list_of_patents:
-        patent_ids.append(patent["publication_number"])
+        patent_ids.append(Patent(patent_id=patent["publication_number"], pdf_link=patent["pdf"], summary=None))
+
+    return Patents(patents=patent_ids)
+
+  def save_patents(patent: Patent) -> Patent:
+    """
+    Download the patent's PDF and save it in the patents folder
+
+    Args:
+      patent: Patent object; its ``pdf_link`` is fetched and its
+        ``patent_id`` is used as the file name
+
+    Returns:
+      The same Patent object with ``pdf_path`` set to the saved file
+
+    Raises:
+      requests.RequestException: on connection failure, timeout, or an
+        HTTP error status
+    """
+    # Local import: os is not among this file's visible top-level imports.
+    import os
+
+    # Bound the wait; without a timeout requests can block indefinitely.
+    response = requests.get(patent.pdf_link, timeout=30)
+    print(patent.pdf_link)
+    # Fail on 4xx/5xx instead of saving an error page under a .pdf name.
+    response.raise_for_status()
+
+    # open() does not create missing directories.
+    os.makedirs("patents", exist_ok=True)
+    pdf_path = f"patents/{patent.patent_id}.pdf"
+    with open(pdf_path, "wb") as f:
+      f.write(response.content)
+
+    patent.pdf_path = pdf_path
+    return patent
+
+  def parse_patent_pdf(pdf_path: str) -> Dict:
+    """Extract the main textual sections from a patent PDF.
+
+    Args:
+      pdf_path: Path of the PDF file to open with pdfplumber.
+
+    Returns:
+      Dict with the keys 'abstract', 'claims', 'summary' and 'description'.
+      Each value is the cleaned text of that section, or "" when none of
+      the section's start patterns is found.
+    """
+    with pdfplumber.open(pdf_path) as pdf:
+      # A page without extractable text (e.g. a scanned drawing sheet) may
+      # yield None/"" depending on the pdfplumber version; treat it as
+      # empty instead of failing on ``None + "\n"``.
+      full_text = "".join(
+        (page.extract_text() or "") + "\n" for page in pdf.pages
+      )
+
+    # Heading patterns commonly seen in patents. Start patterns are listed
+    # from most to least specific because the first one that matches wins.
+    return {
+      'abstract': SERP.extract_section(
+        full_text,
+        start_patterns=[r'ABSTRACT', r'Abstract of the Disclosure'],
+        end_patterns=[r'BACKGROUND', r'FIELD OF', r'BRIEF DESCRIPTION']
+      ),
+      'claims': SERP.extract_section(
+        full_text,
+        start_patterns=[r'What is claimed is:', r'CLAIMS?:', r'I claim:'],
+        end_patterns=[r'ABSTRACT', r'\*\s*\*\s*\*', r'$']  # Often at end
+      ),
+      'summary': SERP.extract_section(
+        full_text,
+        start_patterns=[r'SUMMARY OF THE INVENTION', r'SUMMARY'],
+        end_patterns=[r'BRIEF DESCRIPTION', r'DETAILED DESCRIPTION']
+      ),
+      'description': SERP.extract_section(
+        full_text,
+        start_patterns=[r'DETAILED DESCRIPTION', r'DESCRIPTION OF THE PREFERRED EMBODIMENT'],
+        end_patterns=[r'What is claimed', r'CLAIMS?:', r'I claim:']
+      )
+    }
+
+  def extract_section(text: str, start_patterns: list, end_patterns: list) -> str:
+    """Return the cleaned text between a start heading and the next end heading.
+
+    Args:
+      text: Full document text to search.
+      start_patterns: Regexes tried in list order (case-insensitive); the
+        first one that matches anywhere in ``text`` marks the start, and
+        the section begins right after that match.
+      end_patterns: Regexes searched (case-insensitive) in the text that
+        follows the start match; the section ends at the earliest match.
+
+    Returns:
+      The section text passed through ``SERP.clean_patent_text``, or ""
+      when no start pattern matches. If no end pattern matches, the
+      section runs to the end of ``text``.
+    """
+    # Find start position
+    start_pos = None
+    for pattern in start_patterns:
+      match = re.search(pattern, text, re.IGNORECASE)
+      if match:
+        start_pos = match.end()
+        break
+
+    if start_pos is None:
+      return ""
+
+    remainder = text[start_pos:]
+
+    # Cut at the end heading that occurs first in the document. Taking the
+    # first pattern in *list* order would overrun the section whenever a
+    # later-listed heading appears sooner in the text.
+    end_matches = (re.search(p, remainder, re.IGNORECASE) for p in end_patterns)
+    end_offset = min(
+      (m.start() for m in end_matches if m),
+      default=len(remainder),
+    )
+
+    # Extract and clean
+    return SERP.clean_patent_text(remainder[:end_offset].strip())
+
+  def clean_patent_text(text: str) -> str:
+    """Remove figure references and bare line numbers from extracted text.
+
+    Args:
+      text: Raw section text.
+
+    Returns:
+      ``text`` without "(see FIG. N)" / "FIG. N" references and without
+      lines that contain only a number, with runs of blank lines collapsed
+      to a single blank line and surrounding whitespace stripped.
+    """
+    # Remove figure references; the parenthesised form goes first so that no
+    # empty "(see )" is left behind by the bare pattern.
+    text = re.sub(r'\(see FIG\.\s*\d+[A-Z]?\)', '', text)
+    text = re.sub(r'FIG\.\s*\d+[A-Z]?', '', text)
+    # Remove line numbers (common in PDFs)
+    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
+    # Collapse blank-line runs last: the removals above can empty whole
+    # lines, which would otherwise leave three or more consecutive newlines.
+    text = re.sub(r'\n\s*\n', '\n\n', text)
+    return text.strip()

-    return patent_ids
@@ -1,11 +1,14 @@
-from dataclass import dataclass
+from dataclasses import dataclass
+

@dataclass
 class Patent:
-  patent_id: int
-  pdf_link: str
-  summary: dict
+    """One patent from a search result, plus data derived from its PDF."""
+
+    # Filled from the search result's "publication_number" and used as the
+    # saved PDF's file name; presumably an alphanumeric string rather than
+    # an int -- confirm against the search API's response format.
+    patent_id: str
+    # URL the PDF is downloaded from.
+    pdf_link: str
+    # Local path of the downloaded PDF; None until it has been saved.
+    pdf_path: str | None = None
+    # Parsed content for the patent; None until populated.
+    summary: dict | None = None
+

@dataclass
 class Patents:
-  patents: list[Patent]
+    # Container for a list of Patent records.
+    patents: list[Patent]