From 5141d9dd47dd3dd9c948e9588e947168aa1a81ed Mon Sep 17 00:00:00 2001
From: 0xWheatyz <wyatt@leeworks.dev>
Date: Fri, 13 Mar 2026 15:37:31 -0400
Subject: [PATCH] feat: add token usage estimation utility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add script to estimate token usage and costs for patent analysis.
Uses tiktoken with cl100k_base encoding to approximate Claude's
tokenizer. Includes cost calculations based on OpenRouter pricing
and supports both sample-based and actual patent content estimation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 scripts/estimate_tokens.py | 227 +++++++++++++++++++++++++++++++++++++
 1 file changed, 227 insertions(+)
 create mode 100644 scripts/estimate_tokens.py

diff --git a/scripts/estimate_tokens.py b/scripts/estimate_tokens.py
new file mode 100644
index 0000000..0f947e4
--- /dev/null
+++ b/scripts/estimate_tokens.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+"""Estimate token usage per company portfolio for SPARC analysis."""
+
+import tiktoken
+from typing import Dict, List, Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class TokenEstimate:
+    """Token usage and cost estimate for one company portfolio, as returned by TokenEstimator."""
+    company_name: str  # company the estimate was computed for
+    patent_count: int  # number of patents in the analyzed list
+    prompt_tokens: int  # input tokens, summed over every prompt that was counted
+    estimated_completion_tokens: int  # assumed output size: sum of the per-call max-output caps
+    total_tokens: int  # prompt_tokens + estimated_completion_tokens
+    cost_estimate_usd: float  # input cost + output cost, in US dollars
+
+
+class TokenEstimator:
+    """Estimate prompt/completion token counts and USD cost for SPARC patent analysis."""
+
+    # Claude 3.5 Sonnet pricing via OpenRouter (per 1M tokens)
+    INPUT_COST_PER_1M = 3.00   # $3.00 per 1M input tokens
+    OUTPUT_COST_PER_1M = 15.00  # $15.00 per 1M output tokens
+
+    # Estimated output tokens based on max_tokens settings
+    SINGLE_PATENT_MAX_OUTPUT = 1024
+    PORTFOLIO_MAX_OUTPUT = 2048
+
+    # Filler prose for estimate_from_sample; a run of one repeated letter does not tokenize like text
+    SAMPLE_TEXT = "A method and system for processing input data using a trained neural network. "
+
+    def __init__(self):
+        # Use cl100k_base encoding (closest to Claude's tokenizer)
+        self.encoder = tiktoken.get_encoding("cl100k_base")
+
+    def count_tokens(self, text: str) -> int:
+        """Return how many tokens the encoder produces for ``text``."""
+        return len(self.encoder.encode(text))
+
+    def build_single_patent_prompt(self, patent_content: str, company_name: str) -> str:
+        """Build the single-patent analysis prompt (wording is intended to match llm.py)."""
+        return f"""You are a patent analyst evaluating {company_name}'s innovation strategy.
+
+Analyze the following patent content and provide insights on:
+1. Innovation quality and novelty
+2. Technical complexity and defensibility
+3. Market potential and commercial viability
+4. Strategic positioning relative to industry trends
+
+Patent Content:
+{patent_content}
+
+Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""
+
+    def build_portfolio_prompt(self, patents_data: List[Dict[str, str]], company_name: str) -> str:
+        """Build the portfolio analysis prompt (wording is intended to match llm.py)."""
+        # One "Patent N (id):" section per patent, numbered from 1 and separated by "---" rules
+        combined_content = "\n\n---\n\n".join(
+            f"Patent {idx} ({patent['patent_id']}):\n{patent['content']}"
+            for idx, patent in enumerate(patents_data, 1)
+        )
+
+        return f"""You are analyzing {company_name}'s patent portfolio to estimate their future performance and innovation trajectory.
+
+You have {len(patents_data)} recent patents to analyze. Evaluate the portfolio holistically:
+
+1. Innovation Trends: What technology areas are they focusing on?
+2. Strategic Direction: What does this reveal about their business strategy?
+3. Competitive Position: How defensible are these innovations?
+4. Market Outlook: What market opportunities do these patents target?
+5. Performance Forecast: Based on this innovation activity, what's your assessment of their likely performance?
+
+Patent Portfolio:
+{combined_content}
+
+Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""
+
+    def estimate_portfolio(
+        self,
+        company_name: str,
+        patents_data: List[Dict[str, str]],
+        include_individual_patents: bool = False
+    ) -> TokenEstimate:
+        """Estimate tokens and USD cost for one portfolio analysis; output size is assumed, not measured.
+
+        Args:
+            company_name: Name of the company (interpolated into every prompt)
+            patents_data: List of dicts with 'patent_id' and 'content' keys
+            include_individual_patents: If True, also count one single-patent call per patent
+        """
+        # Portfolio analysis tokens
+        prompt_tokens = self.count_tokens(self.build_portfolio_prompt(patents_data, company_name))
+        completion_tokens = self.PORTFOLIO_MAX_OUTPUT
+
+        # Optionally add individual patent analysis
+        if include_individual_patents:
+            for patent in patents_data:
+                single_prompt = self.build_single_patent_prompt(patent['content'], company_name)
+                prompt_tokens += self.count_tokens(single_prompt)
+                completion_tokens += self.SINGLE_PATENT_MAX_OUTPUT
+
+        # Input and output tokens are priced at different per-1M rates
+        input_cost = (prompt_tokens / 1_000_000) * self.INPUT_COST_PER_1M
+        output_cost = (completion_tokens / 1_000_000) * self.OUTPUT_COST_PER_1M
+
+        return TokenEstimate(
+            company_name=company_name,
+            patent_count=len(patents_data),
+            prompt_tokens=prompt_tokens,
+            estimated_completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+            cost_estimate_usd=input_cost + output_cost
+        )
+
+    def estimate_from_sample(
+        self,
+        company_name: str,
+        patent_count: int = 10,
+        avg_patent_chars: int = 5000,
+        include_individual_patents: bool = False
+    ) -> TokenEstimate:
+        """Estimate tokens for a synthetic portfolio of equally sized placeholder patents.
+
+        Args:
+            company_name: Name of the company
+            patent_count: Number of patents (default 10, typical from SERP)
+            avg_patent_chars: Characters of filler per patent (SAMPLE_TEXT repeated, then cut to length)
+            include_individual_patents: Forwarded to estimate_portfolio (adds per-patent calls)
+        """
+        # Repeat prose instead of a single letter so the token count resembles real text
+        sample_content = (self.SAMPLE_TEXT * (avg_patent_chars // len(self.SAMPLE_TEXT) + 1))[:avg_patent_chars]
+        patents_data = [
+            {"patent_id": f"US{10000000 + i}", "content": sample_content}
+            for i in range(patent_count)
+        ]
+
+        return self.estimate_portfolio(company_name, patents_data, include_individual_patents)
+
+
+def main():
+    """Print demo estimates: per-company samples, scaling by portfolio size, a two-patent example."""
+    rule = "=" * 70
+    est = TokenEstimator()
+
+    print(rule)
+    print("SPARC Token Usage Estimator")
+    print(rule)
+
+    # Demo 1: same-sized placeholder portfolios for a handful of companies
+    print("\n📊 Sample Estimates (10 patents, ~5000 chars each):\n")
+
+    grand_tokens, grand_cost = 0, 0.0
+    names = ["Apple Inc.", "Microsoft Corporation", "Tesla Motors", "Google LLC"]
+    for name in names:
+        result = est.estimate_from_sample(name, patent_count=10, avg_patent_chars=5000)
+        grand_tokens += result.total_tokens
+        grand_cost += result.cost_estimate_usd
+        print(
+            f"  {name}:",
+            f"    Patents: {result.patent_count}",
+            f"    Prompt tokens: {result.prompt_tokens:,}",
+            f"    Est. completion tokens: {result.estimated_completion_tokens:,}",
+            f"    Total tokens: {result.total_tokens:,}",
+            f"    Est. cost: ${result.cost_estimate_usd:.4f}",
+            "",
+            sep="\n",
+        )
+
+    print("-" * 70)
+    print(f"  TOTAL for {len(names)} companies:")
+    print(f"    Total tokens: {grand_tokens:,}")
+    print(f"    Total est. cost: ${grand_cost:.4f}")
+
+    # Demo 2: how the estimate grows with the number of patents
+    print(f"\n{rule}")
+    print("📈 Token Scaling by Portfolio Size:")
+    print(f"{rule}\n")
+
+    for size in (5, 10, 15, 20):
+        result = est.estimate_from_sample("Sample Corp", patent_count=size)
+        print(f"  {size} patents: {result.prompt_tokens:,} prompt tokens, ${result.cost_estimate_usd:.4f}")
+
+    # Demo 3: two short hand-written patents in ABSTRACT / CLAIMS / SUMMARY layout
+    print(f"\n{rule}")
+    print("📝 Example with Real Patent Structure:")
+    print(f"{rule}\n")
+
+    ml_patent = """ABSTRACT: A method for machine learning optimization using gradient descent.
+
+CLAIMS:
+1. A computer-implemented method comprising:
+   receiving input data;
+   processing the input data through a neural network;
+   optimizing weights using backpropagation.
+
+SUMMARY: This invention relates to improvements in neural network training efficiency."""
+
+    cloud_patent = """ABSTRACT: System for distributed computing in cloud environments.
+
+CLAIMS:
+1. A distributed system comprising:
+   a plurality of compute nodes;
+   a load balancer;
+   a message queue for task distribution.
+
+SUMMARY: The present disclosure improves cloud computing resource allocation."""
+    demo_patents = [
+        {"patent_id": "US11234567", "content": ml_patent},
+        {"patent_id": "US11234568", "content": cloud_patent},
+    ]
+
+    result = est.estimate_portfolio("Tech Corp", demo_patents)
+    print(
+        f"  Company: {result.company_name}",
+        f"  Patents analyzed: {result.patent_count}",
+        f"  Prompt tokens: {result.prompt_tokens:,}",
+        f"  Est. completion: {result.estimated_completion_tokens:,}",
+        f"  Total: {result.total_tokens:,}",
+        f"  Est. cost: ${result.cost_estimate_usd:.4f}",
+        sep="\n",
+    )
+
+
+if __name__ == "__main__":  # run the demo only when executed as a script, not on import
+    main()