Files
SPARC/scripts/estimate_tokens.py
0xWheatyz 5141d9dd47 feat: add token usage estimation utility
Add script to estimate token usage and costs for patent analysis.
Uses tiktoken with cl100k_base encoding to approximate Claude's
tokenizer. Includes cost calculations based on OpenRouter pricing
and supports both sample-based and actual patent content estimation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-13 15:37:31 -04:00

228 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""Estimate token usage per company portfolio for SPARC analysis."""
import tiktoken
from typing import Dict, List, Optional
from dataclasses import dataclass
@dataclass
class TokenEstimate:
    """Result record for the projected token usage of one company portfolio."""

    # Company the estimate was computed for.
    company_name: str
    # Number of patents included in the portfolio.
    patent_count: int
    # Input-side tokens (counted from the generated prompt text).
    prompt_tokens: int
    # Output-side tokens assumed for the model's response(s).
    estimated_completion_tokens: int
    # Combined input + output token figure supplied by the caller.
    total_tokens: int
    # Projected spend in US dollars.
    cost_estimate_usd: float
class TokenEstimator:
    """Estimate token usage and cost for SPARC patent analysis prompts.

    Prompts are rebuilt locally, tokenized with a tiktoken encoding, and
    priced with the per-million-token rates defined on this class. Output
    tokens are not predicted; the configured ``max_tokens`` ceilings are
    used as a worst-case figure.
    """

    # Claude 3.5 Sonnet pricing via OpenRouter (USD per 1M tokens)
    INPUT_COST_PER_1M = 3.00  # $3.00 per 1M input tokens
    OUTPUT_COST_PER_1M = 15.00  # $15.00 per 1M output tokens

    # Estimated output tokens based on max_tokens settings
    SINGLE_PATENT_MAX_OUTPUT = 1024
    PORTFOLIO_MAX_OUTPUT = 2048

    # Patent-like filler used by estimate_from_sample(). A run of a single
    # repeated character is not representative of prose and is likely to be
    # merged into unusually long BPE tokens, which would skew the estimate.
    _PLACEHOLDER_TEXT = (
        "ABSTRACT: A computer-implemented method and system for processing "
        "sensor data, wherein a processor receives input signals from a "
        "plurality of devices, normalizes the signals, and applies a trained "
        "model to generate a control output. CLAIMS: 1. A method comprising: "
        "receiving, at a processor, input data from one or more sensors; "
        "transforming the input data into a feature representation; and "
        "transmitting a result to a remote device over a network. SUMMARY: "
        "The present disclosure relates to improvements in the efficiency "
        "and reliability of data processing systems. "
    )

    def __init__(self, encoding_name: str = "cl100k_base"):
        """Load the tokenizer used for counting.

        Args:
            encoding_name: tiktoken encoding to load. Defaults to
                cl100k_base, used here as an approximation of Claude's
                tokenizer.
        """
        self.encoder = tiktoken.get_encoding(encoding_name)

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to.

        Special-token markers that happen to appear in the text (for example
        ``<|endoftext|>``) are counted as ordinary text. With tiktoken's
        default settings such input raises ``ValueError``, which is not
        useful for a pure counting utility.
        """
        return len(self.encoder.encode(text, disallowed_special=()))

    @classmethod
    def _placeholder_content(cls, length: int) -> str:
        """Return exactly ``length`` characters of patent-like filler text."""
        if length <= 0:
            return ""
        filler = cls._PLACEHOLDER_TEXT
        repeats = -(-length // len(filler))  # ceiling division
        return (filler * repeats)[:length]

    def build_single_patent_prompt(self, patent_content: str, company_name: str) -> str:
        """Build the prompt for a single-patent analysis call.

        Intended to mirror the prompt used in llm.py; that file is not
        visible from here, so keep the two in sync manually.
        """
        return f"""You are a patent analyst evaluating {company_name}'s innovation strategy.
Analyze the following patent content and provide insights on:
1. Innovation quality and novelty
2. Technical complexity and defensibility
3. Market potential and commercial viability
4. Strategic positioning relative to industry trends
Patent Content:
{patent_content}
Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""

    def build_portfolio_prompt(self, patents_data: List[Dict[str, str]], company_name: str) -> str:
        """Build the prompt for a whole-portfolio analysis call.

        Intended to mirror the prompt used in llm.py; that file is not
        visible from here, so keep the two in sync manually.

        Args:
            patents_data: List of dicts with 'patent_id' and 'content' keys.
            company_name: Name of the company being analyzed.

        Raises:
            KeyError: If a patent dict lacks 'patent_id' or 'content'.
        """
        sections = [
            f"Patent {idx} ({patent['patent_id']}):\n{patent['content']}"
            for idx, patent in enumerate(patents_data, 1)
        ]
        combined_content = "\n\n---\n\n".join(sections)
        return f"""You are analyzing {company_name}'s patent portfolio to estimate their future performance and innovation trajectory.
You have {len(patents_data)} recent patents to analyze. Evaluate the portfolio holistically:
1. Innovation Trends: What technology areas are they focusing on?
2. Strategic Direction: What does this reveal about their business strategy?
3. Competitive Position: How defensible are these innovations?
4. Market Outlook: What market opportunities do these patents target?
5. Performance Forecast: Based on this innovation activity, what's your assessment of their likely performance?
Patent Portfolio:
{combined_content}
Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""

    def estimate_portfolio(
        self,
        company_name: str,
        patents_data: List[Dict[str, str]],
        include_individual_patents: bool = False
    ) -> "TokenEstimate":
        """Estimate tokens and cost for a company portfolio analysis.

        Args:
            company_name: Name of the company.
            patents_data: List of dicts with 'patent_id' and 'content' keys.
            include_individual_patents: If True, also count one
                single-patent analysis call per patent.

        Returns:
            A TokenEstimate whose completion figure is the sum of the
            max-output ceilings for every call counted.
        """
        # One portfolio-level call is always counted.
        prompt_tokens = self.count_tokens(
            self.build_portfolio_prompt(patents_data, company_name)
        )
        completion_tokens = self.PORTFOLIO_MAX_OUTPUT

        # Optionally add one single-patent call per patent.
        if include_individual_patents:
            for patent in patents_data:
                single_prompt = self.build_single_patent_prompt(
                    patent["content"], company_name
                )
                prompt_tokens += self.count_tokens(single_prompt)
                completion_tokens += self.SINGLE_PATENT_MAX_OUTPUT

        # Input and output tokens are billed at different rates.
        input_cost = (prompt_tokens / 1_000_000) * self.INPUT_COST_PER_1M
        output_cost = (completion_tokens / 1_000_000) * self.OUTPUT_COST_PER_1M

        return TokenEstimate(
            company_name=company_name,
            patent_count=len(patents_data),
            prompt_tokens=prompt_tokens,
            estimated_completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            cost_estimate_usd=input_cost + output_cost,
        )

    def estimate_from_sample(
        self,
        company_name: str,
        patent_count: int = 10,
        avg_patent_chars: int = 5000,
        include_individual_patents: bool = False
    ) -> "TokenEstimate":
        """Estimate tokens using synthetic patents of an average size.

        Args:
            company_name: Name of the company.
            patent_count: Number of patents (default 10, typical from SERP).
            avg_patent_chars: Average characters per minimized patent content.
            include_individual_patents: Forwarded to estimate_portfolio();
                if True, per-patent analysis calls are counted as well.
        """
        # Every synthetic patent shares the same filler body; only the id differs.
        sample_content = self._placeholder_content(avg_patent_chars)
        patents_data = [
            {"patent_id": f"US{10000000 + i}", "content": sample_content}
            for i in range(patent_count)
        ]
        return self.estimate_portfolio(
            company_name,
            patents_data,
            include_individual_patents=include_individual_patents,
        )
def main():
    """Print three demonstration reports: fixed-size samples, scaling, and a small hand-written portfolio."""
    estimator = TokenEstimator()
    rule = "=" * 70

    print(rule)
    print("SPARC Token Usage Estimator")
    print(rule)

    # Report 1: identical synthetic portfolios for a handful of companies.
    print("\n📊 Sample Estimates (10 patents, ~5000 chars each):\n")
    companies = ["Apple Inc.", "Microsoft Corporation", "Tesla Motors", "Google LLC"]
    grand_tokens = 0
    grand_cost = 0.0
    for name in companies:
        result = estimator.estimate_from_sample(name, patent_count=10, avg_patent_chars=5000)
        report_lines = [
            f" {name}:",
            f" Patents: {result.patent_count}",
            f" Prompt tokens: {result.prompt_tokens:,}",
            f" Est. completion tokens: {result.estimated_completion_tokens:,}",
            f" Total tokens: {result.total_tokens:,}",
            f" Est. cost: ${result.cost_estimate_usd:.4f}",
        ]
        print("\n".join(report_lines))
        print()
        grand_tokens += result.total_tokens
        grand_cost += result.cost_estimate_usd

    print("-" * 70)
    print(f" TOTAL for {len(companies)} companies:")
    print(f" Total tokens: {grand_tokens:,}")
    print(f" Total est. cost: ${grand_cost:.4f}")

    # Report 2: how the prompt size grows with the number of patents.
    print(f"\n{rule}\n📈 Token Scaling by Portfolio Size:\n{rule}\n")
    for size in (5, 10, 15, 20):
        result = estimator.estimate_from_sample("Sample Corp", patent_count=size)
        print(f" {size} patents: {result.prompt_tokens:,} prompt tokens, ${result.cost_estimate_usd:.4f}")

    # Report 3: a two-patent portfolio with hand-written, structured content.
    print(f"\n{rule}\n📝 Example with Real Patent Structure:\n{rule}\n")
    first_patent = {
        "patent_id": "US11234567",
        "content": """ABSTRACT: A method for machine learning optimization using gradient descent.
CLAIMS:
1. A computer-implemented method comprising:
receiving input data;
processing the input data through a neural network;
optimizing weights using backpropagation.
SUMMARY: This invention relates to improvements in neural network training efficiency."""
    }
    second_patent = {
        "patent_id": "US11234568",
        "content": """ABSTRACT: System for distributed computing in cloud environments.
CLAIMS:
1. A distributed system comprising:
a plurality of compute nodes;
a load balancer;
a message queue for task distribution.
SUMMARY: The present disclosure improves cloud computing resource allocation."""
    }
    result = estimator.estimate_portfolio("Tech Corp", [first_patent, second_patent])
    summary_lines = [
        f" Company: {result.company_name}",
        f" Patents analyzed: {result.patent_count}",
        f" Prompt tokens: {result.prompt_tokens:,}",
        f" Est. completion: {result.estimated_completion_tokens:,}",
        f" Total: {result.total_tokens:,}",
        f" Est. cost: ${result.cost_estimate_usd:.4f}",
    ]
    print("\n".join(summary_lines))
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()