forked from 0xWheatyz/SPARC
feat: add token usage estimation utility
Add script to estimate token usage and costs for patent analysis. Uses tiktoken with cl100k_base encoding to approximate Claude's tokenizer. Includes cost calculations based on OpenRouter pricing and supports both sample-based and actual patent content estimation. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,227 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Estimate token usage per company portfolio for SPARC analysis."""
|
||||||
|
|
||||||
|
import tiktoken
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TokenEstimate:
    """Result record for one company's projected token spend.

    Instances are plain value objects: the fields are stored exactly as
    given, with no validation or derived computation performed here.
    """

    # Company the estimate was produced for.
    company_name: str
    # Number of patents that were included in the estimate.
    patent_count: int
    # Tokens counted in the prompt text (input side).
    prompt_tokens: int
    # Tokens assumed for the model's reply (output side).
    estimated_completion_tokens: int
    # Input and output tokens combined, as supplied by the producer.
    total_tokens: int
    # Projected cost in US dollars.
    cost_estimate_usd: float
|
||||||
|
|
||||||
|
|
||||||
|
class TokenEstimator:
|
||||||
|
"""Estimate token usage for SPARC patent analysis."""
|
||||||
|
|
||||||
|
# Claude 3.5 Sonnet pricing via OpenRouter (per 1M tokens)
|
||||||
|
INPUT_COST_PER_1M = 3.00 # $3.00 per 1M input tokens
|
||||||
|
OUTPUT_COST_PER_1M = 15.00 # $15.00 per 1M output tokens
|
||||||
|
|
||||||
|
# Estimated output tokens based on max_tokens settings
|
||||||
|
SINGLE_PATENT_MAX_OUTPUT = 1024
|
||||||
|
PORTFOLIO_MAX_OUTPUT = 2048
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
# Use cl100k_base encoding (closest to Claude's tokenizer)
|
||||||
|
self.encoder = tiktoken.get_encoding("cl100k_base")
|
||||||
|
|
||||||
|
def count_tokens(self, text: str) -> int:
|
||||||
|
"""Count tokens in a text string."""
|
||||||
|
return len(self.encoder.encode(text))
|
||||||
|
|
||||||
|
def build_single_patent_prompt(self, patent_content: str, company_name: str) -> str:
|
||||||
|
"""Build prompt for single patent analysis (matches llm.py)."""
|
||||||
|
return f"""You are a patent analyst evaluating {company_name}'s innovation strategy.
|
||||||
|
|
||||||
|
Analyze the following patent content and provide insights on:
|
||||||
|
1. Innovation quality and novelty
|
||||||
|
2. Technical complexity and defensibility
|
||||||
|
3. Market potential and commercial viability
|
||||||
|
4. Strategic positioning relative to industry trends
|
||||||
|
|
||||||
|
Patent Content:
|
||||||
|
{patent_content}
|
||||||
|
|
||||||
|
Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""
|
||||||
|
|
||||||
|
def build_portfolio_prompt(self, patents_data: List[Dict[str, str]], company_name: str) -> str:
|
||||||
|
"""Build prompt for portfolio analysis (matches llm.py)."""
|
||||||
|
portfolio_summary = []
|
||||||
|
for idx, patent in enumerate(patents_data, 1):
|
||||||
|
portfolio_summary.append(
|
||||||
|
f"Patent {idx} ({patent['patent_id']}):\n{patent['content']}"
|
||||||
|
)
|
||||||
|
combined_content = "\n\n---\n\n".join(portfolio_summary)
|
||||||
|
|
||||||
|
return f"""You are analyzing {company_name}'s patent portfolio to estimate their future performance and innovation trajectory.
|
||||||
|
|
||||||
|
You have {len(patents_data)} recent patents to analyze. Evaluate the portfolio holistically:
|
||||||
|
|
||||||
|
1. Innovation Trends: What technology areas are they focusing on?
|
||||||
|
2. Strategic Direction: What does this reveal about their business strategy?
|
||||||
|
3. Competitive Position: How defensible are these innovations?
|
||||||
|
4. Market Outlook: What market opportunities do these patents target?
|
||||||
|
5. Performance Forecast: Based on this innovation activity, what's your assessment of their likely performance?
|
||||||
|
|
||||||
|
Patent Portfolio:
|
||||||
|
{combined_content}
|
||||||
|
|
||||||
|
Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""
|
||||||
|
|
||||||
|
def estimate_portfolio(
|
||||||
|
self,
|
||||||
|
company_name: str,
|
||||||
|
patents_data: List[Dict[str, str]],
|
||||||
|
include_individual_patents: bool = False
|
||||||
|
) -> TokenEstimate:
|
||||||
|
"""Estimate tokens for a company portfolio analysis.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
company_name: Name of the company
|
||||||
|
patents_data: List of dicts with 'patent_id' and 'content' keys
|
||||||
|
include_individual_patents: If True, also count individual patent analysis calls
|
||||||
|
"""
|
||||||
|
# Portfolio analysis tokens
|
||||||
|
portfolio_prompt = self.build_portfolio_prompt(patents_data, company_name)
|
||||||
|
prompt_tokens = self.count_tokens(portfolio_prompt)
|
||||||
|
completion_tokens = self.PORTFOLIO_MAX_OUTPUT
|
||||||
|
|
||||||
|
# Optionally add individual patent analysis
|
||||||
|
if include_individual_patents:
|
||||||
|
for patent in patents_data:
|
||||||
|
single_prompt = self.build_single_patent_prompt(patent['content'], company_name)
|
||||||
|
prompt_tokens += self.count_tokens(single_prompt)
|
||||||
|
completion_tokens += self.SINGLE_PATENT_MAX_OUTPUT
|
||||||
|
|
||||||
|
total_tokens = prompt_tokens + completion_tokens
|
||||||
|
|
||||||
|
# Calculate cost
|
||||||
|
input_cost = (prompt_tokens / 1_000_000) * self.INPUT_COST_PER_1M
|
||||||
|
output_cost = (completion_tokens / 1_000_000) * self.OUTPUT_COST_PER_1M
|
||||||
|
total_cost = input_cost + output_cost
|
||||||
|
|
||||||
|
return TokenEstimate(
|
||||||
|
company_name=company_name,
|
||||||
|
patent_count=len(patents_data),
|
||||||
|
prompt_tokens=prompt_tokens,
|
||||||
|
estimated_completion_tokens=completion_tokens,
|
||||||
|
total_tokens=total_tokens,
|
||||||
|
cost_estimate_usd=total_cost
|
||||||
|
)
|
||||||
|
|
||||||
|
def estimate_from_sample(
|
||||||
|
self,
|
||||||
|
company_name: str,
|
||||||
|
patent_count: int = 10,
|
||||||
|
avg_patent_chars: int = 5000
|
||||||
|
) -> TokenEstimate:
|
||||||
|
"""Estimate tokens using sample/average patent sizes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
company_name: Name of the company
|
||||||
|
patent_count: Number of patents (default 10, typical from SERP)
|
||||||
|
avg_patent_chars: Average characters per minimized patent content
|
||||||
|
"""
|
||||||
|
# Generate sample patent data
|
||||||
|
sample_content = "A" * avg_patent_chars # Placeholder content
|
||||||
|
patents_data = [
|
||||||
|
{"patent_id": f"US{10000000 + i}", "content": sample_content}
|
||||||
|
for i in range(patent_count)
|
||||||
|
]
|
||||||
|
|
||||||
|
return self.estimate_portfolio(company_name, patents_data)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print three worked examples of token and cost estimation.

    The examples are: fixed-size sample estimates for a handful of companies
    (with a grand total), prompt-token scaling across portfolio sizes, and an
    estimate for two short hand-written patents.
    """
    estimator = TokenEstimator()
    rule = "=" * 70

    def section(title):
        # Section header: blank line, rule, title, rule, blank line.
        print("\n" + rule)
        print(title)
        print(rule + "\n")

    print(rule)
    print("SPARC Token Usage Estimator")
    print(rule)

    # Example 1: Estimate with sample data
    print("\n📊 Sample Estimates (10 patents, ~5000 chars each):\n")

    companies = ["Apple Inc.", "Microsoft Corporation", "Tesla Motors", "Google LLC"]
    grand_tokens = 0
    grand_cost = 0.0

    for name in companies:
        result = estimator.estimate_from_sample(name, patent_count=10, avg_patent_chars=5000)
        report = [
            f" {name}:",
            f" Patents: {result.patent_count}",
            f" Prompt tokens: {result.prompt_tokens:,}",
            f" Est. completion tokens: {result.estimated_completion_tokens:,}",
            f" Total tokens: {result.total_tokens:,}",
            f" Est. cost: ${result.cost_estimate_usd:.4f}",
        ]
        for row in report:
            print(row)
        print()

        grand_tokens += result.total_tokens
        grand_cost += result.cost_estimate_usd

    print("-" * 70)
    print(f" TOTAL for {len(companies)} companies:")
    print(f" Total tokens: {grand_tokens:,}")
    print(f" Total est. cost: ${grand_cost:.4f}")

    # Example 2: Different portfolio sizes
    section("📈 Token Scaling by Portfolio Size:")

    for size in (5, 10, 15, 20):
        result = estimator.estimate_from_sample("Sample Corp", patent_count=size)
        print(f" {size} patents: {result.prompt_tokens:,} prompt tokens, ${result.cost_estimate_usd:.4f}")

    # Example 3: With actual patent content (simulated)
    section("📝 Example with Real Patent Structure:")

    ml_patent_text = (
        "ABSTRACT: A method for machine learning optimization using gradient descent.\n"
        "\n"
        "CLAIMS:\n"
        "1. A computer-implemented method comprising:\n"
        "receiving input data;\n"
        "processing the input data through a neural network;\n"
        "optimizing weights using backpropagation.\n"
        "\n"
        "SUMMARY: This invention relates to improvements in neural network training efficiency."
    )
    cloud_patent_text = (
        "ABSTRACT: System for distributed computing in cloud environments.\n"
        "\n"
        "CLAIMS:\n"
        "1. A distributed system comprising:\n"
        "a plurality of compute nodes;\n"
        "a load balancer;\n"
        "a message queue for task distribution.\n"
        "\n"
        "SUMMARY: The present disclosure improves cloud computing resource allocation."
    )
    sample_patents = [
        {"patent_id": "US11234567", "content": ml_patent_text},
        {"patent_id": "US11234568", "content": cloud_patent_text},
    ]

    result = estimator.estimate_portfolio("Tech Corp", sample_patents)
    for row in (
        f" Company: {result.company_name}",
        f" Patents analyzed: {result.patent_count}",
        f" Prompt tokens: {result.prompt_tokens:,}",
        f" Est. completion: {result.estimated_completion_tokens:,}",
        f" Total: {result.total_tokens:,}",
        f" Est. cost: ${result.cost_estimate_usd:.4f}",
    ):
        print(row)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user