#!/usr/bin/env python3
"""Estimate token usage per company portfolio for SPARC analysis.

Token counts are produced with tiktoken's ``cl100k_base`` encoding, which is
used here only as an approximation of Claude's tokenizer, so every number
this script prints is an estimate.  Costs are derived from the
per-million-token prices declared on ``TokenEstimator``.  Estimates can be
computed from actual patent content or from synthetic placeholder patents of
a chosen size.
"""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional

try:
    import tiktoken
except ImportError:  # Reported at construction time, with a clearer message.
    tiktoken = None


@dataclass
class TokenEstimate:
    """Token usage estimate for a company portfolio."""

    # Company the estimate was computed for.
    company_name: str
    # Number of patents included in the portfolio prompt.
    patent_count: int
    # Tokens counted across every prompt that would be sent.
    prompt_tokens: int
    # Assumed completion size (the output ceiling of each call).
    estimated_completion_tokens: int
    # prompt_tokens + estimated_completion_tokens.
    total_tokens: int
    # Input cost plus output cost, in US dollars.
    cost_estimate_usd: float


class TokenEstimator:
    """Estimate token usage for SPARC patent analysis."""

    # Claude 3.5 Sonnet pricing via OpenRouter (USD per 1M tokens)
    INPUT_COST_PER_1M = 3.00  # $3.00 per 1M input tokens
    OUTPUT_COST_PER_1M = 15.00  # $15.00 per 1M output tokens

    # Estimated output tokens based on max_tokens settings.  Each call is
    # assumed to use its whole output budget.
    SINGLE_PATENT_MAX_OUTPUT = 1024
    PORTFOLIO_MAX_OUTPUT = 2048

    # tiktoken encoding used when no encoder is supplied.
    ENCODING_NAME = "cl100k_base"

    # Patent-like prose used to build placeholder content.  A run of one
    # repeated character is not tokenized the way real text is, so it makes
    # a poor stand-in when the goal is a realistic token count.
    _SAMPLE_TEXT = (
        "A computer-implemented method comprising: receiving input data at "
        "a processor; transforming the input data using a trained model to "
        "produce an output; and transmitting the output to a client device "
        "over a network. "
    )

    def __init__(self, encoder: Optional[Any] = None):
        """Create an estimator.

        Args:
            encoder: Optional tokenizer exposing a tiktoken-compatible
                ``encode(text, disallowed_special=...)`` method.  When
                omitted, tiktoken's ``cl100k_base`` encoding is loaded.

        Raises:
            ImportError: If no encoder is given and tiktoken is not installed.
        """
        if encoder is None:
            if tiktoken is None:
                raise ImportError(
                    "tiktoken is required to build the default encoder; "
                    "install tiktoken or pass an explicit encoder"
                )
            encoder = tiktoken.get_encoding(self.ENCODING_NAME)
        self.encoder = encoder

    def count_tokens(self, text: str) -> int:
        """Count tokens in a text string.

        Special-token literals that happen to appear in the text (for
        example ``<|endoftext|>``) are counted as ordinary text.  With
        tiktoken's default settings such input raises ``ValueError``.
        """
        return len(self.encoder.encode(text, disallowed_special=()))

    def build_single_patent_prompt(self, patent_content: str, company_name: str) -> str:
        """Build the prompt for a single-patent analysis.

        Intended to mirror the prompt built in llm.py (not visible from this
        file), so the token count reflects what would actually be sent.

        Args:
            patent_content: Text of the patent to embed in the prompt.
            company_name: Company name interpolated into the instructions.

        Returns:
            The full prompt text.
        """
        return f"""You are a patent analyst evaluating {company_name}'s innovation strategy.

Analyze the following patent content and provide insights on:
1. Innovation quality and novelty
2. Technical complexity and defensibility
3. Market potential and commercial viability
4. Strategic positioning relative to industry trends

Patent Content:
{patent_content}

Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""

    def build_portfolio_prompt(self, patents_data: List[Dict[str, str]], company_name: str) -> str:
        """Build the prompt for a whole-portfolio analysis.

        Intended to mirror the prompt built in llm.py (not visible from this
        file).

        Args:
            patents_data: List of dicts with 'patent_id' and 'content' keys.
            company_name: Company name interpolated into the instructions.

        Returns:
            The full prompt text, with the patents numbered from 1 and
            separated by ``---`` rules.

        Raises:
            KeyError: If a patent dict lacks 'patent_id' or 'content'.
        """
        portfolio_summary = [
            f"Patent {idx} ({patent['patent_id']}):\n{patent['content']}"
            for idx, patent in enumerate(patents_data, 1)
        ]
        combined_content = "\n\n---\n\n".join(portfolio_summary)

        return f"""You are analyzing {company_name}'s patent portfolio to estimate their future performance and innovation trajectory.

You have {len(patents_data)} recent patents to analyze. Evaluate the portfolio holistically:

1. Innovation Trends: What technology areas are they focusing on?
2. Strategic Direction: What does this reveal about their business strategy?
3. Competitive Position: How defensible are these innovations?
4. Market Outlook: What market opportunities do these patents target?
5. Performance Forecast: Based on this innovation activity, what's your assessment of their likely performance?

Patent Portfolio:
{combined_content}

Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""

    def estimate_portfolio(
        self,
        company_name: str,
        patents_data: List[Dict[str, str]],
        include_individual_patents: bool = False
    ) -> TokenEstimate:
        """Estimate tokens for a company portfolio analysis.

        Args:
            company_name: Name of the company
            patents_data: List of dicts with 'patent_id' and 'content' keys
            include_individual_patents: If True, also count individual patent analysis calls

        Returns:
            A TokenEstimate whose completion figure assumes every call uses
            its full output budget.
        """
        # Portfolio analysis tokens
        prompt_tokens = self.count_tokens(
            self.build_portfolio_prompt(patents_data, company_name)
        )
        completion_tokens = self.PORTFOLIO_MAX_OUTPUT

        # Optionally add one extra call per patent
        if include_individual_patents:
            for patent in patents_data:
                single_prompt = self.build_single_patent_prompt(
                    patent["content"], company_name
                )
                prompt_tokens += self.count_tokens(single_prompt)
            completion_tokens += self.SINGLE_PATENT_MAX_OUTPUT * len(patents_data)

        # Input and output tokens are billed at different rates
        input_cost = (prompt_tokens / 1_000_000) * self.INPUT_COST_PER_1M
        output_cost = (completion_tokens / 1_000_000) * self.OUTPUT_COST_PER_1M

        return TokenEstimate(
            company_name=company_name,
            patent_count=len(patents_data),
            prompt_tokens=prompt_tokens,
            estimated_completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            cost_estimate_usd=input_cost + output_cost,
        )

    @classmethod
    def _sample_patent_content(cls, length: int) -> str:
        """Return exactly ``length`` characters of patent-like prose ('' if length <= 0)."""
        if length <= 0:
            return ""
        repeats = -(-length // len(cls._SAMPLE_TEXT))  # ceiling division
        return (cls._SAMPLE_TEXT * repeats)[:length]

    def estimate_from_sample(
        self,
        company_name: str,
        patent_count: int = 10,
        avg_patent_chars: int = 5000,
        include_individual_patents: bool = False
    ) -> TokenEstimate:
        """Estimate tokens using sample/average patent sizes.

        Args:
            company_name: Name of the company
            patent_count: Number of patents (default 10, typical from SERP)
            avg_patent_chars: Average characters per minimized patent content
            include_individual_patents: If True, also count individual patent analysis calls

        Returns:
            A TokenEstimate computed over synthetic placeholder patents.
        """
        # Every placeholder patent shares the same text; only the id differs
        sample_content = self._sample_patent_content(avg_patent_chars)
        patents_data = [
            {"patent_id": f"US{10000000 + i}", "content": sample_content}
            for i in range(patent_count)
        ]

        return self.estimate_portfolio(
            company_name, patents_data, include_individual_patents
        )


def main():
    """Run token estimation examples."""
    estimator = TokenEstimator()

    print("=" * 70)
    print("SPARC Token Usage Estimator")
    print("=" * 70)

    # Example 1: Estimate with sample data
    print("\n📊 Sample Estimates (10 patents, ~5000 chars each):\n")

    companies = ["Apple Inc.", "Microsoft Corporation", "Tesla Motors", "Google LLC"]

    total_tokens = 0
    total_cost = 0.0

    for company in companies:
        estimate = estimator.estimate_from_sample(company, patent_count=10, avg_patent_chars=5000)
        print(f"  {company}:")
        print(f"    Patents: {estimate.patent_count}")
        print(f"    Prompt tokens: {estimate.prompt_tokens:,}")
        print(f"    Est. completion tokens: {estimate.estimated_completion_tokens:,}")
        print(f"    Total tokens: {estimate.total_tokens:,}")
        print(f"    Est. cost: ${estimate.cost_estimate_usd:.4f}")
        print()

        total_tokens += estimate.total_tokens
        total_cost += estimate.cost_estimate_usd

    print("-" * 70)
    print(f"  TOTAL for {len(companies)} companies:")
    print(f"    Total tokens: {total_tokens:,}")
    print(f"    Total est. cost: ${total_cost:.4f}")

    # Example 2: Different portfolio sizes
    print("\n" + "=" * 70)
    print("📈 Token Scaling by Portfolio Size:")
    print("=" * 70 + "\n")

    for patent_count in [5, 10, 15, 20]:
        estimate = estimator.estimate_from_sample("Sample Corp", patent_count=patent_count)
        print(f"  {patent_count} patents: {estimate.prompt_tokens:,} prompt tokens, ${estimate.cost_estimate_usd:.4f}")

    # Example 3: With actual patent content (simulated)
    print("\n" + "=" * 70)
    print("📝 Example with Real Patent Structure:")
    print("=" * 70 + "\n")

    sample_patents = [
        {
            "patent_id": "US11234567",
            "content": """ABSTRACT: A method for machine learning optimization using gradient descent.

CLAIMS:
1. A computer-implemented method comprising:
   receiving input data;
   processing the input data through a neural network;
   optimizing weights using backpropagation.

SUMMARY: This invention relates to improvements in neural network training efficiency."""
        },
        {
            "patent_id": "US11234568",
            "content": """ABSTRACT: System for distributed computing in cloud environments.

CLAIMS:
1. A distributed system comprising:
   a plurality of compute nodes;
   a load balancer;
   a message queue for task distribution.

SUMMARY: The present disclosure improves cloud computing resource allocation."""
        }
    ]

    estimate = estimator.estimate_portfolio("Tech Corp", sample_patents)
    print(f"  Company: {estimate.company_name}")
    print(f"  Patents analyzed: {estimate.patent_count}")
    print(f"  Prompt tokens: {estimate.prompt_tokens:,}")
    print(f"  Est. completion: {estimate.estimated_completion_tokens:,}")
    print(f"  Total: {estimate.total_tokens:,}")
    print(f"  Est. cost: ${estimate.cost_estimate_usd:.4f}")


if __name__ == "__main__":
    main()