#!/usr/bin/env python3
"""Estimate token usage per company portfolio for SPARC analysis."""

from dataclasses import dataclass
from typing import Dict, List, Optional

import tiktoken


@dataclass
class TokenEstimate:
    """Token usage estimate for a company portfolio."""

    # Company the estimate was computed for.
    company_name: str
    # Number of patents included in the portfolio prompt.
    patent_count: int
    # Tokens counted in all prompts sent for this company.
    prompt_tokens: int
    # Sum of the per-call output budgets (not a measured value).
    estimated_completion_tokens: int
    # prompt_tokens + estimated_completion_tokens.
    total_tokens: int
    # Input cost plus output cost, in US dollars.
    cost_estimate_usd: float


class TokenEstimator:
    """Estimate token usage for SPARC patent analysis.

    Prompts are rebuilt locally and tokenized with tiktoken; completion
    usage is taken to be the full per-call output budget, so the resulting
    cost assumes every call emits its maximum number of output tokens.
    """

    # Claude 3.5 Sonnet pricing via OpenRouter (per 1M tokens)
    INPUT_COST_PER_1M = 3.00  # $3.00 per 1M input tokens
    OUTPUT_COST_PER_1M = 15.00  # $15.00 per 1M output tokens

    # Estimated output tokens based on max_tokens settings
    SINGLE_PATENT_MAX_OUTPUT = 1024
    PORTFOLIO_MAX_OUTPUT = 2048

    def __init__(self):
        """Load the tokenizer used for every count made by this instance."""
        # Use cl100k_base encoding (closest to Claude's tokenizer). It is not
        # Claude's own tokenizer, so all counts are approximations.
        self.encoder = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Count tokens in a text string.

        Args:
            text: Text to tokenize.

        Returns:
            Number of tokens produced by ``self.encoder`` for ``text``.
        """
        return len(self.encoder.encode(text))

    def build_single_patent_prompt(self, patent_content: str, company_name: str) -> str:
        """Build prompt for single patent analysis (matches llm.py).

        Args:
            patent_content: Text of the patent, inserted verbatim.
            company_name: Company name, inserted into the opening sentence.

        Returns:
            The complete prompt string.
        """
        # NOTE(review): llm.py is not part of this file; confirm the wording
        # and the line breaks of this template still match it, since both
        # affect the token count.
        return f"""You are a patent analyst evaluating {company_name}'s innovation strategy.

Analyze the following patent content and provide insights on:
1. Innovation quality and novelty
2. Technical complexity and defensibility
3. Market potential and commercial viability
4. Strategic positioning relative to industry trends

Patent Content:
{patent_content}

Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""

    def build_portfolio_prompt(self, patents_data: List[Dict[str, str]], company_name: str) -> str:
        """Build prompt for portfolio analysis (matches llm.py).

        Args:
            patents_data: List of dicts with 'patent_id' and 'content' keys.
            company_name: Company name, inserted into the opening sentence.

        Returns:
            The complete prompt string, with one section per patent
            separated by ``---`` rules.

        Raises:
            KeyError: If a patent dict lacks 'patent_id' or 'content'.
        """
        # Patents are numbered from 1 in the order they were supplied.
        portfolio_summary = [
            f"Patent {idx} ({patent['patent_id']}):\n{patent['content']}"
            for idx, patent in enumerate(patents_data, 1)
        ]
        combined_content = "\n\n---\n\n".join(portfolio_summary)

        # NOTE(review): llm.py is not part of this file; confirm the wording
        # and the line breaks of this template still match it, since both
        # affect the token count.
        return f"""You are analyzing {company_name}'s patent portfolio to estimate their future performance and innovation trajectory.

You have {len(patents_data)} recent patents to analyze. Evaluate the portfolio holistically:

1. Innovation Trends: What technology areas are they focusing on?
2. Strategic Direction: What does this reveal about their business strategy?
3. Competitive Position: How defensible are these innovations?
4. Market Outlook: What market opportunities do these patents target?
5. Performance Forecast: Based on this innovation activity, what's your assessment of their likely performance?

Patent Portfolio:
{combined_content}

Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""

    def estimate_portfolio(
        self,
        company_name: str,
        patents_data: List[Dict[str, str]],
        include_individual_patents: bool = False,
    ) -> TokenEstimate:
        """Estimate tokens for a company portfolio analysis.

        Args:
            company_name: Name of the company
            patents_data: List of dicts with 'patent_id' and 'content' keys
            include_individual_patents: If True, also count individual patent
                analysis calls

        Returns:
            A TokenEstimate covering the portfolio call and, when requested,
            one additional call per patent.

        Raises:
            KeyError: If a patent dict lacks 'patent_id' or 'content'.
        """
        # Portfolio analysis tokens
        portfolio_prompt = self.build_portfolio_prompt(patents_data, company_name)
        prompt_tokens = self.count_tokens(portfolio_prompt)
        # The output budget stands in for the unknown real completion length.
        completion_tokens = self.PORTFOLIO_MAX_OUTPUT

        # Optionally add individual patent analysis
        if include_individual_patents:
            for patent in patents_data:
                single_prompt = self.build_single_patent_prompt(patent['content'], company_name)
                prompt_tokens += self.count_tokens(single_prompt)
                completion_tokens += self.SINGLE_PATENT_MAX_OUTPUT

        total_tokens = prompt_tokens + completion_tokens

        # Calculate cost (prices are quoted per one million tokens)
        input_cost = (prompt_tokens / 1_000_000) * self.INPUT_COST_PER_1M
        output_cost = (completion_tokens / 1_000_000) * self.OUTPUT_COST_PER_1M
        total_cost = input_cost + output_cost

        return TokenEstimate(
            company_name=company_name,
            patent_count=len(patents_data),
            prompt_tokens=prompt_tokens,
            estimated_completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            cost_estimate_usd=total_cost,
        )

    def estimate_from_sample(
        self,
        company_name: str,
        patent_count: int = 10,
        avg_patent_chars: int = 5000,
        include_individual_patents: bool = False,
    ) -> TokenEstimate:
        """Estimate tokens using sample/average patent sizes.

        Args:
            company_name: Name of the company
            patent_count: Number of patents (default 10, typical from SERP)
            avg_patent_chars: Average characters per minimized patent content
            include_individual_patents: If True, also count individual patent
                analysis calls (forwarded to estimate_portfolio)

        Returns:
            A TokenEstimate computed from synthetic patents of equal length.
        """
        # Generate sample patent data
        # NOTE(review): a run of one repeated character may tokenize very
        # differently from real patent prose of the same length; confirm the
        # resulting counts are representative before relying on them.
        sample_content = "A" * avg_patent_chars  # Placeholder content
        patents_data = [
            {"patent_id": f"US{10000000 + i}", "content": sample_content}
            for i in range(patent_count)
        ]

        return self.estimate_portfolio(
            company_name,
            patents_data,
            include_individual_patents=include_individual_patents,
        )


def main():
    """Run token estimation examples and print the results to stdout."""
    estimator = TokenEstimator()

    print("=" * 70)
    print("SPARC Token Usage Estimator")
    print("=" * 70)

    # Emoji in the banners are written as \N{...} escapes so the source file
    # stays pure ASCII and cannot be corrupted by a wrong file encoding.

    # Example 1: Estimate with sample data
    print("\n\N{BAR CHART} Sample Estimates (10 patents, ~5000 chars each):\n")

    companies = ["Apple Inc.", "Microsoft Corporation", "Tesla Motors", "Google LLC"]

    total_tokens = 0
    total_cost = 0.0

    for company in companies:
        estimate = estimator.estimate_from_sample(company, patent_count=10, avg_patent_chars=5000)
        print(f" {company}:")
        print(f" Patents: {estimate.patent_count}")
        print(f" Prompt tokens: {estimate.prompt_tokens:,}")
        print(f" Est. completion tokens: {estimate.estimated_completion_tokens:,}")
        print(f" Total tokens: {estimate.total_tokens:,}")
        print(f" Est. cost: ${estimate.cost_estimate_usd:.4f}")
        print()

        total_tokens += estimate.total_tokens
        total_cost += estimate.cost_estimate_usd

    print("-" * 70)
    print(f" TOTAL for {len(companies)} companies:")
    print(f" Total tokens: {total_tokens:,}")
    print(f" Total est. cost: ${total_cost:.4f}")

    # Example 2: Different portfolio sizes
    print("\n" + "=" * 70)
    print("\N{CHART WITH UPWARDS TREND} Token Scaling by Portfolio Size:")
    print("=" * 70 + "\n")

    for patent_count in [5, 10, 15, 20]:
        estimate = estimator.estimate_from_sample("Sample Corp", patent_count=patent_count)
        print(
            f" {patent_count} patents: {estimate.prompt_tokens:,} prompt tokens, "
            f"${estimate.cost_estimate_usd:.4f}"
        )

    # Example 3: With actual patent content (simulated)
    print("\n" + "=" * 70)
    print("\N{MEMO} Example with Real Patent Structure:")
    print("=" * 70 + "\n")

    sample_patents = [
        {
            "patent_id": "US11234567",
            "content": """ABSTRACT: A method for machine learning optimization using gradient descent.
CLAIMS: 1. A computer-implemented method comprising: receiving input data; processing the input data through a neural network; optimizing weights using backpropagation.
SUMMARY: This invention relates to improvements in neural network training efficiency.""",
        },
        {
            "patent_id": "US11234568",
            "content": """ABSTRACT: System for distributed computing in cloud environments.
CLAIMS: 1. A distributed system comprising: a plurality of compute nodes; a load balancer; a message queue for task distribution.
SUMMARY: The present disclosure improves cloud computing resource allocation.""",
        },
    ]

    estimate = estimator.estimate_portfolio("Tech Corp", sample_patents)
    print(f" Company: {estimate.company_name}")
    print(f" Patents analyzed: {estimate.patent_count}")
    print(f" Prompt tokens: {estimate.prompt_tokens:,}")
    print(f" Est. completion: {estimate.estimated_completion_tokens:,}")
    print(f" Total: {estimate.total_tokens:,}")
    print(f" Est. cost: ${estimate.cost_estimate_usd:.4f}")


if __name__ == "__main__":
    main()