feat: add token usage estimation utility

Add script to estimate token usage and costs for patent analysis. Uses tiktoken with cl100k_base encoding to approximate Claude's tokenizer. Includes cost calculations based on OpenRouter pricing and supports both sample-based and actual patent content estimation. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-13 15:37:31 -04:00
parent 4e419166e8
commit 5141d9dd47
1 changed files with 227 additions and 0 deletions
@@ -0,0 +1,227 @@
 #!/usr/bin/env python3
 """Estimate token usage per company portfolio for SPARC analysis."""
 import tiktoken
 from typing import Dict, List, Optional
 from dataclasses import dataclass
@dataclass
class TokenEstimate:
    """Token usage estimate for a company portfolio.

    Counts are aggregated over every LLM call included in the estimate
    (the portfolio call, plus per-patent calls when those were requested).
    """

    company_name: str
    # Number of patents the estimate was computed over.
    patent_count: int
    # Input tokens summed over all counted prompts.
    prompt_tokens: int
    # Output-token budget taken from the max_tokens ceilings, not measured.
    estimated_completion_tokens: int
    # prompt_tokens + estimated_completion_tokens.
    total_tokens: int
    # Combined input and output cost in US dollars.
    cost_estimate_usd: float


class TokenEstimator:
    """Estimate token usage and cost for SPARC patent analysis.

    Prompts are rebuilt locally and counted with tiktoken's ``cl100k_base``
    encoding, which is used here as an approximation of Claude's tokenizer.
    Every figure produced is therefore an estimate, not a billed amount.
    """

    # Claude 3.5 Sonnet pricing via OpenRouter (USD per 1M tokens)
    INPUT_COST_PER_1M = 3.00   # $3.00 per 1M input tokens
    OUTPUT_COST_PER_1M = 15.00  # $15.00 per 1M output tokens

    # Completion estimates assume each call spends its full max_tokens budget
    SINGLE_PATENT_MAX_OUTPUT = 1024
    PORTFOLIO_MAX_OUTPUT = 2048

    # Patent-like filler used by estimate_from_sample. A run of one repeated
    # character is not representative of how prose tokenizes, so sample
    # content is tiled from this sentence instead.
    _SAMPLE_TEXT = (
        "A computer-implemented method comprising receiving input data, "
        "processing the input data through a neural network, and optimizing "
        "weights using backpropagation. "
    )

    def __init__(self):
        """Load the tiktoken encoding used for all token counts."""
        # Use cl100k_base encoding (closest to Claude's tokenizer)
        self.encoder = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to."""
        return len(self.encoder.encode(text))

    def build_single_patent_prompt(self, patent_content: str, company_name: str) -> str:
        """Build the prompt for a single-patent analysis call.

        Meant to mirror the prompt in llm.py; keep the two in sync.

        Args:
            patent_content: Text of the patent to embed in the prompt
            company_name: Company the patent belongs to
        """
        return f"""You are a patent analyst evaluating {company_name}'s innovation strategy.
Analyze the following patent content and provide insights on:
1. Innovation quality and novelty
2. Technical complexity and defensibility
3. Market potential and commercial viability
4. Strategic positioning relative to industry trends
Patent Content:
{patent_content}
Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""

    def build_portfolio_prompt(self, patents_data: List[Dict[str, str]], company_name: str) -> str:
        """Build the prompt for a whole-portfolio analysis call.

        Meant to mirror the prompt in llm.py; keep the two in sync.

        Args:
            patents_data: List of dicts with 'patent_id' and 'content' keys
            company_name: Company the portfolio belongs to

        Raises:
            KeyError: If a patent dict lacks 'patent_id' or 'content'.
        """
        # Patents are numbered from 1 and separated by a '---' rule.
        combined_content = "\n\n---\n\n".join(
            f"Patent {idx} ({patent['patent_id']}):\n{patent['content']}"
            for idx, patent in enumerate(patents_data, 1)
        )
        return f"""You are analyzing {company_name}'s patent portfolio to estimate their future performance and innovation trajectory.
You have {len(patents_data)} recent patents to analyze. Evaluate the portfolio holistically:
1. Innovation Trends: What technology areas are they focusing on?
2. Strategic Direction: What does this reveal about their business strategy?
3. Competitive Position: How defensible are these innovations?
4. Market Outlook: What market opportunities do these patents target?
5. Performance Forecast: Based on this innovation activity, what's your assessment of their likely performance?
Patent Portfolio:
{combined_content}
Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""

    def estimate_portfolio(
        self,
        company_name: str,
        patents_data: List[Dict[str, str]],
        include_individual_patents: bool = False
    ) -> TokenEstimate:
        """Estimate tokens and cost for a company portfolio analysis.

        Args:
            company_name: Name of the company
            patents_data: List of dicts with 'patent_id' and 'content' keys
            include_individual_patents: If True, also count one single-patent
                analysis call per patent on top of the portfolio call

        Returns:
            A TokenEstimate whose completion figure is the sum of the
            max_tokens ceilings of every counted call.
        """
        # The portfolio call is always counted.
        prompt_tokens = self.count_tokens(
            self.build_portfolio_prompt(patents_data, company_name)
        )
        completion_tokens = self.PORTFOLIO_MAX_OUTPUT

        # Optionally add one single-patent call per patent.
        if include_individual_patents:
            prompt_tokens += sum(
                self.count_tokens(
                    self.build_single_patent_prompt(patent['content'], company_name)
                )
                for patent in patents_data
            )
            completion_tokens += self.SINGLE_PATENT_MAX_OUTPUT * len(patents_data)

        # Input and output tokens are priced separately.
        input_cost = (prompt_tokens / 1_000_000) * self.INPUT_COST_PER_1M
        output_cost = (completion_tokens / 1_000_000) * self.OUTPUT_COST_PER_1M

        return TokenEstimate(
            company_name=company_name,
            patent_count=len(patents_data),
            prompt_tokens=prompt_tokens,
            estimated_completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            cost_estimate_usd=input_cost + output_cost
        )

    def estimate_from_sample(
        self,
        company_name: str,
        patent_count: int = 10,
        avg_patent_chars: int = 5000,
        include_individual_patents: bool = False
    ) -> TokenEstimate:
        """Estimate tokens using synthetic patents of an average size.

        Args:
            company_name: Name of the company
            patent_count: Number of patents (default 10, typical from SERP)
            avg_patent_chars: Average characters per minimized patent content
            include_individual_patents: Forwarded to estimate_portfolio; if
                True, per-patent analysis calls are counted as well

        Raises:
            ValueError: If patent_count or avg_patent_chars is negative.
        """
        if patent_count < 0 or avg_patent_chars < 0:
            raise ValueError("patent_count and avg_patent_chars must be non-negative")

        # Tile the filler sentence and cut it to exactly the requested length.
        repeats = avg_patent_chars // len(self._SAMPLE_TEXT) + 1
        sample_content = (self._SAMPLE_TEXT * repeats)[:avg_patent_chars]

        patents_data = [
            {"patent_id": f"US{10000000 + i}", "content": sample_content}
            for i in range(patent_count)
        ]
        return self.estimate_portfolio(
            company_name,
            patents_data,
            include_individual_patents=include_individual_patents
        )
 def main():
     """Print three demonstration estimates to stdout.

     The demos are: a fixed-size sample estimate for several companies with
     a grand total, a scaling table over portfolio sizes, and an estimate
     for two short hand-written patents.
     """
     rule = "=" * 70
     estimator = TokenEstimator()
     print(rule, "SPARC Token Usage Estimator", rule, sep="\n")

     # Demo 1: identical synthetic portfolios for a handful of companies.
     print("\n📊 Sample Estimates (10 patents, ~5000 chars each):\n")
     companies = ["Apple Inc.", "Microsoft Corporation", "Tesla Motors", "Google LLC"]
     per_company = []
     for company in companies:
         result = estimator.estimate_from_sample(company, patent_count=10, avg_patent_chars=5000)
         per_company.append(result)
         # The trailing empty item yields the blank line between companies.
         print(
             f"  {company}:",
             f"    Patents: {result.patent_count}",
             f"    Prompt tokens: {result.prompt_tokens:,}",
             f"    Est. completion tokens: {result.estimated_completion_tokens:,}",
             f"    Total tokens: {result.total_tokens:,}",
             f"    Est. cost: ${result.cost_estimate_usd:.4f}",
             "",
             sep="\n",
         )
     grand_tokens = sum(item.total_tokens for item in per_company)
     grand_cost = sum((item.cost_estimate_usd for item in per_company), 0.0)
     print(
         "-" * 70,
         f"  TOTAL for {len(companies)} companies:",
         f"    Total tokens: {grand_tokens:,}",
         f"    Total est. cost: ${grand_cost:.4f}",
         sep="\n",
     )

     # Demo 2: how the estimate grows with the number of patents.
     print(f"\n{rule}", "📈 Token Scaling by Portfolio Size:", f"{rule}\n", sep="\n")
     for size in (5, 10, 15, 20):
         scaled = estimator.estimate_from_sample("Sample Corp", patent_count=size)
         print(f"  {size} patents: {scaled.prompt_tokens:,} prompt tokens, ${scaled.cost_estimate_usd:.4f}")

     # Demo 3: two short patents laid out as abstract / claims / summary.
     print(f"\n{rule}", "📝 Example with Real Patent Structure:", f"{rule}\n", sep="\n")
     ml_patent = """ABSTRACT: A method for machine learning optimization using gradient descent.
 CLAIMS:
 1. A computer-implemented method comprising:
   receiving input data;
   processing the input data through a neural network;
   optimizing weights using backpropagation.
 SUMMARY: This invention relates to improvements in neural network training efficiency."""
     cloud_patent = """ABSTRACT: System for distributed computing in cloud environments.
 CLAIMS:
 1. A distributed system comprising:
   a plurality of compute nodes;
   a load balancer;
   a message queue for task distribution.
 SUMMARY: The present disclosure improves cloud computing resource allocation."""
     sample_patents = [
         {"patent_id": "US11234567", "content": ml_patent},
         {"patent_id": "US11234568", "content": cloud_patent},
     ]
     detail = estimator.estimate_portfolio("Tech Corp", sample_patents)
     print(
         f"  Company: {detail.company_name}",
         f"  Patents analyzed: {detail.patent_count}",
         f"  Prompt tokens: {detail.prompt_tokens:,}",
         f"  Est. completion: {detail.estimated_completion_tokens:,}",
         f"  Total: {detail.total_tokens:,}",
         f"  Est. cost: ${detail.cost_estimate_usd:.4f}",
         sep="\n",
     )
 # Run the demo only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()