# SPARC/scripts/estimate_tokens.py

#!/usr/bin/env python3
"""Estimate token usage per company portfolio for SPARC analysis."""

import tiktoken
from typing import Dict, List, Optional
from dataclasses import dataclass


@dataclass
class TokenEstimate:
    """Projected token spend and cost for analysing one company's patents.

    Attributes:
        company_name: Company the estimate was produced for.
        patent_count: Number of patents included in the estimate.
        prompt_tokens: Tokens counted across all prompts sent as input.
        estimated_completion_tokens: Output tokens budgeted for the responses.
        total_tokens: Sum of prompt and estimated completion tokens.
        cost_estimate_usd: Estimated cost in US dollars for those tokens.
    """

    # Identification of the portfolio being estimated.
    company_name: str
    patent_count: int

    # Token accounting (input side, output side, and their sum).
    prompt_tokens: int
    estimated_completion_tokens: int
    total_tokens: int

    # Monetary estimate derived from the token counts.
    cost_estimate_usd: float


class TokenEstimator:
    """Estimate token usage and API cost for SPARC patent analysis.

    Prompts are rebuilt locally from the analysis templates and measured with
    a tiktoken encoding, so every figure is an approximation: the encoding is
    a stand-in for the model's own tokenizer, and completion sizes are taken
    from the configured output caps rather than from real responses.
    """

    # Claude 3.5 Sonnet pricing via OpenRouter (per 1M tokens)
    INPUT_COST_PER_1M = 3.00   # $3.00 per 1M input tokens
    OUTPUT_COST_PER_1M = 15.00  # $15.00 per 1M output tokens

    # Estimated output tokens based on max_tokens settings
    SINGLE_PATENT_MAX_OUTPUT = 1024
    PORTFOLIO_MAX_OUTPUT = 2048

    # Patent-style filler used by estimate_from_sample(). Prose is used
    # instead of a single repeated letter because a run of one character is
    # merged by BPE into far fewer tokens than real text of the same length,
    # which would understate the estimate.
    _SAMPLE_PASSAGE = (
        "A computer-implemented method comprising receiving input data from "
        "a plurality of sensors, processing the input data through a trained "
        "model to generate an output signal, and transmitting the output "
        "signal to a remote device over a network, wherein the model is "
        "updated based on feedback received from the remote device. "
    )

    def __init__(self):
        # Use cl100k_base encoding (closest to Claude's tokenizer)
        self.encoder = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Count tokens in a text string.

        Special-token markers such as ``<|endoftext|>`` are encoded as
        ordinary text. tiktoken's default (``disallowed_special="all"``)
        raises ``ValueError`` when the input contains one, which would abort
        an estimate on arbitrary patent text.

        Args:
            text: The string to measure.

        Returns:
            Number of tokens produced by ``self.encoder`` for ``text``.
        """
        return len(self.encoder.encode(text, disallowed_special=()))

    def build_single_patent_prompt(self, patent_content: str, company_name: str) -> str:
        """Build prompt for single patent analysis (matches llm.py).

        Args:
            patent_content: Text of the patent to embed in the prompt.
            company_name: Company name interpolated into the instructions.

        Returns:
            The complete prompt string.
        """
        return f"""You are a patent analyst evaluating {company_name}'s innovation strategy.

Analyze the following patent content and provide insights on:
1. Innovation quality and novelty
2. Technical complexity and defensibility
3. Market potential and commercial viability
4. Strategic positioning relative to industry trends

Patent Content:
{patent_content}

Provide a concise analysis (2-3 paragraphs) focusing on what this patent reveals about the company's technical direction and competitive advantage."""

    def build_portfolio_prompt(self, patents_data: List[Dict[str, str]], company_name: str) -> str:
        """Build prompt for portfolio analysis (matches llm.py).

        Args:
            patents_data: List of dicts with 'patent_id' and 'content' keys.
            company_name: Company name interpolated into the instructions.

        Returns:
            The complete prompt string, with patents numbered from 1 and
            separated by a ``---`` divider.

        Raises:
            KeyError: If an entry lacks 'patent_id' or 'content'.
        """
        combined_content = "\n\n---\n\n".join(
            f"Patent {idx} ({patent['patent_id']}):\n{patent['content']}"
            for idx, patent in enumerate(patents_data, 1)
        )

        return f"""You are analyzing {company_name}'s patent portfolio to estimate their future performance and innovation trajectory.

You have {len(patents_data)} recent patents to analyze. Evaluate the portfolio holistically:

1. Innovation Trends: What technology areas are they focusing on?
2. Strategic Direction: What does this reveal about their business strategy?
3. Competitive Position: How defensible are these innovations?
4. Market Outlook: What market opportunities do these patents target?
5. Performance Forecast: Based on this innovation activity, what's your assessment of their likely performance?

Patent Portfolio:
{combined_content}

Provide a comprehensive analysis (4-5 paragraphs) with a final verdict on the company's innovation strength and performance outlook."""

    def estimate_portfolio(
        self,
        company_name: str,
        patents_data: List[Dict[str, str]],
        include_individual_patents: bool = False
    ) -> "TokenEstimate":
        """Estimate tokens for a company portfolio analysis.

        Completion tokens are budgeted at the output caps
        (``PORTFOLIO_MAX_OUTPUT`` / ``SINGLE_PATENT_MAX_OUTPUT``), so the
        result is an upper-bound style estimate on the output side.

        Args:
            company_name: Name of the company
            patents_data: List of dicts with 'patent_id' and 'content' keys
            include_individual_patents: If True, also count individual patent analysis calls

        Returns:
            A TokenEstimate with token counts and the estimated USD cost.

        Raises:
            KeyError: If an entry lacks 'patent_id' or 'content'.
        """
        # One portfolio-level call is always counted.
        portfolio_prompt = self.build_portfolio_prompt(patents_data, company_name)
        prompt_tokens = self.count_tokens(portfolio_prompt)
        completion_tokens = self.PORTFOLIO_MAX_OUTPUT

        # Optionally add one single-patent call per patent.
        if include_individual_patents:
            for patent in patents_data:
                single_prompt = self.build_single_patent_prompt(patent['content'], company_name)
                prompt_tokens += self.count_tokens(single_prompt)
                completion_tokens += self.SINGLE_PATENT_MAX_OUTPUT

        total_tokens = prompt_tokens + completion_tokens

        # Input and output tokens are priced separately, per million tokens.
        input_cost = (prompt_tokens / 1_000_000) * self.INPUT_COST_PER_1M
        output_cost = (completion_tokens / 1_000_000) * self.OUTPUT_COST_PER_1M
        total_cost = input_cost + output_cost

        return TokenEstimate(
            company_name=company_name,
            patent_count=len(patents_data),
            prompt_tokens=prompt_tokens,
            estimated_completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            cost_estimate_usd=total_cost
        )

    def estimate_from_sample(
        self,
        company_name: str,
        patent_count: int = 10,
        avg_patent_chars: int = 5000,
        include_individual_patents: bool = False
    ) -> "TokenEstimate":
        """Estimate tokens using sample/average patent sizes.

        Synthetic patents are filled with patent-style prose cut to exactly
        ``avg_patent_chars`` characters, then passed to estimate_portfolio().

        Args:
            company_name: Name of the company
            patent_count: Number of patents (default 10, typical from SERP)
            avg_patent_chars: Average characters per minimized patent content;
                values <= 0 produce empty content
            include_individual_patents: If True, also count individual patent
                analysis calls (forwarded to estimate_portfolio)

        Returns:
            A TokenEstimate for the synthetic portfolio.
        """
        # Tile the passage and trim so the content length matches the request.
        target_len = max(avg_patent_chars, 0)
        passage = self._SAMPLE_PASSAGE
        repeats = -(-target_len // len(passage))  # ceiling division
        sample_content = (passage * repeats)[:target_len]

        patents_data = [
            {"patent_id": f"US{10000000 + i}", "content": sample_content}
            for i in range(patent_count)
        ]

        return self.estimate_portfolio(
            company_name,
            patents_data,
            include_individual_patents=include_individual_patents,
        )


def main():
    """Print demonstration token and cost estimates to stdout.

    Three scenarios are shown: uniform synthetic portfolios for several
    companies (with a grand total), how the estimate scales with portfolio
    size, and a small hand-written two-patent portfolio.
    """
    estimator = TokenEstimator()
    rule = "=" * 70

    print(rule)
    print("SPARC Token Usage Estimator")
    print(rule)

    # Scenario 1: the same synthetic portfolio shape for each company.
    print("\n📊 Sample Estimates (10 patents, ~5000 chars each):\n")

    companies = ["Apple Inc.", "Microsoft Corporation", "Tesla Motors", "Google LLC"]

    grand_tokens = 0
    grand_cost = 0.0

    for name in companies:
        est = estimator.estimate_from_sample(name, patent_count=10, avg_patent_chars=5000)
        report_lines = (
            f"  {name}:",
            f"    Patents: {est.patent_count}",
            f"    Prompt tokens: {est.prompt_tokens:,}",
            f"    Est. completion tokens: {est.estimated_completion_tokens:,}",
            f"    Total tokens: {est.total_tokens:,}",
            f"    Est. cost: ${est.cost_estimate_usd:.4f}",
            "",  # blank separator line after each company
        )
        for line in report_lines:
            print(line)

        grand_tokens += est.total_tokens
        grand_cost += est.cost_estimate_usd

    print("-" * 70)
    print(f"  TOTAL for {len(companies)} companies:")
    print(f"    Total tokens: {grand_tokens:,}")
    print(f"    Total est. cost: ${grand_cost:.4f}")

    # Scenario 2: vary only the number of patents.
    print("\n" + rule)
    print("📈 Token Scaling by Portfolio Size:")
    print(rule + "\n")

    for size in (5, 10, 15, 20):
        scaled = estimator.estimate_from_sample("Sample Corp", patent_count=size)
        print(f"  {size} patents: {scaled.prompt_tokens:,} prompt tokens, ${scaled.cost_estimate_usd:.4f}")

    # Scenario 3: hand-written patents shaped like real (minimized) content.
    print("\n" + rule)
    print("📝 Example with Real Patent Structure:")
    print(rule + "\n")

    ml_patent = {
        "patent_id": "US11234567",
        "content": """ABSTRACT: A method for machine learning optimization using gradient descent.

CLAIMS:
1. A computer-implemented method comprising:
   receiving input data;
   processing the input data through a neural network;
   optimizing weights using backpropagation.

SUMMARY: This invention relates to improvements in neural network training efficiency."""
    }
    cloud_patent = {
        "patent_id": "US11234568",
        "content": """ABSTRACT: System for distributed computing in cloud environments.

CLAIMS:
1. A distributed system comprising:
   a plurality of compute nodes;
   a load balancer;
   a message queue for task distribution.

SUMMARY: The present disclosure improves cloud computing resource allocation."""
    }

    demo = estimator.estimate_portfolio("Tech Corp", [ml_patent, cloud_patent])
    summary_lines = (
        f"  Company: {demo.company_name}",
        f"  Patents analyzed: {demo.patent_count}",
        f"  Prompt tokens: {demo.prompt_tokens:,}",
        f"  Est. completion: {demo.estimated_completion_tokens:,}",
        f"  Total: {demo.total_tokens:,}",
        f"  Est. cost: ${demo.cost_estimate_usd:.4f}",
    )
    for line in summary_lines:
        print(line)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()