#!/usr/bin/env python3
"""
Fast TTL Class Extractor

Optimized for large TTL files (like 5.3 GB files).
Reads file in chunks and uses compiled regex for speed.
"""

import sys
import re
from pathlib import Path


def extract_mt_classes_fast(file_path, chunk_size=8192*1024):  # 8MB chunks
    """
    Extract all distinct mt: class names from a large TTL file efficiently.

    The file is read in binary mode in fixed-size chunks. Each chunk is split
    on newlines and every complete line is searched (case-insensitively) for
    ``a mt:<name>``. The trailing partial line of a chunk is carried over and
    prepended to the next chunk, so a statement cut by a chunk boundary is
    still matched and line numbers stay exact.

    Args:
        file_path (str): Path to the TTL file
        chunk_size (int): Size of chunks to read at a time (default 8MB)

    Returns:
        dict: Dictionary mapping class names to the 1-based number of the
            line on which each was first seen. If the file cannot be opened
            or read, an error message is printed and an empty dict is
            returned (partial results are discarded).
    """
    classes = {}

    # Compile regex once for better performance.
    # Captures everything after "mt:" up to whitespace, a semicolon, or end of line.
    pattern = re.compile(rb'\ba\s+mt:([^\s;]+)', re.IGNORECASE)

    def record(line, line_number):
        # setdefault keeps the line number of the first occurrence only.
        for raw_name in pattern.findall(line):
            classes.setdefault(raw_name.decode('utf-8', errors='ignore'), line_number)

    try:
        with open(file_path, 'rb') as file:
            pending = b""  # Incomplete last line carried across chunk boundaries
            current_line = 1  # Line number of the first line in the current buffer

            # iter(callable, sentinel) stops as soon as read() returns b"" (EOF).
            chunks = iter(lambda: file.read(chunk_size), b"")
            for chunk_num, chunk in enumerate(chunks, start=1):
                if chunk_num % 100 == 0:  # Progress indicator
                    print(f"Processed {chunk_num * chunk_size // (1024*1024)} MB... (line ~{current_line:,})")

                # split() always yields at least one element; the last one is
                # the (possibly empty) unterminated tail of the buffer.
                *complete_lines, pending = (pending + chunk).split(b'\n')

                for offset, line in enumerate(complete_lines):
                    record(line, current_line + offset)

                current_line += len(complete_lines)

            # The file may end without a trailing newline.
            if pending:
                record(pending, current_line)

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return {}  # Same type as the success path
    except Exception as e:
        # Deliberately broad: extraction is best-effort and reports instead of raising.
        print(f"Error reading file '{file_path}': {e}")
        return {}

    return classes


def extract_mt_classes_streaming(file_path):
    """
    Alternative streaming approach - processes line by line.
    Good for files with consistent line structure.

    The file is opened as UTF-8 text with undecodable bytes dropped
    (errors='ignore'), matching the tolerant decoding of the chunk-based
    extractor, so one stray byte does not abort the whole run.

    Args:
        file_path (str): Path to the TTL file

    Returns:
        dict: Dictionary mapping class names to the 1-based number of the
            line on which each was first seen. If the file cannot be opened
            or read, an error message is printed and an empty dict is
            returned.
    """
    classes = {}
    pattern = re.compile(r'\ba\s+mt:([^\s;]+)', re.IGNORECASE)

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore',
                  buffering=8192*1024) as file:
            for line_count, line in enumerate(file, start=1):
                if line_count % 1000000 == 0:  # Progress every million lines
                    print(f"Processed {line_count:,} lines...")

                # setdefault keeps the line number of the first occurrence only.
                for class_name in pattern.findall(line):
                    classes.setdefault(class_name, line_count)

    except Exception as e:
        # Deliberately broad: extraction is best-effort and reports instead of raising.
        print(f"Error with streaming approach: {e}")
        return {}  # Same type as the success path

    return classes


def main():
    """
    Command-line entry point.

    Reads the TTL file path and an optional method name from sys.argv, prints
    basic file information, runs the selected extractor ("stream" selects the
    line-by-line extractor; anything else selects the chunk extractor) and
    lists every class found, ordered by the line on which it first appeared.

    Exits with status 1 when no path is given or the path does not exist.
    """
    cli_args = sys.argv[1:]

    # No file argument: show usage and bail out.
    if not cli_args:
        usage_lines = (
            "Usage: python fast_ttl_extractor.py <turtle_file.ttl> [method]",
            "Methods:",
            "  chunk (default) - Process file in chunks (good for any file)",
            "  stream          - Process line by line (faster for well-formed files)",
            "",
            "Example: python fast_ttl_extractor.py output.ttl",
            "Example: python fast_ttl_extractor.py output.ttl stream",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    file_path = cli_args[0]
    method = cli_args[1] if len(cli_args) > 1 else "chunk"

    # Refuse to continue if the path is missing.
    target = Path(file_path)
    if not target.exists():
        print(f"Error: File '{file_path}' not found.")
        sys.exit(1)

    # Report what is about to be processed.
    size_gb = target.stat().st_size / (1024 ** 3)
    print(f"File: {file_path}")
    print(f"Size: {size_gb:.2f} GB")
    print(f"Method: {method}")
    print("-" * 50)

    # Any method other than "stream" uses the chunk extractor.
    if method != "stream":
        print("Using chunk method (8MB chunks)...")
        classes = extract_mt_classes_fast(file_path)
    else:
        print("Using streaming method (line by line)...")
        classes = extract_mt_classes_streaming(file_path)

    print("\nProcessing complete!")
    print("=" * 50)

    if not classes:
        print("No mt: classes found in the file.")
        return

    total = len(classes)
    print(f"Found {total} distinct mt: classes:")
    print()

    # Order by first-seen line number so output follows the file.
    by_first_line = list(classes.items())
    by_first_line.sort(key=lambda entry: entry[1])

    for class_name, line_num in by_first_line:
        print(f"  mt:{class_name:<30} (first seen at line {line_num:,})")

    print(f"\nTotal: {total} classes")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()