#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TTL Hybrid Duplicate Detector - Fast Text Parsing with Blank Node Awareness
===========================================================================
Uses fast text parsing but understands blank node semantics to avoid false positives.

This solves the problem where multiple blank nodes with the same predicate name
(e.g., mt:hasRelatedDrugNameGroup [) were incorrectly flagged as duplicates.

Usage:
    python detect_ttl_hybrid.py output.ttl [--verbose] [--report]

Author: M Dombaugh
Version: 4.0 (Hybrid - Fast with Blank Node Awareness)
Date: 2025-11-10
"""

import html
import re
import sys
from collections import defaultdict, Counter
from datetime import datetime
from pathlib import Path


class HybridDuplicateDetector:
    """Fast text parsing with semantic understanding of blank nodes.

    Scans a Turtle (TTL) file for wasteful duplicate predicate-object
    pairs without the cost of a full RDF parse.  Blank-node objects are
    normalized by their *content*, so two blank nodes that share a
    predicate name but carry different triples are correctly treated as
    distinct (not flagged as duplicates).
    """
    
    def __init__(self, ttl_path):
        """Initialize detector with TTL file path.

        Args:
            ttl_path: Path (str or Path) to the TTL file to analyze.
        """
        self.ttl_path = Path(ttl_path)
        self.content = None        # full file text; populated by read_file()
        self.content_lines = None  # reserved for future use; currently unused
        
        # Duplicate tracking
        # subject IRI -> [(predicate, normalized_object, count), ...]
        self.duplicate_predicates = defaultdict(list)
        # subject IRI -> number of separate blocks for that subject
        self.multi_block_subjects = {}
        
        # Statistics accumulated during analyze()
        self.stats = {
            'total_subjects': 0,
            'multi_block_subjects': 0,
            'subjects_with_duplicates': 0,
            'duplicate_predicate_instances': 0,
            'wasted_triples': 0,
        }
    
    def read_file(self):
        """Read the TTL file into memory.

        Exits the process with code 1 if the file does not exist.
        """
        print(f"[INFO] Reading TTL file: {self.ttl_path}")
        
        if not self.ttl_path.exists():
            print(f"[ERROR] File not found: {self.ttl_path}")
            sys.exit(1)
        
        file_size = self.ttl_path.stat().st_size / (1024 * 1024)
        print(f"[INFO] File size: {file_size:.1f} MB")
        
        with open(self.ttl_path, 'r', encoding='utf-8') as f:
            self.content = f.read()
        
        print(f"[INFO] Loaded {len(self.content):,} characters")
    
    def _normalize_blank_node_content(self, text):
        """Normalize blank node content so it can be compared semantically.

        Strips the outer brackets, normalizes whitespace per line, drops
        comments, and sorts the lines so that triple order inside the
        blank node does not affect equality.
        """
        # Remove outer brackets and normalize whitespace
        text = text.strip()
        if text.startswith('[') and text.endswith(']'):
            text = text[1:-1].strip()
        
        # Split into lines and normalize
        lines = []
        for line in text.split('\n'):
            line = line.strip().rstrip(';,').strip()
            if line and not line.startswith('#'):
                # Collapse internal runs of whitespace to single spaces
                line = ' '.join(line.split())
                lines.append(line)
        
        # Sort lines for order-independent comparison
        lines.sort()
        return '\n'.join(lines)
    
    def _extract_blank_node_full_content(self, text, start_pos):
        """Extract the full content of a blank node including nested brackets.

        NOTE(review): not called anywhere in this file at present;
        retained as a public-ish helper for bracket-matched extraction.

        Returns:
            (full_blank_node_text, end_position)
        """
        # Scan forward tracking bracket nesting until the opener is closed
        bracket_count = 0
        i = start_pos
        
        while i < len(text):
            if text[i] == '[':
                bracket_count += 1
            elif text[i] == ']':
                bracket_count -= 1
                if bracket_count == 0:
                    return text[start_pos:i+1], i+1
            i += 1
        
        # Unbalanced brackets: return the remainder of the text
        return text[start_pos:], len(text)
    
    def _parse_predicate_objects(self, block_text):
        """Parse predicate-object pairs from a block, handling blank nodes.

        Statements are split on ';' / '.' only at bracket depth 0 and
        outside quoted literals, so nested blank nodes and punctuation
        inside strings do not break statements apart.

        Returns:
            list of (predicate, normalized_object) tuples
        """
        pred_objects = []
        
        # Remove subject line and comment lines
        lines = block_text.split('\n')
        content_lines = []
        skip_subject = True
        
        for line in lines:
            line_stripped = line.strip()
            
            # Skip empty lines
            if not line_stripped:
                continue
            
            # Skip comment lines
            if line_stripped.startswith('#'):
                continue
            
            # Skip subject line (first line starting with <http)
            if skip_subject and line_stripped.startswith('<http'):
                skip_subject = False
                continue
            
            content_lines.append(line)
        
        if not content_lines:
            return pred_objects
        
        full_text = '\n'.join(content_lines)
        
        # Split by semicolons to get individual predicate statements,
        # tracking bracket depth and quote state so we never split inside
        # a blank node or a quoted literal.
        i = 0
        current_statement = []
        bracket_depth = 0
        in_quotes = False
        quote_char = None
        
        while i < len(full_text):
            char = full_text[i]
            
            # Toggle quote state (ignore backslash-escaped quote chars)
            if char in ('"', "'") and (i == 0 or full_text[i-1] != '\\'):
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            
            if not in_quotes:
                if char == '[':
                    bracket_depth += 1
                elif char == ']':
                    bracket_depth -= 1
                elif char == ';' and bracket_depth == 0:
                    # End of statement
                    statement = ''.join(current_statement).strip()
                    if statement:
                        pred_obj = self._parse_single_statement(statement)
                        if pred_obj:
                            pred_objects.append(pred_obj)
                    current_statement = []
                    i += 1
                    continue
                elif char == '.' and bracket_depth == 0:
                    # End of block
                    statement = ''.join(current_statement).strip()
                    if statement:
                        pred_obj = self._parse_single_statement(statement)
                        if pred_obj:
                            pred_objects.append(pred_obj)
                    break
            
            current_statement.append(char)
            i += 1
        
        return pred_objects
    
    def _is_valid_predicate(self, predicate):
        """Check if a string is a valid RDF predicate.

        Valid predicates are:
        - URIs in angle brackets: <http://...>
        - Prefixed names: mt:hasDrug, skos:prefLabel, rdf:type
        - Single character 'a' (shorthand for rdf:type)
        """
        predicate = predicate.strip()
        
        # URI in angle brackets
        if predicate.startswith('<') and predicate.endswith('>'):
            return True
        
        # Shorthand for rdf:type
        if predicate == 'a':
            return True
        
        # Prefixed name (must contain colon and valid characters)
        if ':' in predicate:
            parts = predicate.split(':', 1)
            if len(parts) == 2:
                prefix, local = parts
                # Prefix must be alphanumeric/underscore;
                # local part may also contain hyphens
                if prefix and local:
                    if re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', prefix):
                        if re.match(r'^[a-zA-Z_][a-zA-Z0-9_\-]*$', local):
                            return True
        
        return False
    
    def _parse_single_statement(self, statement):
        """Parse a single predicate-object statement.

        Handles "predicate <uri>", "predicate literal" and
        "predicate [blank node]" forms.

        Returns:
            (predicate, normalized_object) or None when the statement is
            empty or the predicate is not valid.
        """
        statement = statement.strip()
        if not statement:
            return None
        
        # The predicate is the first whitespace-delimited token
        i = 0
        while i < len(statement) and not statement[i].isspace():
            i += 1
        
        if i >= len(statement):
            return None
        
        predicate = statement[:i].strip()
        obj = statement[i:].strip()
        
        if not predicate or not obj:
            return None
        
        # Validate predicate
        if not self._is_valid_predicate(predicate):
            return None
        
        # Normalize the object
        if obj.startswith('['):
            # Blank node - compare by normalized content, not by text
            normalized_obj = self._normalize_blank_node_content(obj)
            return (predicate, f"[{normalized_obj}]")
        else:
            if obj.startswith('<') and obj.endswith('>'):
                # URI - keep as-is, just strip
                normalized_obj = obj.strip()
            else:
                # Literal or other - normalize whitespace
                normalized_obj = ' '.join(obj.split())
            return (predicate, normalized_obj)
    
    def analyze(self):
        """Analyze the TTL file for duplicates.

        Populates self.stats / self.duplicate_predicates and returns a
        dict holding a sample (first 20 subjects) of the duplicates found.
        """
        print("[INFO] Parsing subject blocks...")
        
        # Split on a terminating '.' followed by a blank line
        raw_blocks = re.split(r'\.\s*\n\s*\n', self.content)
        
        subject_blocks = defaultdict(list)
        block_count = 0
        
        for raw_block in raw_blocks:
            block_count += 1
            if block_count % 100000 == 0:
                print(f"[INFO] Processed {block_count:,} blocks...")
            
            raw_block = raw_block.strip()
            
            # Skip prefixes and empty blocks
            if not raw_block or raw_block.startswith('@prefix'):
                continue
            
            # Remove comment lines
            block_lines = []
            for line in raw_block.split('\n'):
                stripped = line.strip()
                if not stripped.startswith('#'):
                    block_lines.append(line)
            
            if not block_lines:
                continue
            
            raw_block = '\n'.join(block_lines)
            
            # Extract subject IRI
            subject_match = re.match(r'<([^>]+)>', raw_block)
            if not subject_match:
                continue
            
            subject_iri = f"<{subject_match.group(1)}>"
            
            # Add back the period the split consumed, if missing
            if not raw_block.endswith('.'):
                raw_block += ' .'
            
            subject_blocks[subject_iri].append(raw_block)
        
        self.stats['total_subjects'] = len(subject_blocks)
        print(f"[INFO] Found {self.stats['total_subjects']:,} unique subjects")
        
        # Analyze each subject's blocks for duplicates
        print("[INFO] Analyzing for duplicate predicates within blocks...")
        
        processed = 0
        for subject_iri, blocks in subject_blocks.items():
            processed += 1
            if processed % 100000 == 0:
                print(f"[INFO] Analyzed {processed:,} subjects...")
            
            # Track multiple blocks for this subject (OK for Oracle RDF)
            if len(blocks) > 1:
                self.multi_block_subjects[subject_iri] = len(blocks)
            
            # Parse each block exactly ONCE and keep per-block counters.
            # (Previously every block was re-parsed for each cross-block
            # duplicate candidate, which was quadratic on large subjects.)
            block_counters = [Counter(self._parse_predicate_objects(block))
                              for block in blocks]
            
            # Duplicates WITHIN each block
            for pred_obj_counter in block_counters:
                duplicates = [(pred, obj, count)
                              for (pred, obj), count in pred_obj_counter.items()
                              if count > 1]
                
                if duplicates:
                    if subject_iri not in self.duplicate_predicates:
                        self.stats['subjects_with_duplicates'] += 1
                    self.duplicate_predicates[subject_iri].extend(duplicates)
                    for pred, obj, count in duplicates:
                        self.stats['duplicate_predicate_instances'] += count
                        self.stats['wasted_triples'] += (count - 1)
            
            # For multi-block subjects, check for duplicates ACROSS blocks
            if len(blocks) > 1:
                # Merge the per-block counters (preserves first-seen order)
                all_counter = Counter()
                for pred_obj_counter in block_counters:
                    all_counter.update(pred_obj_counter)
                
                for (pred, obj), count in all_counter.items():
                    if count > 1:
                        # Skip pairs already counted as within-block dupes
                        # (Counter returns 0 for missing keys, no insert)
                        already_counted = any(c[(pred, obj)] > 1
                                              for c in block_counters)
                        
                        if not already_counted:
                            # This is a cross-block duplicate
                            if subject_iri not in self.duplicate_predicates:
                                self.stats['subjects_with_duplicates'] += 1
                            self.duplicate_predicates[subject_iri].append((pred, obj, count))
                            self.stats['duplicate_predicate_instances'] += count
                            self.stats['wasted_triples'] += (count - 1)
        
        self.stats['multi_block_subjects'] = len(self.multi_block_subjects)
        
        print(f"[INFO] Analysis complete")
        
        return {
            'duplicate_predicates': dict(list(self.duplicate_predicates.items())[:20])
        }
    
    def print_report(self, results, verbose=False):
        """Print analysis report to console.

        Args:
            results: dict returned by analyze().
            verbose: accepted for CLI compatibility; currently unused
                (the sample is capped at 20 subjects either way).
        """
        print("\n" + "="*80)
        print("HYBRID DUPLICATE ANALYSIS REPORT")
        print("="*80)
        print("Fast text parsing with blank node semantic awareness")
        
        # Statistics
        print("\n📊 STATISTICS:")
        print(f"  Total Subjects: {self.stats['total_subjects']:,}")
        print(f"  Multi-Block Subjects: {self.stats['multi_block_subjects']:,} (OK for Oracle RDF)")
        print(f"  Subjects with Duplicate Predicates: {self.stats['subjects_with_duplicates']:,}")
        print(f"  Duplicate Predicate Instances: {self.stats['duplicate_predicate_instances']:,}")
        print(f"  Wasted Triples: {self.stats['wasted_triples']:,}")
        
        # Duplicate predicates report
        if self.stats['wasted_triples'] > 0:
            print("\n⚠️  WASTEFUL: DUPLICATE PREDICATES FOUND")
            print("="*80)
            print("These predicates appear multiple times with the SAME object within a block.")
            print("Note: Multiple blank nodes with same predicate but DIFFERENT content are OK.\n")
            
            print(f"Showing top 20 subjects (out of {self.stats['subjects_with_duplicates']:,}):\n")
            
            for subject, duplicates in list(results['duplicate_predicates'].items())[:20]:
                print(f"\n🔴 Subject: {subject}")
                print(f"   Has {len(duplicates)} duplicate predicate-object pair(s):")
                
                for pred, obj, count in duplicates:
                    # Truncate long objects for console readability
                    obj_display = obj if len(obj) <= 100 else obj[:100] + '...'
                    print(f"\n   • Predicate: {pred}")
                    print(f"     Object: {obj_display}")
                    print(f"     ⚠️  Appears: {count} times (wasting {count-1} triples)")
        
        else:
            print("\n✅ NO WASTEFUL DUPLICATES FOUND")
            print("="*80)
            print("File is clean - no duplicate predicates with identical objects.")
            print("Multiple blank nodes with same predicate name but different content: OK ✓")
        
        print("\n" + "="*80)
    
    def generate_html_report(self, results, output_path):
        """Generate HTML report.

        Subjects, predicates and objects are HTML-escaped before being
        interpolated: subject IRIs like <http://...> would otherwise be
        parsed as HTML tags and vanish from the rendered report.
        """
        print(f"[INFO] Generating HTML report: {output_path}")
        
        html_doc = f"""<!DOCTYPE html>
<html>
<head>
    <title>Hybrid Duplicate Analysis Report</title>
    <style>
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 40px;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 1400px;
            margin: 0 auto;
            background: white;
            padding: 40px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        }}
        h1 {{
            color: #2c3e50;
            border-bottom: 3px solid #3498db;
            padding-bottom: 10px;
        }}
        .info-box {{
            background: #e3f2fd;
            border-left: 4px solid #2196f3;
            padding: 15px;
            margin: 20px 0;
            border-radius: 4px;
        }}
        .stats {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin: 30px 0;
        }}
        .stat-card {{
            background: #ecf0f1;
            padding: 20px;
            border-radius: 8px;
            border-left: 4px solid #3498db;
        }}
        .stat-card.warning {{
            border-left-color: #e74c3c;
            background: #fadbd8;
        }}
        .stat-card.ok {{
            border-left-color: #27ae60;
            background: #d5f4e6;
        }}
        .stat-label {{
            font-size: 0.9em;
            color: #7f8c8d;
            margin-bottom: 5px;
        }}
        .stat-value {{
            font-size: 2em;
            font-weight: bold;
            color: #2c3e50;
        }}
        .stat-value.warning {{
            color: #c0392b;
        }}
        .duplicate {{
            background: #fadbd8;
            border-left: 4px solid #e74c3c;
            padding: 15px;
            margin: 15px 0;
            border-radius: 4px;
        }}
        .subject {{
            color: #2980b9;
            font-weight: bold;
        }}
        code {{
            background: #ecf0f1;
            padding: 2px 6px;
            border-radius: 3px;
            font-family: 'Courier New', monospace;
            font-size: 0.9em;
        }}
        .summary {{
            background: #e8f5e9;
            border: 2px solid #4caf50;
            padding: 20px;
            margin: 30px 0;
            border-radius: 8px;
            font-size: 1.1em;
        }}
        .summary.warning {{
            background: #fff3e0;
            border-color: #ff9800;
        }}
        ul {{
            margin: 10px 0;
        }}
        li {{
            margin: 10px 0;
            line-height: 1.6;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>🔍 Hybrid Duplicate Analysis Report</h1>
        <p><strong>File:</strong> {html.escape(self.ttl_path.name)}</p>
        <p><strong>Analysis Date:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        
        <div class="info-box">
            <strong>Analysis Method:</strong> Hybrid (Fast Text Parsing + Blank Node Awareness)<br>
            <strong>Key Feature:</strong> Correctly distinguishes between duplicate predicates and multiple blank nodes with same predicate name but different content.
        </div>

        <div class="stats">
            <div class="stat-card">
                <div class="stat-label">Total Subjects</div>
                <div class="stat-value">{self.stats['total_subjects']:,}</div>
            </div>
            <div class="stat-card">
                <div class="stat-label">Multi-Block Subjects</div>
                <div class="stat-value">{self.stats['multi_block_subjects']:,}</div>
            </div>
            <div class="stat-card {'warning' if self.stats['subjects_with_duplicates'] > 0 else 'ok'}">
                <div class="stat-label">Subjects with Duplicates</div>
                <div class="stat-value {'warning' if self.stats['subjects_with_duplicates'] > 0 else ''}">{self.stats['subjects_with_duplicates']:,}</div>
            </div>
            <div class="stat-card {'warning' if self.stats['wasted_triples'] > 0 else 'ok'}">
                <div class="stat-label">Wasted Triples</div>
                <div class="stat-value {'warning' if self.stats['wasted_triples'] > 0 else ''}">{self.stats['wasted_triples']:,}</div>
            </div>
        </div>
"""
        
        if self.stats['wasted_triples'] > 0:
            html_doc += f"""
        <h2>⚠️  Wasteful: Duplicate Predicates with Identical Objects</h2>
        <p>These predicates appear multiple times with the EXACT SAME object within a block.</p>
        <p><em>Note: Multiple blank nodes with same predicate but different content are correctly identified as NOT duplicates.</em></p>
        <p>Showing top 20 subjects (out of {self.stats['subjects_with_duplicates']:,} total)</p>
"""
            for subject, duplicates in list(results['duplicate_predicates'].items())[:20]:
                html_doc += f"""
        <div class="duplicate">
            <strong class="subject">{html.escape(subject)}</strong>
            <p>Has {len(duplicates)} duplicate predicate-object pair(s):</p>
            <ul>
"""
                for pred, obj, count in duplicates:
                    obj_display = obj if len(obj) <= 200 else obj[:200] + '...'
                    html_doc += f"""                <li>
                    <strong>Predicate:</strong> <code>{html.escape(pred)}</code><br>
                    <strong>Object:</strong> <code>{html.escape(obj_display)}</code><br>
                    <strong>Appears:</strong> {count} times <span style="color: #d9534f;">(wasting {count-1} triples)</span>
                </li>
"""
                html_doc += "            </ul>\n        </div>\n"
            
            html_doc += f"""
        <div class="summary warning">
            ⚠️ Found {self.stats['wasted_triples']:,} wasteful duplicate triples<br>
            → These should be removed to optimize the file
        </div>
"""
        else:
            html_doc += """
        <div class="summary">
            ✅ NO WASTEFUL DUPLICATES FOUND<br>
            File is clean - no duplicate predicates with identical objects<br>
            ✓ Multiple blank nodes with same predicate but different content: Correctly identified as OK
        </div>
"""
        
        html_doc += """
    </div>
</body>
</html>
"""
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_doc)
        
        print(f"[SUCCESS] HTML report written to {output_path}")


def main():
    """Command-line entry point.

    Parses sys.argv (positional TTL path plus optional flags), runs the
    analysis, prints a console report, optionally writes an HTML report,
    and exits with code 1 when wasteful duplicates are found so CI
    pipelines can fail fast.
    """
    if len(sys.argv) < 2:
        # NOTE: a previously advertised --fix option was never implemented,
        # so it is no longer listed in the usage text.
        print("Usage: python detect_ttl_hybrid.py output.ttl [--verbose] [--report]")
        print("\nHybrid Duplicate Detector - Fast with Blank Node Awareness")
        print("\nOptions:")
        print("  --verbose  Show detailed information")
        print("  --report   Generate HTML report")
        sys.exit(1)
    
    ttl_path = sys.argv[1]
    verbose = '--verbose' in sys.argv
    report = '--report' in sys.argv
    
    # Run analysis
    detector = HybridDuplicateDetector(ttl_path)
    detector.read_file()
    results = detector.analyze()
    
    # Print console report
    detector.print_report(results, verbose=verbose)
    
    # Generate HTML report if requested
    if report:
        report_path = "hybrid_duplicates_report.html"
        detector.generate_html_report(results, report_path)
    
    # Exit with error code if duplicates found
    sys.exit(1 if detector.stats['wasted_triples'] > 0 else 0)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Graceful Ctrl-C: print a short note instead of a traceback,
        # but still exit with a failure code.
        print("\n[INFO] Interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Catch-all at the process boundary: report the error, dump the
        # full traceback for debugging, and exit non-zero.
        print(f"[ERROR] Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
