# -*- coding: utf-8 -*-
"""
RDF Generation Pipeline Orchestrator
=====================================
This script is the "conductor" for a 3-step data conversion process.
It asks the user all necessary questions up front, then runs each step
automatically without needing further input.

Folder structure (relative to project root skg/):
  skg/
    pipeline/          <- this script and all step scripts live here
    input/             <- supplemental TTL files go here (pel_supplemental_*.ttl)
    output/            <- all generated files are written here

What the pipeline produces:
  Raw database data -> Turtle RDF file -> Sorted -> Consolidated final file

The 3 steps (plus one in-between action) are:
  Step 1   : rdf_from_metadata.py
               Reads Oracle database tables and writes RDF triples to
               output/output.ttl
  Step 1.5 : (handled here, not a separate script)
               If a supplemental .ttl file exists in input/
               (named pel_supplemental_*.ttl), its content is appended to
               output/output.ttl before sorting begins.
  Step 2   : rdf_sort_by_class.py
               Reads output/output.ttl and writes output/output_sorted.ttl
               with all RDF entities sorted by subject IRI so that Step 3
               can merge duplicates.
  Step 3   : rdf_consolidate.py
               Reads output/output_sorted.ttl, merges consecutive duplicate
               subjects, and wraps the output in a named graph (TriG syntax),
               writing to output/output_consolidated.ttl (note: the content is
               TriG, but the file is written with a .ttl extension).

Author: M Dombaugh
Created: 2025-12-05
Version: 5.0 - Scripts resolve input/ and output/ relative to project root.
               Supplemental TTL is read from input/ folder.
               All generated files are written to output/ folder.
               Removed manual directory/filename prompts - paths are derived
               automatically from script location.
         4.0 - Added supplemental TTL append step between metadata generation
               and sort. Searches for pel_supplemental_*.ttl in the pipeline
               script directory and appends any found file to output.ttl
               before sorting.
"""

# ── Standard library imports ──────────────────────────────────────────────────
# glob      : finds files using wildcard patterns (e.g. pel_supplemental_*.ttl in input/)
# subprocess: launches other Python scripts as child processes
# sys       : lets us exit the program cleanly with an error code
# Path      : modern, cross-platform way to work with file and directory paths
# datetime  : used to print human-readable timestamps in the run log
# time      : used to measure how long each step takes
import glob
import subprocess
import sys
from pathlib import Path
from datetime import datetime
import time


# ── get_user_inputs() ─────────────────────────────────────────────────────────
def get_user_inputs():
    """
    Ask the user every question needed to run the full pipeline.

    All questions are asked HERE before any script is launched so the user
    can walk away and let the pipeline run unattended. Answers are validated
    on the spot so a typo cannot surface as a confusing failure deep inside
    a child script half an hour into the run.

    Returns:
        inputs (dict): A dictionary holding all answers keyed by name.
                       Keys: 'skip_download', 'max_subjects',
                       'graph_selection', 'graph_name', 'script_dir',
                       'input_dir', 'output_dir', 'output_ttl',
                       'sorted_ttl', 'consolidated_ttl'
    """
    print("="*70)
    print("TTL GENERATION PIPELINE - CONFIGURATION")
    print("="*70)
    print("\nEnter values for unattended execution:\n")

    # We'll store every answer in this dictionary and return it at the end.
    inputs = {}

    # ── Question 1: Skip the database download? ───────────────────────────────
    # Step 1 can be slow because it downloads Oracle tables to CSV first.
    # If the CSVs already exist from a previous run, the user can skip that
    # phase and go straight to generating the TTL from the existing files.
    # Re-prompt until we get an unambiguous y/n: this answer is fed verbatim
    # to the child script's prompt, so anything else would derail Step 1.
    while True:
        answer = input(
            "Skip table download and use existing CSVs? (y/n): "
        ).strip().lower()
        if answer in ("y", "n"):
            inputs['skip_download'] = answer
            break
        print("Please enter 'y' or 'n'.")

    # ── Question 2: Limit the number of subjects (rows) processed? ────────────
    # Useful for testing: set to 10 to get a quick sample instead of waiting
    # for all records. Set to 0 to process everything.
    # Validate here (non-negative whole number) rather than letting garbage
    # propagate into the child script and fail mid-pipeline.
    while True:
        raw = input("Max subjects to process per table (0 = all): ").strip() or "0"
        if raw.isdigit():
            inputs['max_subjects'] = raw
            break
        print("Please enter a non-negative whole number (0 = all).")

    # ── Question 3: Which graph/namespace to use? ─────────────────────────────
    # The graph name controls the namespace prefix (mt:) and the ontology IRI
    # that will appear at the top of the output file.
    # Each name corresponds to a different Multum product dataset.
    valid_graph_names = [
        "mulcor-enus",   # Multum Core - English US
        "mulivc-enus",   # Multum IV Compatibility - English US
        "mullpd-enus",   # Multum LPD - English US
        "mul3di-enus",   # Multum 3DI - English US
    ]

    print("\nSelect graph name:")
    for i, name in enumerate(valid_graph_names, 1):
        print(f"  {i}. {name}")

    # Derive the valid range from the list length so that adding a graph
    # name later cannot leave a stale hard-coded "1-4" in the prompts.
    max_choice = len(valid_graph_names)

    # Keep asking until the user gives a valid number.
    while True:
        try:
            selection = int(input(f"Enter selection (1-{max_choice}): ").strip())
        except ValueError:
            # int() raises ValueError if the user types something non-numeric.
            print(f"Invalid input. Enter a number 1-{max_choice}.")
            continue
        if 1 <= selection <= max_choice:
            # Store both the numeric selection (passed to the child script
            # via stdin) and the human-readable name (used in log messages).
            inputs['graph_selection'] = str(selection)
            inputs['graph_name'] = valid_graph_names[selection - 1]
            print(f"Selected: {inputs['graph_name']}")
            break
        print(f"Invalid selection. Enter 1-{max_choice}.")

    # ── File and directory names (derived automatically from script location) ──
    # Scripts live in skg/pipeline/. Input and output folders are siblings of
    # pipeline/ at the project root level.
    #
    #   skg/
    #     pipeline/   <- SCRIPT_DIR
    #     input/      <- INPUT_DIR  (supplemental TTL files go here)
    #     output/     <- OUTPUT_DIR (all generated files written here)

    script_dir  = Path(__file__).resolve().parent
    project_root = script_dir.parent

    inputs['script_dir']  = str(script_dir)
    inputs['input_dir']   = str(project_root / "input")
    inputs['output_dir']  = str(project_root / "output")
    inputs['output_ttl']  = "output.ttl"
    inputs['sorted_ttl']  = "output_sorted.ttl"
    # NOTE(review): Step 3 wraps the output in a named graph (TriG syntax),
    # yet the file is named with a .ttl extension. Keeping .ttl here to avoid
    # breaking downstream consumers -- confirm whether .trig was intended.
    inputs['consolidated_ttl'] = "output_consolidated.ttl"

    return inputs


# ── append_supplemental_ttl() ─────────────────────────────────────────────────
def append_supplemental_ttl(output_ttl_path, input_dir):
    """
    Look for a supplemental TTL file in input_dir and, if found, append its
    content to the end of output_ttl_path.

    Why this exists:
        Some RDF triples are maintained manually (e.g. hand-crafted mappings
        that do not come from the database). Placing them in a file named
        pel_supplemental_*.ttl and running this pipeline will automatically
        merge them into the output before sorting and consolidation.

    File naming convention:
        The file must be in skg/input/ and
        must match the pattern:  pel_supplemental_*.ttl
        The '*' wildcard can be anything (e.g. pel_supplemental_drug_fixes.ttl).
        Only ONE such file is expected. If more than one is found, only the
        first alphabetical match is used and a warning is printed.

    Args:
        output_ttl_path (Path): Full path to the file we will append INTO.
        input_dir       (Path): skg/input/ - directory to search for the supplemental file.

    Returns:
        (appended, duration)
        appended  (bool) : True if a file was found and appended, else False.
        duration (float) : How many seconds this function took to run.

    Exits:
        Calls sys.exit(1) if the supplemental file cannot be read or the
        output file cannot be appended to (pipeline cannot safely continue).
    """
    start_time = time.time()

    # Build the full wildcard search pattern by joining the directory and filename pattern.
    # Example: C:\Oracle\skg\input\pel_supplemental_*.ttl
    supplemental_glob = str(input_dir / "pel_supplemental_*.ttl")

    # glob.glob() returns matches in arbitrary filesystem order, NOT sorted.
    # Sort the list so "first alphabetical match" (promised above) is actually
    # true and the pipeline behaves deterministically across runs/platforms.
    # If no files match, glob returns an empty list [].
    matches = sorted(glob.glob(supplemental_glob))

    # ── No file found: skip silently and continue ─────────────────────────────
    if not matches:
        print(f"[INFO] No supplemental file found matching: {supplemental_glob}")
        print("[INFO] Continuing without supplemental data.")
        return False, time.time() - start_time

    # ── More than one file found: warn and use only the first ─────────────────
    if len(matches) > 1:
        print("[WARNING] Multiple supplemental files found - using first match only:")
        for m in matches:
            print(f"  {m}")

    # Use the first (or only) match.
    supplemental_path = Path(matches[0])
    print(f"[INFO] Supplemental file found: {supplemental_path}")

    # ── Read the supplemental file ────────────────────────────────────────────
    # read_text() opens the file, reads all content as a string, then closes it.
    # We wrap it in try/except because the file could be locked, corrupted, etc.
    try:
        supplemental_text = supplemental_path.read_text(encoding="utf-8")
    except Exception as e:
        print(f"[ERROR] Could not read supplemental file: {e}")
        print("[ERROR] Pipeline aborted.")
        sys.exit(1)  # Exit with error code 1 (non-zero = failure)

    # ── Append the supplemental content to output.ttl ─────────────────────────
    # Open in "a" (append) mode so we ADD to the end of the file rather than
    # overwriting it. We write a blank line first so the last triple from
    # output.ttl and the first line of the supplemental file are not joined
    # on the same line, which would create invalid RDF syntax.
    try:
        with open(output_ttl_path, "a", encoding="utf-8") as f:
            f.write("\n")                # Blank separator line
            f.write(supplemental_text)   # All content from the supplemental file
    except Exception as e:
        print(f"[ERROR] Could not append to {output_ttl_path}: {e}")
        print("[ERROR] Pipeline aborted.")
        sys.exit(1)

    # Report how much data was added. The :, format adds comma separators
    # to large numbers (e.g. 1,234,567) for readability.
    char_count = len(supplemental_text)
    print(f"[INFO] Appended {char_count:,} characters from supplemental file to {output_ttl_path}")

    return True, time.time() - start_time


# ── run_script() ──────────────────────────────────────────────────────────────
def run_script(script_path, args=None, stdin_input=None, cwd=None):
    """
    Launch a Python script as a child process and wait for it to finish.

    This is how the pipeline "calls" each of the three main scripts. Using
    subprocess instead of import keeps each script isolated - if one crashes
    it does not take down this orchestrator unexpectedly.

    Args:
        script_path (Path)  : Full path to the .py file to run.
        args        (list)  : Optional list of command-line arguments to pass.
                              Example: ['sorted.ttl', 'consolidated.ttl']
        stdin_input (str)   : Optional text to feed to the script's input()
                              calls. Each answer must be separated by '\\n'.
                              Example: "y\\n0\\n2\\n"  (answers three prompts)
                              When None, the child inherits this process's
                              stdin (interactive behavior is preserved).
        cwd         (Path)  : Working directory for the child process.
                              The child script will see this as its "current
                              directory" when looking for files.

    Returns:
        (success, duration)
        success  (bool) : True if the script exited with code 0 (no errors).
        duration (float): How many seconds the script ran.
    """
    # Build the command list. sys.executable is the full path to the Python
    # interpreter currently running THIS script, ensuring both scripts use
    # the same Python environment and installed packages.
    # Example result: ['C:\\Python311\\python.exe', 'rdf_sort_by_class.py']
    cmd = [sys.executable, str(script_path)]

    # Append any extra command-line arguments if provided.
    if args:
        cmd.extend(args)

    start_time = time.time()

    try:
        # One subprocess.run() call covers both cases (previously duplicated
        # in two branches): input=None leaves the child's stdin attached to
        # ours (same as omitting the argument), and text=True is inert when
        # no pipes are created. With a string stdin_input, text=True feeds
        # the pre-typed answers to the child's input() calls, enabling fully
        # unattended execution.
        result = subprocess.run(
            cmd,
            input=stdin_input,
            text=True,
            cwd=cwd
        )

        # returncode 0 means the script finished successfully.
        # Any other value (1, 2, etc.) means something went wrong.
        return result.returncode == 0, time.time() - start_time

    except Exception as e:
        # This catches OS-level errors like "file not found" or "permission denied".
        print(f"[ERROR] {e}")
        return False, time.time() - start_time


# ── format_duration() ─────────────────────────────────────────────────────────
def format_duration(seconds):
    """
    Render an elapsed time in seconds as a short human-friendly string.

    Examples:
        45.3   ->  "45.3s"
        125    ->  "2m 5s"
        3725   ->  "1h 2m"

    Args:
        seconds (float): Elapsed time in seconds.

    Returns:
        str: Formatted duration string.
    """
    # Sub-minute durations keep one decimal of precision.
    if seconds < 60:
        return f"{seconds:.1f}s"

    # Sub-hour durations: whole minutes and whole seconds.
    if seconds < 3600:
        minutes, secs = divmod(int(seconds), 60)
        return f"{minutes}m {secs}s"

    # An hour or more: hours and minutes only (seconds are noise at this scale).
    hours, remainder = divmod(int(seconds), 3600)
    return f"{hours}h {remainder // 60}m"


# ── main() ────────────────────────────────────────────────────────────────────
def main():
    """
    Entry point for the pipeline. Orchestrates all steps in order.

    Flow:
        1. Ask the user all questions (get_user_inputs)
        2. Show a summary and ask for final confirmation
        3. Run Step 1: rdf_from_metadata.py
        4. Run Step 1.5: append supplemental TTL if present (append_supplemental_ttl)
        5. Run Step 2: rdf_sort_by_class.py
        6. Run Step 3: rdf_consolidate.py
        7. Print a timing summary

    Exits:
        sys.exit(0) if the user declines the final confirmation prompt;
        sys.exit(1) if any step's script is missing or exits non-zero.
    """
    # ── Collect all user answers before starting any work ────────────────────
    inputs = get_user_inputs()

    # Resolve directory paths from the inputs dict.
    script_dir   = Path(inputs['script_dir']).resolve()
    input_dir    = Path(inputs['input_dir']).resolve()
    output_dir   = Path(inputs['output_dir']).resolve()

    # Pre-build the full path to the raw output file in output/.
    output_ttl_path = output_dir / inputs['output_ttl']

    # ── Define the pipeline steps as a list of dictionaries ──────────────────
    # Each dictionary describes one script to run:
    #   "name"   : Human-readable label shown in the progress output.
    #   "script" : Filename of the Python script to launch.
    #   "stdin"  : String of newline-separated answers to feed to that script's
    #              input() prompts. '\n' acts as pressing Enter.
    #   "args"   : Command-line arguments passed after the script name.
    #              None means no extra arguments.
    pipeline = [
        {
            # Step 1: Read Oracle DB metadata and export RDF triples to output.ttl
            "name": "Generate TTL from Metadata",
            "script": "rdf_from_metadata.py",
            # Feeds three answers to the child script's three input() prompts:
            #   1. skip_download  -> "y" or "n"
            #   2. max_subjects   -> "0" or a number
            #   3. graph_selection -> "1", "2", "3", or "4"
            "stdin": (
                f"{inputs['skip_download']}\n"
                f"{inputs['max_subjects']}\n"
                f"{inputs['graph_selection']}\n"
            ),
            "args": None
        },
        {
            # Step 2: Sort the output.ttl file by subject IRI.
            # Sorting is required so Step 3 can detect and merge consecutive
            # duplicate subjects efficiently.
            "name": "Sort TTL by Class",
            "script": "rdf_sort_by_class.py",
            # Feeds two answers: the input filename and the output filename.
            "stdin": f"{output_dir / inputs['output_ttl']}\n{output_dir / inputs['sorted_ttl']}\n",
            "args": None
        },
        {
            # Step 3: Merge duplicate consecutive subjects and wrap in a named graph.
            # This script takes its input/output filenames as command-line
            # arguments (not stdin), so stdin is None and args is populated.
            "name": "Consolidate TTL",
            "script": "rdf_consolidate.py",
            "stdin": None,
            "args": [str(output_dir / inputs['sorted_ttl']), str(output_dir / inputs['consolidated_ttl'])]
        }
    ]

    # ── Show a pre-run summary and ask for confirmation ───────────────────────
    print("\n" + "="*70)
    print("PIPELINE READY TO RUN")
    print("="*70)
    print(f"\nScript directory : {script_dir}")
    print(f"Input directory  : {input_dir}")
    print(f"Output directory : {output_dir}")
    print(f"Skip download: {inputs['skip_download']}")
    print(f"Max subjects: {inputs['max_subjects']}")
    print(f"Graph name: {inputs['graph_name']}")
    print("Supplemental pattern: pel_supplemental_*.ttl (in input/)")
    print("\nOutput files:")
    print(f"  Raw:          {output_dir / inputs['output_ttl']}")
    print(f"  Sorted:       {output_dir / inputs['sorted_ttl']}")
    print(f"  Consolidated: {output_dir / inputs['consolidated_ttl']}")
    print()

    # Give the user one last chance to cancel before committing to a long run.
    confirm = input("Start pipeline? (y/n): ").strip().lower()
    if confirm != 'y':
        print("Pipeline cancelled.")
        sys.exit(0)  # Exit code 0 = clean exit, not an error

    # ── Begin execution ───────────────────────────────────────────────────────
    print("\n" + "="*70)
    print("PIPELINE EXECUTION STARTED")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*70)

    total_start = time.time()   # Clock the entire pipeline for the final summary
    step_times = []             # Accumulate (step_name, duration) pairs for the summary

    # ── Loop through each pipeline step ──────────────────────────────────────
    # enumerate(pipeline, 1) gives us (1, step1), (2, step2), (3, step3)
    # so i is a 1-based step counter.
    for i, step in enumerate(pipeline, 1):

        # Build the full path to this step's script file.
        script_path = script_dir / step["script"]

        print(f"\n{'='*70}")
        # Derive the step total from the pipeline list so the banner cannot
        # drift out of date if steps are ever added or removed.
        print(f"STEP {i}/{len(pipeline)}: {step['name']}")
        print(f"Script: {step['script']}")
        print(f"{'='*70}\n")

        # Safety check: make sure the script file actually exists before trying
        # to run it. Missing scripts are a common setup error.
        if not script_path.exists():
            print(f"[ERROR] Script not found: {script_path}")
            sys.exit(1)

        # Launch the script and wait for it to complete.
        success, duration = run_script(
            script_path,
            args=step["args"],
            stdin_input=step["stdin"],
            cwd=script_dir          # Child script's working directory
        )

        # Record this step's timing for the final summary.
        step_times.append((step['name'], duration))

        # If the script reported failure, stop the pipeline immediately.
        # There is no point running Step 2 if Step 1 produced bad output.
        if not success:
            print(f"\n[ERROR] Step failed: {step['name']}")
            print("Pipeline aborted.")
            sys.exit(1)

        print(f"\n[COMPLETE] {step['name']} - {format_duration(duration)}")

        # ── Step 1.5: Supplemental TTL append ────────────────────────────────
        # This runs ONLY after Step 1 (i == 1) and BEFORE Step 2.
        # We check here instead of making it a separate pipeline list entry
        # because it is Python code (not a separate .py script to launch).
        if i == 1:
            print(f"\n{'='*70}")
            print("STEP 1.5: Append Supplemental TTL")
            print(f"{'='*70}\n")

            appended, supp_duration = append_supplemental_ttl(
                output_ttl_path, input_dir
            )

            # Choose a label that clearly describes what happened.
            label = (
                "Append Supplemental TTL (found)"
                if appended
                else "Append Supplemental TTL (none found)"
            )
            step_times.append((label, supp_duration))
            print(f"\n[COMPLETE] {label} - {format_duration(supp_duration)}")

    # ── Final summary ─────────────────────────────────────────────────────────
    total_duration = time.time() - total_start

    print("\n" + "="*70)
    print("PIPELINE COMPLETED SUCCESSFULLY")
    print("="*70)
    print(f"\nFinished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("\nStep timings:")
    for name, duration in step_times:
        print(f"  {name}: {format_duration(duration)}")
    print(f"\nTotal time: {format_duration(total_duration)}")
    print(f"\nFinal output: {output_dir / inputs['consolidated_ttl']}")
    print("="*70)


# ── Script entry point ────────────────────────────────────────────────────────
# This block only runs when the file is executed directly (e.g. python rdf_pipeline3.py).
# If another script were to import this file, main() would NOT be called automatically.
if __name__ == "__main__":
    main()
