#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Analyze a SARB clean LINREG sweep and render one all-in-one spaghetti figure.

Walks a sweep folder for per-run ``_analysis/sarb_clean_nodes.csv`` files,
maps nodes to hop counts, averages the clean |P95| residual per
(transport, sync_pow, lwin, hop) combination, plots every surviving curve
(hop-8 residual below a threshold) in a single figure, and writes combined
and summary CSVs plus a small text log.
"""

import argparse
import json
import re
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd
import matplotlib

matplotlib.use("Agg")  # headless backend — must be selected before pyplot import
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

# ---- Global plot style (same as the PI script) ----
plt.rcParams.update({
    "font.size": 12,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "legend.fontsize": 10,
})

# ---------- Config ----------
# Default node -> hop-count mapping for the test topology.
NODE_TO_HOP_DEFAULT = {
    "apu00": 0,
    "apu01": 1,
    "apu02": 2,
    "apu03": 3,
    "apu04": 4,
    "apu09": 5,
    "apu14": 6,
    "apu19": 7,
    "apu24": 8,
}
RES_COL = "clean_absP95_ns"  # keep identical
ANALYSIS_SUBPATH = Path("_analysis") / "sarb_clean_nodes.csv"
# ----------------------------


# --- Parsers tuned for LINREG folder names ---
def parse_transport(run_id: str):
    """Return 'E2E' or 'P2P' parsed from a run folder name, else None.

    Examples: ...__BC__BC_E2E_LINREG_E2E_... or ...BC_P2P_LINREG_P2P_...
    """
    # FIX: the old fallback used r"\bE2E\b", but '_' is a word character, so
    # \b never fires between '_' and 'E' — the fallback was dead for the
    # underscore-delimited run names this script targets. Match explicit
    # non-alphanumeric (or string-edge) delimiters instead.
    if "BC_E2E_LINREG" in run_id or re.search(r"(?:^|[^0-9A-Za-z])E2E(?:[^0-9A-Za-z]|$)", run_id):
        return "E2E"
    if "BC_P2P_LINREG" in run_id or re.search(r"(?:^|[^0-9A-Za-z])P2P(?:[^0-9A-Za-z]|$)", run_id):
        return "P2P"
    return None


def parse_sync_pow(run_id: str):
    """Parse the sync-interval exponent from a run name, else None.

    Matches tokens like ``_sync0_``, ``_sync1_``, ``_syncm3_`` (the leading
    'm' means minus): sync1 -> 1, syncm3 -> -3.
    """
    # FIX: was r"_sync(m?\d)_", which only accepted a single digit and
    # silently returned None for e.g. _sync10_ or _syncm12_.
    m = re.search(r"_sync(m?\d+)_", run_id)
    if not m:
        return None
    token = m.group(1)
    return -int(token[1:]) if token.startswith("m") else int(token)


def parse_lwin(run_id: str):
    """Parse the linreg window length (e.g. lwin64, lwin128) from a run name, else None."""
    m = re.search(r"lwin(\d+)", run_id)
    return int(m.group(1)) if m else None


def path_has_excluded(path: Path, excluded_names: set) -> bool:
    """True when any component of *path* equals an excluded folder name (case-insensitive)."""
    lowered = {p.lower() for p in path.parts}
    return any(name.lower() in lowered for name in excluded_names)


def load_all_sarb_clean(sweep_dir: Path, exclude_dirs: set) -> pd.DataFrame:
    """Collect every sarb_clean_nodes.csv under *sweep_dir* into one DataFrame.

    Each row is tagged with ``run_id``, ``transport``, ``sync_pow`` and
    ``lwin`` parsed from the run folder name. Files inside excluded folders
    or lacking the required columns are skipped with a warning on stderr.

    Raises:
        RuntimeError: when no usable CSV is found after exclusions.
    """
    records = []
    for csv_path in sweep_dir.rglob(ANALYSIS_SUBPATH.as_posix()):
        if path_has_excluded(csv_path, exclude_dirs) or path_has_excluded(csv_path.parent, exclude_dirs):
            continue
        run_dir = csv_path.parent.parent  # .../<run_id>/_analysis/sarb_clean_nodes.csv
        run_id = run_dir.name
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"[WARN] Failed to read {csv_path}: {e}", file=sys.stderr)
            continue
        if "node" not in df.columns or RES_COL not in df.columns:
            print(f"[WARN] Missing 'node' or '{RES_COL}' in {csv_path}, skipping.", file=sys.stderr)
            continue
        df["run_id"] = run_id
        df["transport"] = parse_transport(run_id)
        df["sync_pow"] = parse_sync_pow(run_id)
        df["lwin"] = parse_lwin(run_id)
        records.append(df)
    if not records:
        raise RuntimeError("No sarb_clean_nodes.csv files found (after exclusions).")
    return pd.concat(records, ignore_index=True)


def map_nodes_to_hops(df: pd.DataFrame, node_to_hop: dict) -> pd.DataFrame:
    """Attach an integer ``hop`` column via *node_to_hop*; drop unmapped nodes."""
    df = df.copy()
    df["hop"] = df["node"].map(node_to_hop)
    df = df[df["hop"].notna()].copy()
    df["hop"] = df["hop"].astype(int)
    return df


def spaghetti_all_linreg(out_dir: Path, avg_df: pd.DataFrame, threshold_ns: int):
    """Plot every surviving (transport, sync_pow, lwin) curve in one figure.

    A curve survives when its averaged hop-8 residual is <= *threshold_ns*.
    Saves ``spaghetti_ALL_LINREG.png`` into *out_dir*.

    Returns:
        (best_key, best_val): the (transport, sync_pow, lwin) tuple and hop-8
        residual of the minimal survivor, or (None, None) when nothing survives.
    """
    fig = plt.figure(figsize=(10, 6))

    # All parameter combos across transport × sync_pow × lwin.
    # Keep only survivors with hop-8 residual <= threshold.
    kept_curves = []
    last = avg_df[avg_df["hop"] == 8].copy()
    for (transport, sync_pow, lwin), g in avg_df.groupby(["transport", "sync_pow", "lwin"]):
        if transport is None or sync_pow is None or lwin is None:
            continue
        hop8 = last[(last["transport"] == transport)
                    & (last["sync_pow"] == sync_pow)
                    & (last["lwin"] == lwin)]
        if hop8.empty or pd.isna(hop8[RES_COL].iloc[0]):
            continue
        hop8_val = float(hop8[RES_COL].iloc[0])
        if hop8_val > threshold_ns:
            continue
        kept_curves.append(((transport, int(sync_pow), int(lwin)), hop8_val, g.sort_values("hop")))

    if not kept_curves:
        # Nothing survived: emit a placeholder figure so the output file always exists.
        plt.text(0.5, 0.5, f"No curves ≤ {threshold_ns} ns at hop 8",
                 ha="center", va="center", fontsize=12)
        plt.title("LINREG — SARB Clean P95")
        plt.axis("off")
        fig.savefig(out_dir / "spaghetti_ALL_LINREG.png", dpi=200, bbox_inches="tight")
        plt.close(fig)
        return None, None

    # Best = minimal hop8 residual among survivors
    best_key, best_val, _ = min(kept_curves, key=lambda t: t[1])

    kept_count = 0
    for (transport, sync_pow, lwin), hop8_val, g_sorted in kept_curves:
        kept_count += 1
        tau_sec = 2.0 ** float(sync_pow)  # sync interval in seconds (2^sync_pow)
        label = f"{transport}, $\\tau_{{sync}}$={tau_sec:g} s @8={hop8_val:.0f} ns"
        if (transport, sync_pow, lwin) == best_key:
            # Highlight the best survivor with a thick dotted line.
            plt.plot(g_sorted["hop"], g_sorted[RES_COL],
                     linestyle=":", linewidth=3, label=f"BEST {label}")
        else:
            plt.plot(g_sorted["hop"], g_sorted[RES_COL], alpha=0.9, label=label)

    plt.xlabel("Hop count (1–8)")
    plt.ylabel("Clean Abs P95 (ns)")
    plt.grid(True)
    ncol = 2 if kept_count > 10 else 1  # two legend columns once it gets crowded
    plt.legend(ncol=ncol, framealpha=0.9)
    plt.gca().yaxis.set_major_locator(MultipleLocator(250))  # match the PI script's tick density
    fig.savefig(out_dir / "spaghetti_ALL_LINREG.png", dpi=200, bbox_inches="tight")
    plt.close(fig)
    return best_key, best_val


def main():
    parser = argparse.ArgumentParser(description="Analyze SARB clean LINREG sweep (all-in-one figure).")
    parser.add_argument("sweep_dir", type=Path, help="Path to the unzipped sweep folder.")
    parser.add_argument("--threshold-ns", type=int, default=10_000,
                        help="Residual threshold at hop 8 (ns) for survivor curves. Default 10000 (10 µs).")
    parser.add_argument("--hop-map-json", type=Path, default=None,
                        help="Optional path to a JSON mapping node->hop.")
    parser.add_argument("--exclude-dir", action="append", default=["unused"],
                        help="Folder name to exclude (can be given multiple times). Default: unused")
    args = parser.parse_args()

    sweep_dir = args.sweep_dir.resolve()
    if not sweep_dir.exists():
        print(f"ERROR: sweep_dir '{sweep_dir}' does not exist.", file=sys.stderr)
        sys.exit(1)

    out_dir = sweep_dir / "sweep_analysis"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Hop map: default topology unless a JSON override is given and readable.
    node_to_hop = NODE_TO_HOP_DEFAULT
    if args.hop_map_json:
        try:
            node_to_hop = json.loads(Path(args.hop_map_json).read_text())
        except Exception as e:
            print(f"[WARN] Failed to read hop map JSON, using default mapping: {e}", file=sys.stderr)

    exclude_set = set(args.exclude_dir or [])

    # Logging
    log_path = out_dir / "analysis_log.txt"
    with log_path.open("w") as log:
        log.write(f"Analysis started: {datetime.now().isoformat()}\n")
        log.write(f"Sweep dir: {sweep_dir}\n")
        log.write(f"Threshold (ns): {args.threshold_ns}\n")
        log.write(f"Hop map: {node_to_hop}\n")
        log.write(f"Excluded dirs: {sorted(exclude_set)}\n")

    # 1) Load & filter
    df = load_all_sarb_clean(sweep_dir, exclude_set)

    # 2) Hop mapping (and drop hop 0)
    df = map_nodes_to_hops(df, node_to_hop)
    df = df[df["hop"].between(1, 8)]

    # 3) Persist combined rows (for traceability)
    combined_csv = out_dir / "combined_sarb_clean_linreg.csv"
    df.to_csv(combined_csv, index=False)

    # 4) Average per (transport, sync_pow, lwin, hop)
    avg = (
        df.groupby(["transport", "sync_pow", "lwin", "hop"], dropna=False)[RES_COL]
        .mean()
        .reset_index()
    )

    # 5) Single-figure spaghetti & summary
    best_key, best_val = spaghetti_all_linreg(out_dir, avg, args.threshold_ns)

    # Build a compact summary (all survivors + best)
    survivors_rows = []
    last = avg[avg["hop"] == 8]
    for (transport, sync_pow, lwin), g in avg.groupby(["transport", "sync_pow", "lwin"]):
        hop8 = last[(last["transport"] == transport)
                    & (last["sync_pow"] == sync_pow)
                    & (last["lwin"] == lwin)]
        if hop8.empty or pd.isna(hop8[RES_COL].iloc[0]):
            continue
        hop8_val = float(hop8[RES_COL].iloc[0])
        # FIX: the old is_best expression called int(sync_pow)/int(lwin)
        # unguarded, which raises on NaN keys; guard with pd.notna like the
        # adjacent fields.
        key = ((transport, int(sync_pow), int(lwin))
               if pd.notna(sync_pow) and pd.notna(lwin) else None)
        survivors_rows.append({
            "transport": transport,
            "sync_pow": int(sync_pow) if pd.notna(sync_pow) else None,
            "lwin": int(lwin) if pd.notna(lwin) else None,
            "hop8_p95_ns": hop8_val,
            "survives_threshold": hop8_val <= args.threshold_ns,
            "is_best": best_key is not None and key == best_key,
        })
    summary_df = pd.DataFrame(sorted(survivors_rows, key=lambda r: r["hop8_p95_ns"]))
    summary_csv = out_dir / "linreg_all_in_one_summary.csv"
    summary_df.to_csv(summary_csv, index=False)

    with log_path.open("a") as log:
        log.write(f"Analysis finished: {datetime.now().isoformat()}\n")
        log.write(f"Wrote: {combined_csv}\n")
        log.write(f"Wrote: {summary_csv}\n")
        log.write(f"Wrote: {out_dir / 'spaghetti_ALL_LINREG.png'}\n")

    print(f"✓ LINREG analysis complete. Results in: {out_dir}")
    print(f" - combined CSV: {combined_csv}")
    print(f" - summary CSV: {summary_csv}")
    print(f" - plot: spaghetti_ALL_LINREG.png")
    print(f" - log: {log_path}")


if __name__ == "__main__":
    main()