# -*- coding: utf-8 -*-
"""
search_xpattern_lift_due.py
===========================

What this script does
---------------------
We search for x-column patterns (0/1 patterns over 5 or 6 x-columns) that score well on three criteria (see the worked score example below the list):

1) HEAVY-LIFT:
   They shrink the space (tot_df) a lot, but keep the history (hist_df) alive
   in a better-than-random way. We quantify that with ratio_of_ratios.

2) DUE:
   The filtered history subset looks "late" compared to its own interval history.
   We quantify that with pct_score from delay_percentile_summary().

3) NOT TOO TINY:
   We add a penalty if the filtered history subset has too few hits (support).
   Without this, tiny subsets can look amazing by accident.
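
Worked score example (illustrative numbers, not taken from real data):
   ratio_of_ratios = 1.8, pct_score = 92, hit_count = 60
   support_factor(60) = 1 - exp(-60 / 35) ~ 0.82
   score = 1.8 * (92 / 100) * 0.82 ~ 1.36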

Outputs
-------
- prints top candidates (as a DataFrame)
- saves:
    data/xpattern_candidates_top.csv
    data/xpattern_blog_snippet.md   (markdown table of the best patterns)

Requirements
------------
- data/hist_df.csv
- data/tot_df_dynamic_basic.parquet
- nextnumbers_toolkit_v2.py must be importable (same folder or on PYTHONPATH)
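- tabulate must be installed (pandas needs it for DataFrame.to_markdown in the report step)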

Run
---
python search_xpattern_lift_due.py
"""

from __future__ import annotations

from pathlib import Path
from itertools import combinations, product
from typing import List, Sequence, Tuple, Dict, Any, Optional
import ast

import numpy as np
import pandas as pd

from nextnumbers_toolkit_v2 import delay_percentile_summary


# ---------------------------------------------------------------------
# Config (tweak these)
# ---------------------------------------------------------------------

PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / "data"

HIST_PATH = DATA_DIR / "hist_df.csv"
TOT_PATH  = DATA_DIR / "tot_df_dynamic_basic.parquet"

# Search space
K_VALUES = (5, 6)              # test 5-col and 6-col patterns
PATTERN_VALUES = (0, 1)        # binary patterns only

# Sampling controls (important, because filtering tot_df repeatedly can be slow)
MAX_COMBOS_PER_K = 2500        # how many column-combos to test per k (sampled)
SEED = 123
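# Rough sense of scale (assuming, say, 20 x-columns): C(20, 6) = 38,760 column combos
# times 2**6 = 64 patterns is ~2.5M candidate filters for k=6 alone, which is why we
# sample column combos instead of testing them all.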

# Pruning controls
MIN_HIST_HITS = 10             # skip patterns that barely appear in history
MIN_SPACE_ROWS = 5000          # skip patterns that leave tot_df too tiny

# Support penalty controls
MIN_SUPPORT_HITS = 30          # below this, we penalize hard
SUPPORT_TAU = 35.0             # smoothness for the support factor

# Output
TOP_N = 20
OUT_CSV = DATA_DIR / "xpattern_candidates_top.csv"
OUT_MD  = DATA_DIR / "xpattern_blog_snippet.md"


# ---------------------------------------------------------------------
# Lockstep filter: columns + exact pattern
# ---------------------------------------------------------------------

def filter_space_and_history_on_pattern(
    df_space: pd.DataFrame,
    df_hist: pd.DataFrame,
    cols: Sequence[str],
    pattern: Sequence[int],
    *,
    coerce_numeric: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any]]:
    """
    Filter BOTH dfs so df[cols] matches pattern exactly.

    Example:
      cols    = ["x7","x9","x12","x17","x19"]
      pattern = [1,1,1,0,0]
    """
    cols = list(cols)
    pattern = list(pattern)

    if len(cols) == 0:
        raise ValueError("cols must not be empty")
    if len(cols) != len(pattern):
        raise ValueError("cols and pattern must have same length")

    # Ensure columns exist
    miss_space = [c for c in cols if c not in df_space.columns]
    miss_hist  = [c for c in cols if c not in df_hist.columns]
    if miss_space:
        raise KeyError(f"Missing in df_space: {miss_space}")
    if miss_hist:
        raise KeyError(f"Missing in df_hist: {miss_hist}")

    def _mask(df: pd.DataFrame) -> np.ndarray:
        X = df.loc[:, cols]
        if coerce_numeric:
            X = X.apply(pd.to_numeric, errors="coerce")
        m = np.ones(len(df), dtype=bool)
        for c, v in zip(cols, pattern):
            m &= (X[c].to_numpy() == v)
        return m

    m_space = _mask(df_space)
    m_hist = _mask(df_hist)

    df_space_f = df_space.loc[m_space].copy()
    df_hist_f = df_hist.loc[m_hist].copy()

    # Reduction metrics
    len_space_f = len(df_space_f)
    len_hist_f = len(df_hist_f)
    len_space_0 = len(df_space)
    len_hist_0 = len(df_hist)

    df_ratio = (len_space_f / len_space_0) if len_space_0 else 0.0
    df_n_ratio = (len_hist_f / len_hist_0) if len_hist_0 else 0.0
    ratio_of_ratios = (df_n_ratio / df_ratio) if df_ratio else 0.0
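    # Illustrative reading (hypothetical numbers): if the filter keeps 2% of the space
    # but 5% of the history, ratio_of_ratios = 0.05 / 0.02 = 2.5, i.e. the pattern
    # retains history much better than uniform shrinking would.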

    metrics = {
        "df_ratio": float(df_ratio),
        "df_n_ratio": float(df_n_ratio),
        "ratio_of_ratios": float(ratio_of_ratios),
        "space_filtered": int(len_space_f),
        "space_original": int(len_space_0),
        "hist_filtered": int(len_hist_f),
        "hist_original": int(len_hist_0),
        "cols": cols,
        "pattern": pattern,
    }

    return df_space_f, df_hist_f, metrics


# ---------------------------------------------------------------------
# Due summary (delaypercent)
# ---------------------------------------------------------------------

def calculate_delaypercent(df_subset: pd.DataFrame, df_full: pd.DataFrame, label: str = "filter") -> pd.DataFrame:
    """Small wrapper for cleaner naming."""
    return delay_percentile_summary(df_subset, df_full, label=label)


# ---------------------------------------------------------------------
# Score: heavy lift + due + support penalty
# ---------------------------------------------------------------------

def support_factor(hit_count: int, *, min_support_hits: int = MIN_SUPPORT_HITS, tau: float = SUPPORT_TAU) -> float:
    """
    Smooth penalty for tiny subsets.

    - For small hit_count, factor is near 0.
    - As hit_count grows, factor approaches 1.
    - Below min_support_hits, we also clamp down.
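
    Rough values with the defaults min_support_hits=30, tau=35.0 (rounded, for intuition only):
      support_factor(5)   ~ 0.02   (smooth * clamp)
      support_factor(30)  ~ 0.58
      support_factor(100) ~ 0.94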
    """
    hc = float(max(0, hit_count))
    smooth = 1.0 - float(np.exp(-hc / float(tau)))

    if hc < min_support_hits:
        clamp = hc / float(min_support_hits)
        return smooth * clamp

    return smooth


def heavy_lift_due_score(metrics: Dict[str, Any], delay_row: pd.Series) -> float:
    """
    score = ratio_of_ratios * (pct_score/100) * support_factor(hit_count)
    """
    lift_part = float(metrics.get("ratio_of_ratios", 0.0))
    pct = float(delay_row.get("pct_score", np.nan))
    hit_count = int(delay_row.get("hit_count", 0))

    if not np.isfinite(pct) or hit_count <= 0:
        return 0.0

    due_part = pct / 100.0
    sup_part = support_factor(hit_count)

    return lift_part * due_part * sup_part


# ---------------------------------------------------------------------
# Search helpers
# ---------------------------------------------------------------------

def detect_x_cols(hist: pd.DataFrame) -> List[str]:
    x_cols = [c for c in hist.columns if c.startswith("x") and c[1:].isdigit()]
    return sorted(x_cols, key=lambda s: int(s[1:]))


def sample_combos(all_combos: List[Tuple[str, ...]], n: Optional[int], rng: np.random.Generator) -> List[Tuple[str, ...]]:
    if n is None or n >= len(all_combos):
        return all_combos
    idx = rng.choice(len(all_combos), size=int(n), replace=False)
    return [all_combos[i] for i in idx]


def search_best_x_patterns(
    tot: pd.DataFrame,
    hist: pd.DataFrame,
    *,
    x_cols: Sequence[str],
) -> pd.DataFrame:
    rng = np.random.default_rng(SEED)
    results: List[Dict[str, Any]] = []

    x_cols = list(x_cols)

    for k in K_VALUES:
        all_combos = list(combinations(x_cols, k))
        combos_to_test = sample_combos(all_combos, MAX_COMBOS_PER_K, rng)

        for cols in combos_to_test:
            for patt in product(PATTERN_VALUES, repeat=k):
                # hist-only prune (cheap)
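                # (Adding more col == value constraints can only shrink the subset,
                #  so once a prefix already falls below MIN_HIST_HITS we can skip the
                #  more expensive lockstep filter on tot_df entirely.)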
                hist_tmp = hist
                ok = True
                for c, v in zip(cols, patt):
                    hist_tmp = hist_tmp[hist_tmp[c] == v]
                    if len(hist_tmp) < MIN_HIST_HITS:
                        ok = False
                        break
                if not ok:
                    continue

                tot_f, hist_f, metrics = filter_space_and_history_on_pattern(tot, hist, cols, patt)

                if len(hist_f) < MIN_HIST_HITS:
                    continue
                if len(tot_f) < MIN_SPACE_ROWS:
                    continue

                delay_df = calculate_delaypercent(hist_f, hist, label=f"{cols}:{patt}")
                delay_row = delay_df.iloc[0]
                score = heavy_lift_due_score(metrics, delay_row)

                results.append({
                    "score": float(score),
                    "k": int(k),
                    "cols": tuple(cols),
                    "pattern": tuple(patt),
                    "space_rows": int(metrics["space_filtered"]),
                    "hist_rows": int(metrics["hist_filtered"]),
                    "df_ratio": float(metrics["df_ratio"]),
                    "df_n_ratio": float(metrics["df_n_ratio"]),
                    "ratio_of_ratios": float(metrics["ratio_of_ratios"]),
                    "hit_count": int(delay_row["hit_count"]),
                    "draws_since_last": int(delay_row["draws_since_last"]),
                    "pct_score": float(delay_row["pct_score"]),
                    "P90": float(delay_row["P90"]),
                    "P95": float(delay_row["P95"]),
                    "P99": float(delay_row["P99"]),
                    "support_factor": float(support_factor(int(delay_row["hit_count"]))),
                })

    out = pd.DataFrame(results)
    if out.empty:
        return out
    out = out.sort_values("score", ascending=False).reset_index(drop=True)
    return out.head(TOP_N)


# ---------------------------------------------------------------------
# CSV -> usable tuples (because pandas reads tuples as strings)
# ---------------------------------------------------------------------

def _parse_tuple_cell(x: Any) -> Tuple[Any, ...]:
    """
    Convert "('x1','x2')" or "('x1', 'x2')" back to a tuple safely.
    If it's already a tuple/list, return tuple(x).
    """
    if isinstance(x, tuple):
        return x
    if isinstance(x, list):
        return tuple(x)
    if isinstance(x, str):
        s = x.strip()
        try:
            val = ast.literal_eval(s)
            if isinstance(val, (tuple, list)):
                return tuple(val)
        except Exception:
            pass
    raise ValueError(f"Could not parse tuple cell: {x!r}")


def load_candidates_csv(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path)
    # cols/pattern were saved as tuples -> strings in CSV
    if "cols" in df.columns:
        df["cols"] = df["cols"].apply(_parse_tuple_cell)
    if "pattern" in df.columns:
        df["pattern"] = df["pattern"].apply(_parse_tuple_cell)
    return df


# ---------------------------------------------------------------------
# Markdown report (patterns only — no number ranking)
# ---------------------------------------------------------------------

def candidates_markdown_report(df_top: pd.DataFrame, *, title: str, top_n: int = 20) -> str:
    """
    Build a markdown report from the already-saved candidates CSV.
    Focus: patterns + scores + due/lift/support columns.
    """
    if df_top.empty:
        return f"# {title}\n\nNo candidates found.\n"

    show = df_top.head(top_n).copy()

    # Make cols/pattern readable strings
    show["cols"] = show["cols"].apply(lambda t: "[" + ", ".join(map(str, t)) + "]")
    show["pattern"] = show["pattern"].apply(lambda t: "[" + ", ".join(map(str, t)) + "]")

    # Column order for the blog table
    keep_cols = [
        "score", "k", "cols", "pattern",
        "ratio_of_ratios", "df_ratio", "df_n_ratio",
        "space_rows", "hist_rows",
        "pct_score", "draws_since_last", "hit_count",
        "support_factor",
        "P90", "P95", "P99",
    ]
    keep_cols = [c for c in keep_cols if c in show.columns]
    show = show[keep_cols]

    lines: List[str] = []
    lines.append(f"# {title}\n")
    lines.append("## Best x-pattern candidates (ranked)\n")
    lines.append(show.to_markdown(index=False))
    lines.append("")
    lines.append("## Notes\n")
    lines.append("- This ranks pattern-filters, not outcomes.")
    lines.append("- `ratio_of_ratios` > 1 means the filter keeps history alive better than random shrinking.")
    lines.append("- `pct_score` close to 100 means the subset looks late relative to its own gaps.")
    lines.append("- `support_factor` pushes down tiny-hit patterns so we don’t get hypnotized by noise.")
    return "\n".join(lines)


# ---------------------------------------------------------------------
# Best-candidate filter function
# ---------------------------------------------------------------------

def filter_with_best_candidate(
    tot: pd.DataFrame,
    hist: pd.DataFrame,
    *,
    candidates_csv_path: Path = OUT_CSV,
    k: Optional[int] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any], pd.Series]:
    """
    Load xpattern_candidates_top.csv, pick the best candidate (optionally force k=5 or k=6),
    and return (tot_filtered, hist_filtered, metrics, best_row).

    k=None  -> best overall
    k=5     -> best among 5-col patterns
    k=6     -> best among 6-col patterns
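
    Sketch of programmatic use (assumes tot/hist are already loaded, e.g. in a notebook):
        tot_f, hist_f, metrics, best = filter_with_best_candidate(tot, hist, k=6)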
    """
    if not candidates_csv_path.exists():
        raise FileNotFoundError(f"Candidates CSV not found: {candidates_csv_path}")

    df = load_candidates_csv(candidates_csv_path)
    if df.empty:
        raise RuntimeError("Candidates CSV is empty.")

    if k is not None:
        df = df[df["k"] == int(k)].copy()
        if df.empty:
            raise RuntimeError(f"No candidates for k={k} in {candidates_csv_path.name}")

    df = df.sort_values("score", ascending=False).reset_index(drop=True)
    best = df.iloc[0]

    cols = list(best["cols"])
    patt = list(best["pattern"])

    tot_f, hist_f, metrics = filter_space_and_history_on_pattern(tot, hist, cols, patt)
    return tot_f, hist_f, metrics, best


# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------

def main() -> None:
    print("Loading:", HIST_PATH)
    hist = pd.read_csv(HIST_PATH)

    print("Loading:", TOT_PATH)
    tot = pd.read_parquet(TOT_PATH)

    x_cols = detect_x_cols(hist)
    if len(x_cols) < 10:
        raise RuntimeError(f"Not enough x-columns found. Found: {len(x_cols)}")

    print("x-columns detected:", len(x_cols))

    top = search_best_x_patterns(tot, hist, x_cols=x_cols)

    if top.empty:
        print("No candidates found. Try lowering MIN_HIST_HITS / MIN_SPACE_ROWS / MAX_COMBOS_PER_K.")
        return

    print("\n=== Top candidates ===")
    print(top)

    # Save CSV
    OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    top.to_csv(OUT_CSV, index=False)
    print("\nSaved:", OUT_CSV)

    # IMPORTANT: the markdown report is rebuilt from the saved CSV, not from the in-memory DataFrame
    df_top = load_candidates_csv(OUT_CSV)
    md = candidates_markdown_report(
        df_top,
        title="Heavy-lift x-pattern filters (ranked search)",
        top_n=TOP_N,
    )
    OUT_MD.write_text(md, encoding="utf-8")
    print("Saved:", OUT_MD)

    # Example: filter with the best overall candidate
    tot_best, hist_best, metrics_best, best_row = filter_with_best_candidate(tot, hist, candidates_csv_path=OUT_CSV, k=None)
    print("\n=== Best candidate (applied) ===")
    print("k:", int(best_row["k"]))
    print("cols:", list(best_row["cols"]))
    print("pattern:", list(best_row["pattern"]))
    print("space rows:", metrics_best["space_filtered"], "/", metrics_best["space_original"])
    print("hist rows:", metrics_best["hist_filtered"], "/", metrics_best["hist_original"])
    print("ratio_of_ratios:", f"{metrics_best['ratio_of_ratios']:.4f}")

    # If you want the best k=5 OR best k=6 specifically, call:
    #   filter_with_best_candidate(tot, hist, candidates_csv_path=OUT_CSV, k=5)
    #   filter_with_best_candidate(tot, hist, candidates_csv_path=OUT_CSV, k=6)


if __name__ == "__main__":
    main()
