# -*- coding: utf-8 -*-
"""
backtest_percentile_buckets.py
------------------------------------------------------------
Walk-forward backtest for percentile-gap ranking + bucket tests.

We:
  1) Rank values (e.g. 1..50) using calculate_percentiles_multi_columns()
  2) Split the ranked list into:
        A) 3 buckets (16 / 17 / 17 for N=50)
        B) 2 buckets (25 / 25)
        C) middle 68% bucket (34 numbers) = drop top 8 + bottom 8
  3) For each time step t, build the ranking from history up to and
     including draw t, then count how many of the NEXT draw's numbers
     fall into each bucket.

Outputs:
  - backtest_bucket_hits_per_draw.csv
  - backtest_bucket_summary.csv
"""

from __future__ import annotations

import os
import numpy as np
import pandas as pd

# Try SciPy's percentileofscore; fall back to a simple 'weak'
# definition if SciPy is missing.
try:
    from scipy import stats as _scipy_stats
    HAS_SCIPY = True
except Exception:
    HAS_SCIPY = False


# -----------------------------
# CONFIG (edit these)
# -----------------------------
HIST_CSV = r"data/historical_draws.csv"   # <- change if needed
OUT_DIR  = r"data"                        # <- where to save csv outputs

FEATURE_COLS = ["st1", "st2", "st3", "st4", "st5"]  # can be D1..D4, k1..k5, K1..K5 too
DOMAIN_VALUES = list(range(1, 51))  # for st1..st5 use 1..50. For D1..D4 maybe 1..49, etc.

BACKTEST_START = 50   # start testing at draw index 50 (0-based)
PRINT_EVERY = 50      # progress print every N steps
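
# Example alternate config (hypothetical -- swap these in if your file uses
# D1..D4 over a 1..49 domain, as the comments above suggest):
# FEATURE_COLS = ["D1", "D2", "D3", "D4"]
# DOMAIN_VALUES = list(range(1, 50))   # 1..49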


# -----------------------------
# Core function (safe version)
# -----------------------------
def calculate_percentiles_multi_columns(data_frame: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """
    Flatten values across multiple columns and rank them by a spacing-based score.

    For each unique value v across the chosen columns:
      - Find the row positions where v appears (in any of the columns).
      - Build the gaps between hits:
            [first_hit_pos] + [diff between hits] + [tail gap to end]
      - Compute:
            count (co)  = number of appearances of v (= len(gaps) - 1)
            delay       = tail gap (rows from the last hit through the end
                          of history; 1 = v hit in the most recent draw)
            percentiles = median, P75, P90, P95, P99, max of the gap list
            Pct_score   = percentile rank of 'delay' within its own gap list
      - Norm = 100 * co / sum of co over all values (share of total hits)
      - Prod = Norm * Pct_score
    Higher Prod ranks higher.
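
    Worked example (hypothetical): if v appears at row positions [2, 5] in a
    10-row history, gaps = [2, 3, 5], co = 2, delay = 5. The tail gap is the
    largest gap on record here, so Pct_score = 100.0 under both the SciPy
    and the fallback definitions.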

    IMPORTANT:
      We use row POSITIONS (0..n-1), not df.index values,
      so slicing / resetting index won’t break the gap math.
    """
    if not all(c in data_frame.columns for c in columns):
        missing = [c for c in columns if c not in data_frame.columns]
        raise ValueError(f"Columns {missing} not found in DataFrame")

    if data_frame.empty:
        return pd.DataFrame(columns=["value","co","delay","median","P75","P90","P95","P99","max","Pct_score","Norm","Prod"])

    # Unique values seen in these columns
    unique_values = pd.unique(data_frame[columns].values.ravel("K"))

    n = len(data_frame)
    rows = []

    # Pre-extract the block for speed
    block = data_frame[columns]

    for v in unique_values:
        if pd.isna(v):
            continue  # skip missing cells (NaN / pd.NA from nullable columns)

        # row positions where v appears in ANY of the columns
        mask = block.isin([v]).any(axis=1).to_numpy()
        idx = np.flatnonzero(mask)  # 0..n-1

        if idx.size == 0:
            continue

        gaps = [int(idx[0])]
        gaps += [int(idx[i] - idx[i - 1]) for i in range(1, len(idx))]
        gaps.append(int(n - idx[-1]))

        # Defensive: with a non-empty idx, gaps always has at least two
        # entries (leading gap + tail gap), so this should never trigger.
        if len(gaps) <= 1:
            continue

        gaps_arr = np.asarray(gaps, dtype=float)
        delay = gaps[-1]

        if HAS_SCIPY:
            pct_score = float(_scipy_stats.percentileofscore(gaps_arr, delay))
        else:
            # Fallback: 'weak' percentile (% of gaps <= delay). SciPy's
            # default kind='rank' averages tied ranks, so the two code
            # paths can differ slightly when delay ties other gaps.
            pct_score = 100.0 * float(np.mean(gaps_arr <= delay))

        rows.append({
            "value": int(v) if pd.notna(v) else v,
            "co": len(gaps) - 1,
            "delay": int(delay),
            "median": float(np.percentile(gaps_arr, 50)),
            "P75": float(np.percentile(gaps_arr, 75)),
            "P90": float(np.percentile(gaps_arr, 90)),
            "P95": float(np.percentile(gaps_arr, 95)),
            "P99": float(np.percentile(gaps_arr, 99)),
            "max": float(np.max(gaps_arr)),
            "Pct_score": float(pct_score),
        })

    if not rows:
        return pd.DataFrame(columns=["value","co","delay","median","P75","P90","P95","P99","max","Pct_score","Norm","Prod"])

    results = pd.DataFrame(rows)
    results["Norm"] = (100.0 * results["co"] / results["co"].sum()).round(2)
    results["Prod"] = (results["Norm"] * results["Pct_score"]).round(4)
    results = results.sort_values("Prod", ascending=False).reset_index(drop=True)
    return results


# -----------------------------
# Bucketing helpers
# -----------------------------
def bucket_sizes_three(n: int) -> tuple[int, int, int]:
    # Split n into three near-equal buckets, giving the remainder to the
    # later buckets: n=50 -> (16, 17, 17). Swap a/b etc. if you prefer the
    # larger buckets elsewhere.
    base = n // 3
    rem = n % 3
    a = base
    b = base + (1 if rem >= 2 else 0)
    c = base + (1 if rem >= 1 else 0)
    return (a, b, c)
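
# Quick check (hypothetical interpreter session):
#   >>> bucket_sizes_three(50)
#   (16, 17, 17)
#   >>> bucket_sizes_three(49)
#   (16, 16, 17)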

def make_ranked_list(table: pd.DataFrame, domain_values: list[int]) -> list[int]:
    """
    Turn the ranking table into a full ranked list over the given domain.
    Values outside the domain are dropped (downstream slicing assumes the
    list has exactly len(domain_values) entries); domain values missing
    from the table are appended at the end, i.e. treated as lowest-ranked.
    """
    domain_set = {int(v) for v in domain_values}
    ranked: list[int] = []
    if "value" in table.columns and len(table):
        ranked = [int(v) for v in table["value"] if int(v) in domain_set]
    ranked_set = set(ranked)
    missing = [int(v) for v in domain_values if int(v) not in ranked_set]
    return ranked + missing
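
# Example (hypothetical): if the ranking table orders [7, 3, 42] and the
# domain is 1..50, the result starts [7, 3, 42] and appends the other 47
# domain values in ascending order -> a full 50-entry ranked list.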

def hits_in_group(draw_numbers: list[int], group: set[int]) -> int:
    return int(sum(1 for x in draw_numbers if x in group))


# -----------------------------
# Backtest runner
# -----------------------------
def run_backtest():
    os.makedirs(OUT_DIR, exist_ok=True)

    df = pd.read_csv(HIST_CSV)

    # ensure numeric (nullable Int64 keeps missing cells as <NA>)
    for col in FEATURE_COLS:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    n_total = len(df)
    if n_total < BACKTEST_START + 2:
        raise ValueError(f"Not enough rows ({n_total}) to start at BACKTEST_START={BACKTEST_START}")

    n_domain = len(DOMAIN_VALUES)

    # precompute expected-hit baselines under a uniform null: a fixed
    # k-value bucket catches on average n_per_draw * k / n_domain of the
    # next draw's numbers
    n_per_draw = len(FEATURE_COLS)
    a, b, c = bucket_sizes_three(n_domain)
    expected_3 = {
        "G1": n_per_draw * a / n_domain,
        "G2": n_per_draw * b / n_domain,
        "G3": n_per_draw * c / n_domain,
    }
    expected_2 = {
        "H1": n_per_draw * (n_domain // 2) / n_domain,
        "H2": n_per_draw * (n_domain - n_domain // 2) / n_domain,
    }
    # middle 68%: for N=50 -> 34 numbers = drop 8 from each end
    drop = (n_domain - int(round(0.68 * n_domain))) // 2  # for 50: (50-34)//2 = 8
    mid_size = n_domain - 2 * drop
    expected_mid = n_per_draw * mid_size / n_domain
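
    # For the defaults (5 numbers drawn from 1..50) these baselines work out
    # to: G1/G2/G3 = 1.6/1.7/1.7, H1/H2 = 2.5/2.5, mid68 = 3.4, and the
    # 16-value edge bucket = 1.6 expected hits per draw.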

    per_draw_rows = []

    for t in range(BACKTEST_START, n_total - 1):
        hist = df.iloc[:t + 1].reset_index(drop=True)  # history up to and including draw t
        nxt  = df.iloc[t + 1]                          # the draw we try to predict

        next_nums = [int(nxt[col]) for col in FEATURE_COLS if pd.notna(nxt[col])]

        table = calculate_percentiles_multi_columns(hist, FEATURE_COLS)
        ranked = make_ranked_list(table, DOMAIN_VALUES)

        # --- 3 buckets: G1 = top-ranked a values, G2 = next b, G3 = bottom c ---
        g1 = set(ranked[:a])
        g2 = set(ranked[a:a+b])
        g3 = set(ranked[a+b:])

        h_g1 = hits_in_group(next_nums, g1)
        h_g2 = hits_in_group(next_nums, g2)
        h_g3 = hits_in_group(next_nums, g3)

        # --- 2 buckets: top half vs bottom half of the ranking ---
        half = n_domain // 2
        h1 = set(ranked[:half])
        h2 = set(ranked[half:])
        h_h1 = hits_in_group(next_nums, h1)
        h_h2 = hits_in_group(next_nums, h2)

        # --- middle 68% bucket: drop `drop` ranked values from each end ---
        # (for N=50: keep ranked positions 8..41 -> 34 numbers)
        mid = set(ranked[drop: n_domain - drop])
        edge = set(ranked[:drop] + ranked[n_domain - drop:])
        h_mid = hits_in_group(next_nums, mid)
        h_edge = hits_in_group(next_nums, edge)

        per_draw_rows.append({
            "t": t,
            "history_len": t+1,
            "next_draw_index": t+1,
            "next_numbers": "-".join(map(str, sorted(next_nums))),

            "3bucket_G1_hits": h_g1,
            "3bucket_G2_hits": h_g2,
            "3bucket_G3_hits": h_g3,

            "2bucket_H1_hits": h_h1,
            "2bucket_H2_hits": h_h2,

            "mid68_hits": h_mid,
            "edge16_hits": h_edge,

            # handy diagnostics
            "drop_each_side": drop,
            "mid68_size": mid_size,
        })

        if (t - BACKTEST_START) % PRINT_EVERY == 0:
            print(f"[progress] t={t} / {n_total-2}")

    per_draw = pd.DataFrame(per_draw_rows)

    # -----------------------------
    # Summaries
    # -----------------------------
    def summarize_hits(series: pd.Series, expected: float, label: str) -> dict:
        s = series.astype(int)
        return {
            "label": label,
            "n_steps": int(len(s)),
            "avg_hits": float(s.mean()),
            "expected_hits": float(expected),
            "ratio_vs_expected": float(s.mean() / expected) if expected > 0 else np.nan,
            "pct_ge3": float((s >= 3).mean() * 100.0),
            "pct_ge4": float((s >= 4).mean() * 100.0),
            "pct_eq5": float((s == 5).mean() * 100.0),
            "pct_eq0": float((s == 0).mean() * 100.0),
        }
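
    # Interpretation: ratio_vs_expected > 1.0 means the bucket caught more
    # of the next draw's numbers than a random bucket of the same size would
    # on average; pct_ge3 / pct_ge4 / pct_eq5 show how often it caught 3+,
    # 4+, or exactly 5 of them.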

    summary_rows = []
    # 3-bucket
    summary_rows.append(summarize_hits(per_draw["3bucket_G1_hits"], expected_3["G1"], "3bucket_G1"))
    summary_rows.append(summarize_hits(per_draw["3bucket_G2_hits"], expected_3["G2"], "3bucket_G2"))
    summary_rows.append(summarize_hits(per_draw["3bucket_G3_hits"], expected_3["G3"], "3bucket_G3"))
    # 2-bucket
    summary_rows.append(summarize_hits(per_draw["2bucket_H1_hits"], expected_2["H1"], "2bucket_H1"))
    summary_rows.append(summarize_hits(per_draw["2bucket_H2_hits"], expected_2["H2"], "2bucket_H2"))
    # mid 68%
    summary_rows.append(summarize_hits(per_draw["mid68_hits"], expected_mid, "mid68"))
    # edge 16 (complement)
    summary_rows.append(summarize_hits(per_draw["edge16_hits"], 5.0 * (2*drop) / n_domain, "edge16"))

    summary = pd.DataFrame(summary_rows).sort_values(["ratio_vs_expected", "avg_hits"], ascending=False)

    # -----------------------------
    # Save
    # -----------------------------
    per_draw_path = os.path.join(OUT_DIR, "backtest_bucket_hits_per_draw.csv")
    summary_path  = os.path.join(OUT_DIR, "backtest_bucket_summary.csv")

    per_draw.to_csv(per_draw_path, index=False)
    summary.to_csv(summary_path, index=False)

    print("\nSaved:")
    print(" -", per_draw_path)
    print(" -", summary_path)

    print("\nTop summary rows:")
    print(summary.head(10).to_string(index=False))

    return per_draw, summary
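
# Usage (assumption: run from the project root so the relative HIST_CSV and
# OUT_DIR paths resolve):
#
#   python backtest_percentile_buckets.py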


if __name__ == "__main__":
    run_backtest()
