#!/usr/bin/env python3
"""Retention analyzer: reads sample watch sessions and computes retention curve and average watch time.
Expects CSV columns: sessionId, videoId, userId, watchSeconds, videoDurationSeconds, watchedPercent, timestamp.
Outputs an Excel workbook (a per-video summary sheet plus the raw sessions) and a PNG chart of average watched percent for the top 5 videos.
Usage:
  python retention_analyzer.py --in example_data/sample_watch_sessions.csv --out reports/retention_report.xlsx
"""
import argparse, os, pandas as pd
import matplotlib.pyplot as plt

# Columns the per-video summary reads; checked up front so a malformed input
# fails with one message naming every missing column.
_REQUIRED_COLUMNS = ('sessionId', 'videoId', 'watchSeconds', 'watchedPercent')


def _summarize_sessions(df):
    """Aggregate watch sessions per video, ordered by total watch time (descending)."""
    per_video = df.groupby('videoId').agg(
        sessions=('sessionId', 'nunique'),
        avg_watch_seconds=('watchSeconds', 'mean'),
        avg_watched_percent=('watchedPercent', 'mean'),
        total_watch_seconds=('watchSeconds', 'sum'),
    )
    return per_video.reset_index().sort_values('total_watch_seconds', ascending=False)


def _save_top_videos_chart(summary, png_path, top_n=5):
    """Plot avg watched percent for the first ``top_n`` rows of ``summary`` and save it as a PNG."""
    top = summary.head(top_n)
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        # Cast IDs to str so numeric video IDs are treated as categories in
        # rank order rather than positions on a numeric axis; markers keep a
        # single-video chart visible (a one-point line draws nothing).
        ax.plot(top['videoId'].astype(str), top['avg_watched_percent'], marker='o')
        ax.set_title('Top {} videos - Avg Watched Percent'.format(top_n))
        ax.set_xlabel('videoId')
        ax.set_ylabel('avg watched percent')
        fig.savefig(png_path)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


def main(argv=None):
    """Build the retention report from a CSV of watch sessions.

    Reads the CSV given by ``--in``, writes an Excel workbook (``summary`` and
    ``raw_sessions`` sheets) to ``--out``, and saves a PNG chart of the top 5
    videos by total watch time next to the workbook.

    Args:
        argv: Optional list of command-line arguments. When ``None`` (the
            default), argparse reads ``sys.argv[1:]`` as before.

    Raises:
        KeyError: If the input CSV lacks a column needed for the summary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in', dest='infile', required=True)
    parser.add_argument('--out', dest='outfile', default='retention_report.xlsx')
    args = parser.parse_args(argv)

    df = pd.read_csv(args.infile)
    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        # Fail before any output is created, naming every missing column.
        raise KeyError('{} is missing required column(s): {}'.format(
            args.infile, ', '.join(missing)))

    summary = _summarize_sessions(df)

    # dirname is '' for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(args.outfile) or '.', exist_ok=True)
    with pd.ExcelWriter(args.outfile, engine='openpyxl') as writer:
        summary.to_excel(writer, sheet_name='summary', index=False)
        df.to_excel(writer, sheet_name='raw_sessions', index=False)

    png = os.path.splitext(args.outfile)[0] + '_top5_retention.png'
    _save_top_videos_chart(summary, png)
    print('Wrote report to', args.outfile, 'and chart to', png)

# Script entry point: run the analyzer only when executed directly, not on import.
if __name__ == "__main__":
    main()
