"""
Dry-run: scan each .pptx file in C:\\Users\\admin\\Downloads\\illustrations
and list every embedded image without writing anything to disk.

Output (stdout): a per-PPTX summary table, image counts per extension,
and a sample of the first 8 image rows.
"""
from __future__ import annotations

import re
import sys
import zipfile
from collections import Counter
from pathlib import Path

# Folder whose top-level *.pptx files are scanned.
SRC = Path(r"C:\Users\admin\Downloads\illustrations")

# Archive-internal path prefix an entry must have to be considered an image.
IMAGE_PREFIX = "ppt/media/"

# Lower-cased file suffixes that count as images.
IMAGE_EXTS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".bmp",
    ".tiff",
    ".tif",
    ".webp",
    ".emf",
    ".wmf",
    ".svg",
}


def topic_slug(stem: str) -> str:
    """Convert a file stem into a lowercase, underscore-separated slug.

    Example: 'Public Speaking and Presentation Skills'
    -> 'public_speaking_and_presentation_skills'.

    Every '&' is replaced by the literal text 'and' (no separators are
    added around it). Each remaining run of characters outside a-z / 0-9
    then acts as a single separator, and the slug never starts or ends
    with an underscore.
    """
    lowered = stem.lower().replace("&", "and")
    # Joining the alphanumeric runs with "_" collapses every separator run
    # and drops leading/trailing separators in one step.
    words = re.findall(r"[a-z0-9]+", lowered)
    return "_".join(words)


def scan(pptx_path: Path) -> list[dict]:
    """List the image entries stored inside one .pptx archive.

    The file is opened as a ZIP archive and only its directory listing is
    read; no entry is extracted. An entry counts as an image when its name
    starts with IMAGE_PREFIX and its lower-cased suffix is in IMAGE_EXTS.

    Returns:
        One dict per image, in archive order, with the keys
        ``source_pptx`` (file name), ``topic_slug`` (slug of the file
        stem), ``internal`` (entry name inside the archive), ``ext``
        (lower-cased suffix) and ``size_kb`` (the entry's ``file_size``
        divided by 1024, rounded to one decimal).
    """
    # Per-file values are the same for every row, so compute them once.
    source_name = pptx_path.name
    slug = topic_slug(pptx_path.stem)

    with zipfile.ZipFile(pptx_path) as archive:
        media_entries = [
            (entry, Path(entry.filename).suffix.lower())
            for entry in archive.infolist()
            if entry.filename.startswith(IMAGE_PREFIX)
        ]

    return [
        {
            "source_pptx": source_name,
            "topic_slug":  slug,
            "internal":    entry.filename,
            "ext":         suffix,
            "size_kb":     round(entry.file_size / 1024, 1),
        }
        for entry, suffix in media_entries
        if suffix in IMAGE_EXTS
    ]


def main() -> int:
    """Scan every .pptx in SRC and print an image inventory to stdout.

    Three sections are printed: a per-file summary table, image counts per
    extension across all files, and the first eight image rows as a sample.
    Nothing is written to disk.

    A file that cannot be opened or read as a ZIP archive is reported on
    stderr and skipped, so a single bad file does not abort the whole run.

    Returns:
        Process exit code: 1 when SRC is not an existing directory,
        otherwise 0 (including when no .pptx files are found).
    """
    # is_dir() rather than exists(): a regular file at SRC is not a usable
    # source folder either.
    if not SRC.is_dir():
        print(f"[error] source folder not found: {SRC}", file=sys.stderr)
        return 1

    ppts = sorted(SRC.glob("*.pptx"))
    if not ppts:
        # Diagnostics go to stderr so stdout carries only the report.
        print("[warn] no .pptx files found", file=sys.stderr)
        return 0

    rule = "-" * 88
    all_rows: list[dict] = []

    print("PER-PPTX SUMMARY")
    print(rule)
    print(f"{'topic_slug':<48} {'images':>7} {'ext_mix':<20} {'total_KB':>10}")
    print(rule)
    for p in ppts:
        try:
            rows = scan(p)
        except (zipfile.BadZipFile, OSError) as exc:
            # Anything matching *.pptx that is not a readable ZIP archive
            # (corrupt file, locked file, presumably Office "~$" owner
            # files) is skipped instead of crashing the scan.
            print(f"[warn] skipping {p.name}: {exc}", file=sys.stderr)
            continue
        all_rows.extend(rows)
        ext_counts = Counter(r["ext"] for r in rows)
        total_kb = sum(r["size_kb"] for r in rows)
        ext_mix = ", ".join(f"{k.lstrip('.')}={v}" for k, v in sorted(ext_counts.items()))
        print(f"{topic_slug(p.stem):<48} {len(rows):>7} {ext_mix:<20} {total_kb:>10.1f}")

    print(rule)
    print(f"{'TOTAL':<48} {len(all_rows):>7}")
    print()

    # Extension totals, most frequent first (ties keep first-seen order).
    print("EXTENSION TOTALS (across all PPTs)")
    print("-" * 40)
    ext_total = Counter(r["ext"] for r in all_rows)
    for k, v in ext_total.most_common():
        print(f"  {k:<8} {v:>5}")
    print()

    # Sample 8 rows to show structure
    print("SAMPLE ROWS (first 8)")
    print(rule)
    for r in all_rows[:8]:
        print(f"  {r['topic_slug'][:30]:<30} {r['internal']:<28} {r['size_kb']:>8.1f} KB")
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
