""" Extract embedded images from each PPT in C:\\Users\\admin\\Downloads\\illustrations into images//NN.. Topic slug is derived from the PPT filename. Internal ordering within a topic is not semantic (the team curated each PPT as a topic set), so images are numbered 01..NN in the order they appear inside the PPT's ppt/media/ folder. Original extension (png / jpeg) is preserved. No resizing is performed. Safety: refuses to overwrite an existing target file. Refuses to write into a topic folder that already contains files (so this script can't clobber prior work). """ from __future__ import annotations import re import sys import zipfile from pathlib import Path SRC = Path(r"C:\Users\admin\Downloads\illustrations") DST_ROOT = Path(r"C:\Users\admin\Desktop\v2\images") IMAGE_PREFIX = "ppt/media/" IMAGE_EXTS = {".png", ".jpg", ".jpeg"} def topic_slug(stem: str) -> str: s = stem.lower() s = re.sub(r"[&]", "and", s) s = re.sub(r"[^a-z0-9]+", "_", s).strip("_") return s def extract(pptx_path: Path, dst_dir: Path) -> tuple[int, int]: """Returns (files_written, bytes_written).""" if dst_dir.exists() and any(dst_dir.iterdir()): print(f" [skip] {dst_dir} already has files; refusing to clobber.") return (0, 0) dst_dir.mkdir(parents=True, exist_ok=True) written = 0 bytes_written = 0 with zipfile.ZipFile(pptx_path) as zf: media = [ i for i in zf.infolist() if i.filename.startswith(IMAGE_PREFIX) and Path(i.filename).suffix.lower() in IMAGE_EXTS ] for idx, info in enumerate(media, start=1): ext = Path(info.filename).suffix.lower() if ext == ".jpg": ext = ".jpeg" target = dst_dir / f"{idx:02d}{ext}" if target.exists(): print(f" [skip] {target} exists.") continue with zf.open(info) as src, open(target, "wb") as dst: data = src.read() dst.write(data) bytes_written += len(data) written += 1 return written, bytes_written def main() -> int: if not SRC.exists(): print(f"[error] source folder not found: {SRC}", file=sys.stderr) return 1 ppts = sorted(SRC.glob("*.pptx")) if not ppts: print("[warn] no .pptx files found.") return 0 total_files = 0 total_bytes = 0 print("EXTRACTION REPORT") print("-" * 78) print(f"{'topic_slug':<48} {'files':>6} {'size_MB':>10}") print("-" * 78) for p in ppts: slug = topic_slug(p.stem) dst = DST_ROOT / slug files, byts = extract(p, dst) total_files += files total_bytes += byts print(f"{slug:<48} {files:>6} {byts/1024/1024:>10.2f}") print("-" * 78) print(f"{'TOTAL':<48} {total_files:>6} {total_bytes/1024/1024:>10.2f}") print() print(f"Destination: {DST_ROOT}") return 0 if __name__ == "__main__": raise SystemExit(main())