"""scripts/upload_banks_to_s3.py — push bank assets to S3. Walks four local directories and uploads everything under ``banks//`` in the configured S3 bucket: tagged infographics/ -> s3:///banks/tagged infographics/... drill_down_bank/*.pptx -> s3:///banks/drill_down_bank/... sample template/ -> s3:///banks/sample template/... images/ -> s3:///banks/images/... Designed to be idempotent: on re-runs, files already present in S3 with the same byte size are skipped. Pass ``--force`` to re-upload everything. Configuration via .env or env vars: S3_BUCKET bucket name (required) AWS_REGION e.g. us-east-2 (required) AWS_ACCESS_KEY_ID IAM user key (required) AWS_SECRET_ACCESS_KEY IAM user secret (required) S3_BANKS_PREFIX default 'banks/' — top-level key prefix Run from the project root: python scripts/upload_banks_to_s3.py # idempotent python scripts/upload_banks_to_s3.py --force # re-upload all """ from __future__ import annotations import argparse import os import sys from pathlib import Path ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) try: from dotenv import load_dotenv load_dotenv(ROOT / ".env") except ImportError: pass import boto3 from botocore.exceptions import ClientError # ── Config ──────────────────────────────────────────────────────────────────── BUCKET = os.environ.get("S3_BUCKET") REGION = os.environ.get("AWS_REGION") PREFIX = os.environ.get("S3_BANKS_PREFIX", "banks/") if not BUCKET or not REGION: print("ERROR: S3_BUCKET and AWS_REGION must be set (in .env or environment).", file=sys.stderr) sys.exit(1) # Source directories to mirror under banks/. # Keys preserve the local layout (spaces stay as spaces — boto3 handles them). 
SOURCES = [
    ROOT / "tagged infographics",
    ROOT / "drill_down_bank",
    ROOT / "sample template",
    ROOT / "images",
]

# File names that are never uploaded, wherever they appear.
SKIP_PATTERNS = {".DS_Store", "Thumbs.db", "desktop.ini"}
# A file is skipped when any component of its path has one of these names.
SKIP_DIRS = {"__pycache__", ".git"}

# In drill_down_bank/ we want only the PPT(s), not the catalogue JSON which
# is tracked in git and gets shipped with the code.
DRILL_DOWN_EXTS_KEEP = {".pptx"}

# Environment variables that carry the IAM credentials passed to boto3.
_CREDENTIAL_VARS = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")


def _should_skip(path: Path) -> bool:
    """Return True when *path* must be left out of the upload.

    A file is skipped when its name is in ``SKIP_PATTERNS``, when any path
    component is in ``SKIP_DIRS``, when its name starts with ``._``, when it
    is a ``.py`` file, or when it lives under ``drill_down_bank/`` and its
    (case-insensitive) extension is not in ``DRILL_DOWN_EXTS_KEEP``.
    """
    if path.name in SKIP_PATTERNS:
        return True
    if any(part in SKIP_DIRS for part in path.parts):
        return True
    if path.name.startswith("._"):
        return True
    if path.suffix in (".py",):  # defensive
        return True
    # In drill_down_bank/ only keep .pptx (the catalogue JSON is in git).
    if (ROOT / "drill_down_bank") in path.parents:
        if path.suffix.lower() not in DRILL_DOWN_EXTS_KEEP:
            return True
    return False


def _all_files() -> list[Path]:
    """Collect every uploadable file under the ``SOURCES`` directories.

    Missing source directories are reported on stdout and ignored. Sources
    are visited in ``SOURCES`` order; within each source the files are sorted
    by path so the upload order and the log output are reproducible
    (``rglob`` itself yields entries in filesystem-dependent order).

    Returns:
        Paths of regular files for which ``_should_skip`` is False.
    """
    files: list[Path] = []
    for src in SOURCES:
        if not src.exists():
            print(f" [skip] {src.relative_to(ROOT)} (does not exist)")
            continue
        files.extend(
            p for p in sorted(src.rglob("*"))
            if p.is_file() and not _should_skip(p)
        )
    return files


def _s3_key(local_path: Path) -> str:
    """Return the S3 object key for *local_path*.

    The key is ``PREFIX`` followed by the path relative to ``ROOT``, written
    with forward slashes regardless of platform.

    Raises:
        ValueError: if *local_path* is not located under ``ROOT``.
    """
    rel = local_path.relative_to(ROOT).as_posix()  # forward-slash path
    return f"{PREFIX}{rel}"


def _remote_size(s3, key: str) -> int | None:
    """Return size of an existing S3 object, or None if missing.

    Args:
        s3: boto3 S3 client.
        key: object key inside ``BUCKET``.

    Raises:
        ClientError: for any failure other than "object not found".
    """
    try:
        head = s3.head_object(Bucket=BUCKET, Key=key)
        return head["ContentLength"]
    except ClientError as e:
        if e.response["Error"]["Code"] in ("404", "NoSuchKey", "NotFound"):
            return None
        raise


def main() -> int:
    """Upload the bank files to S3 and return a process exit status.

    Command-line flags:
        --force    upload every file, even when S3 already holds an object
                   of the same size under the same key.
        --dry-run  only list what would be uploaded.

    Returns:
        0 on success (including "nothing to do" and dry-run), 1 when the
        credentials are missing or as soon as one S3 request fails.
    """
    # upload_file() reports a failed transfer as S3UploadFailedError rather
    # than ClientError, so both must be handled. Imported here because boto3
    # is already a dependency of this file.
    from boto3.exceptions import S3UploadFailedError

    parser = argparse.ArgumentParser()
    parser.add_argument("--force", action="store_true",
                        help="Re-upload even files that already exist with same size.")
    parser.add_argument("--dry-run", action="store_true",
                        help="List what would be uploaded; do not upload.")
    args = parser.parse_args()

    # Same clean failure as for S3_BUCKET / AWS_REGION instead of a KeyError.
    missing = [name for name in _CREDENTIAL_VARS if not os.environ.get(name)]
    if missing:
        print(f"ERROR: {' and '.join(missing)} must be set (in .env or environment).",
              file=sys.stderr)
        return 1

    s3 = boto3.client(
        "s3",
        region_name=REGION,
        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    )

    print(f"Target: s3://{BUCKET}/{PREFIX}")
    print(f"Region: {REGION}")
    print()

    files = _all_files()
    if not files:
        print("No files found in any source directory. Nothing to upload.")
        return 0

    total_bytes = sum(f.stat().st_size for f in files)
    print(f"Found {len(files)} files, {total_bytes / 1024 / 1024:.1f} MB total.")
    print()

    if args.dry_run:
        for p in files:
            print(f" WOULD UPLOAD {_s3_key(p)} ({p.stat().st_size / 1024:.1f} KB)")
        print()
        print("Dry-run only. Run without --dry-run to upload.")
        return 0

    uploaded = 0
    skipped = 0
    bytes_sent = 0
    # Pad every key to the longest one so the size column lines up.
    width = max(len(_s3_key(f)) for f in files)

    for p in files:
        key = _s3_key(p)
        local_size = p.stat().st_size
        try:
            # Size-only comparison: an object of equal size counts as current.
            if not args.force and _remote_size(s3, key) == local_size:
                print(f" SKIP {key:<{width}} ({local_size / 1024:>7.1f} KB)")
                skipped += 1
                continue
            s3.upload_file(str(p), BUCKET, key)
        except ClientError as e:
            err = e.response["Error"]
            print(f" ERROR {key} :: {err['Code']} {err['Message']}", file=sys.stderr)
            return 1
        except S3UploadFailedError as e:
            # Carries no structured response; its message names file and cause.
            print(f" ERROR {key} :: {e}", file=sys.stderr)
            return 1
        print(f" PUT {key:<{width}} ({local_size / 1024:>7.1f} KB)")
        uploaded += 1
        bytes_sent += local_size

    print()
    print(f"Done. uploaded={uploaded} skipped={skipped} bytes_sent={bytes_sent / 1024 / 1024:.1f} MB")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())