"""push the cleanup code + seed dataset to a huggingface hub repo.

does not need a trained model.

uploads:
  - src/, scripts/, configs/, tests/, data/, docs/
  - pyproject.toml, Makefile
  - docs/model_card.md mirrored to README.md at repo root
  - existing cleanup/README.md renamed to DEVELOPMENT.md

reads the token from the mumble repo root .env.local. prefers
HUGGINGFACE_ACCESS_TOKEN, falls back to HF_TOKEN.

usage:
  python models/cleanup/scripts/push_code_to_hub.py [--repo NAME] [--private] [--message TEXT]
"""

import argparse
import shutil
import sys
import tempfile
from pathlib import Path

from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError

SKIP_DIRS = {
    ".venv",
    "runs",
    "dist",
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "node_modules",
    ".uv",
}
SKIP_SUFFIXES = {".pyc", ".pyo"}

# accepted token variable names, in order of preference.
_TOKEN_KEYS = ("HUGGINGFACE_ACCESS_TOKEN", "HF_TOKEN")

DEFAULT_COMMIT_MESSAGE = "initial upload: cleanup code and 688-pair seed dataset"


def find_repo_root(start: Path) -> Path:
    """return the nearest directory (start or one of its parents) holding .env.local.

    raises FileNotFoundError when no such directory exists.
    """
    for candidate in [start, *start.parents]:
        if (candidate / ".env.local").exists():
            return candidate
    raise FileNotFoundError(".env.local not found in any parent of " + str(start))


def load_token(env_path: Path) -> str:
    """read the hub token from a KEY=VALUE style env file.

    blank lines, lines starting with '#', and lines without '=' are ignored.
    surrounding whitespace and quotes are stripped from the value.
    HUGGINGFACE_ACCESS_TOKEN is preferred over HF_TOKEN regardless of the
    order the two appear in the file; for a repeated key the first non-empty
    value is used.

    raises KeyError when neither variable has a non-empty value.
    """
    found = {}
    for raw in env_path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        if key in _TOKEN_KEYS and value:
            found.setdefault(key, value)

    # resolve by precedence, not by file order.
    for key in _TOKEN_KEYS:
        if key in found:
            return found[key]
    raise KeyError("no HUGGINGFACE_ACCESS_TOKEN or HF_TOKEN in " + str(env_path))


def stage_upload(source: Path, staging: Path) -> int:
    """copy the uploadable files from source into staging.

    - files with any path component in SKIP_DIRS, or a suffix in
      SKIP_SUFFIXES, are left out.
    - the top-level README.md is staged as DEVELOPMENT.md.
    - docs/model_card.md, when present, is also staged as README.md;
      otherwise a warning is printed to stderr.
    - a .gitattributes file is always written into staging, replacing any
      copy that came from source.

    returns the number of distinct files present in staging afterwards.
    """
    # track relative targets so a file that gets overwritten is counted once.
    staged = set()

    for path in source.rglob("*"):
        if path.is_dir():
            continue
        rel = path.relative_to(source)
        if SKIP_DIRS & set(rel.parts):
            continue
        if path.suffix in SKIP_SUFFIXES:
            continue
        target_rel = Path("DEVELOPMENT.md") if rel == Path("README.md") else rel
        target = staging / target_rel
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(path, target)
        staged.add(target_rel)

    model_card = source / "docs" / "model_card.md"
    if model_card.exists():
        shutil.copy2(model_card, staging / "README.md")
        staged.add(Path("README.md"))
    else:
        print("warn: docs/model_card.md missing; hub page will lack a README", file=sys.stderr)

    (staging / ".gitattributes").write_text(
        "*.jsonl text\n*.csv text\n*.md text\n*.py text\n*.yaml text\n*.toml text\n",
        encoding="utf-8",
    )
    staged.add(Path(".gitattributes"))
    return len(staged)


def main() -> int:
    """cli entry point: create the hub repo if needed and upload the staged tree.

    returns 0 on success and 1 on any reported error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", default="mumble-cleanup", help="repo name under the authed user")
    parser.add_argument("--private", action="store_true", help="create as private (default public)")
    parser.add_argument("--message", default=DEFAULT_COMMIT_MESSAGE, help="commit message for the upload")
    args = parser.parse_args()

    here = Path(__file__).resolve()
    try:
        repo_root = find_repo_root(here.parent)
    except FileNotFoundError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1

    cleanup_dir = repo_root / "models" / "cleanup"
    env_path = repo_root / ".env.local"

    if not cleanup_dir.exists():
        print(f"error: {cleanup_dir} does not exist", file=sys.stderr)
        return 1

    try:
        token = load_token(env_path)
    except KeyError as exc:
        # str(KeyError) wraps the message in quotes; print the raw message.
        print(f"error: {exc.args[0]}", file=sys.stderr)
        return 1

    api = HfApi(token=token)

    me = api.whoami()
    user = me.get("name") or (me.get("email") or "").split("@")[0]
    if not user:
        print("error: could not resolve hf username from whoami()", file=sys.stderr)
        return 1

    repo_id = f"{user}/{args.repo}"
    print(f"hf user : {user}")
    print(f"repo target : {repo_id}")
    print(f"visibility : {'private' if args.private else 'public'}")
    print(f"source : {cleanup_dir}")

    try:
        url = api.create_repo(
            repo_id=repo_id,
            private=args.private,
            repo_type="model",
            exist_ok=True,
        )
        print(f"repo ready : {url}")
    except HfHubHTTPError as exc:
        print(f"error: failed to create repo: {exc}", file=sys.stderr)
        return 1

    with tempfile.TemporaryDirectory(prefix="mumble-cleanup-hub-") as tmp:
        staging = Path(tmp)
        count = stage_upload(cleanup_dir, staging)
        print(f"staged files : {count}")
        print("uploading ...")
        try:
            api.upload_folder(
                folder_path=str(staging),
                repo_id=repo_id,
                repo_type="model",
                commit_message=args.message,
            )
        except HfHubHTTPError as exc:
            print(f"error: failed to upload: {exc}", file=sys.stderr)
            return 1

    print(f"\ndone. browse: https://huggingface.co/{repo_id}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())