File size: 4,652 Bytes
fd0b01f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""push the cleanup code + seed dataset to a fresh huggingface hub repo.

does not need a trained model. uploads:
  - src/, scripts/, configs/, tests/, data/, docs/
  - pyproject.toml, Makefile
  - docs/model_card.md mirrored to README.md at repo root
  - existing cleanup/README.md renamed to DEVELOPMENT.md

reads the token from the mumble repo root .env.local. prefers
HUGGINGFACE_ACCESS_TOKEN, falls back to HF_TOKEN.

usage: python models/cleanup/scripts/push_code_to_hub.py [--repo NAME] [--private]
"""

import argparse
import shutil
import sys
import tempfile
from pathlib import Path

from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError

# path components that exclude a file from staging: stage_upload skips any
# file whose path relative to the source dir contains one of these names.
SKIP_DIRS = {
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    ".uv",
    ".venv",
    "__pycache__",
    "dist",
    "node_modules",
    "runs",
}
# file suffixes that stage_upload never copies.
SKIP_SUFFIXES = {
    ".pyc",
    ".pyo",
}


def find_repo_root(start: Path) -> Path:
    """locate the closest directory that contains a .env.local entry.

    `start` itself is checked first, then each of its parents from nearest
    to farthest; the first directory holding `.env.local` is returned.

    raises FileNotFoundError when neither `start` nor any parent has one.
    """
    candidates = (start, *start.parents)
    match = next((d for d in candidates if (d / ".env.local").exists()), None)
    if match is None:
        raise FileNotFoundError(".env.local not found in any parent of " + str(start))
    return match


def load_token(env_path: Path) -> str:
    """read the hugging face token from a dotenv-style file.

    the file is parsed line by line as `KEY=VALUE`. blank lines, lines
    starting with `#`, and lines without `=` are ignored. surrounding
    whitespace and surrounding double/single quotes are stripped from the
    value, and empty values are treated as absent.

    HUGGINGFACE_ACCESS_TOKEN always wins when it is present with a non-empty
    value, no matter where it sits in the file; HF_TOKEN is only used as a
    fallback. when a key is repeated, its first non-empty value is used.

    raises KeyError when neither key has a non-empty value.
    """
    fallback = None
    for raw in env_path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        if not value:
            continue
        if key == "HUGGINGFACE_ACCESS_TOKEN":
            # preferred key: nothing later in the file can outrank it.
            return value
        if key == "HF_TOKEN" and fallback is None:
            # remember the fallback but keep scanning for the preferred key.
            fallback = value
    if fallback is not None:
        return fallback
    raise KeyError("no HUGGINGFACE_ACCESS_TOKEN or HF_TOKEN in " + str(env_path))


def stage_upload(source: Path, staging: Path) -> int:
    """copy the uploadable files from `source` into `staging`.

    every non-directory path under `source` is copied (metadata preserved
    via shutil.copy2) to the same relative location in `staging`, except:
      - files with any path component listed in SKIP_DIRS
      - files whose suffix is in SKIP_SUFFIXES
      - the top-level README.md, which is written as DEVELOPMENT.md

    afterwards docs/model_card.md (if present) is copied to
    staging/README.md, and a .gitattributes file is written into staging.

    returns the number of files copied, plus one for the model card when it
    exists, plus one for .gitattributes.
    """
    top_level_readme = Path("README.md")
    copied = 0

    for src_file in source.rglob("*"):
        if src_file.is_dir():
            continue
        relative = src_file.relative_to(source)
        in_skipped_dir = not SKIP_DIRS.isdisjoint(relative.parts)
        if in_skipped_dir or src_file.suffix in SKIP_SUFFIXES:
            continue
        # README.md at the staging root is reserved for the model card.
        renamed = Path("DEVELOPMENT.md") if relative == top_level_readme else relative
        destination = staging / renamed
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_file, destination)
        copied += 1

    card = source / "docs" / "model_card.md"
    if not card.exists():
        print("warn: docs/model_card.md missing; hub page will lack a README", file=sys.stderr)
    else:
        shutil.copy2(card, staging / "README.md")
        copied += 1

    gitattributes = staging / ".gitattributes"
    gitattributes.write_text(
        "*.jsonl text\n*.csv text\n*.md text\n*.py text\n*.yaml text\n*.toml text\n",
        encoding="utf-8",
    )
    copied += 1
    return copied


def main() -> int:
    """cli entry point: create the hub repo if needed and upload the cleanup tree.

    steps:
      1. parse --repo / --private
      2. locate the repo root (nearest parent holding .env.local) and check
         that models/cleanup exists under it
      3. load the token, resolve the hub username via whoami()
      4. create the model repo (exist_ok=True) as <user>/<repo>
      5. stage the files into a temporary directory and upload the folder

    returns 0 on success. returns 1, after printing an `error:` line to
    stderr, when the repo root or token cannot be found, the cleanup dir is
    missing, the username cannot be resolved, or a hub call fails with
    HfHubHTTPError.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", default="mumble-cleanup", help="repo name under the authed user")
    parser.add_argument("--private", action="store_true", help="create as private (default public)")
    args = parser.parse_args()

    here = Path(__file__).resolve()
    try:
        repo_root = find_repo_root(here.parent)
    except FileNotFoundError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1
    cleanup_dir = repo_root / "models" / "cleanup"
    env_path = repo_root / ".env.local"

    if not cleanup_dir.exists():
        print(f"error: {cleanup_dir} does not exist", file=sys.stderr)
        return 1

    try:
        token = load_token(env_path)
    except KeyError as exc:
        # str(KeyError) wraps the message in quotes; print the raw message.
        detail = exc.args[0] if exc.args else exc
        print(f"error: {detail}", file=sys.stderr)
        return 1
    except OSError as exc:
        print(f"error: could not read {env_path}: {exc}", file=sys.stderr)
        return 1

    api = HfApi(token=token)
    try:
        me = api.whoami()
    except HfHubHTTPError as exc:
        # e.g. an invalid or expired token; report it like the other hub failures.
        print(f"error: failed to authenticate with the hub: {exc}", file=sys.stderr)
        return 1
    user = me.get("name") or (me.get("email") or "").split("@")[0]
    if not user:
        print("error: could not resolve hf username from whoami()", file=sys.stderr)
        return 1

    repo_id = f"{user}/{args.repo}"
    print(f"hf user      : {user}")
    print(f"repo target  : {repo_id}")
    print(f"visibility   : {'private' if args.private else 'public'}")
    print(f"source       : {cleanup_dir}")

    try:
        url = api.create_repo(
            repo_id=repo_id,
            private=args.private,
            repo_type="model",
            exist_ok=True,
        )
    except HfHubHTTPError as exc:
        print(f"error: failed to create repo: {exc}", file=sys.stderr)
        return 1
    print(f"repo ready   : {url}")

    # stage into a throwaway directory so the renames and the generated
    # .gitattributes never touch the working tree.
    with tempfile.TemporaryDirectory(prefix="mumble-cleanup-hub-") as tmp:
        staging = Path(tmp)
        count = stage_upload(cleanup_dir, staging)
        print(f"staged files : {count}")

        print("uploading ...")
        try:
            api.upload_folder(
                folder_path=str(staging),
                repo_id=repo_id,
                repo_type="model",
                commit_message="initial upload: cleanup code and 688-pair seed dataset",
            )
        except HfHubHTTPError as exc:
            print(f"error: upload failed: {exc}", file=sys.stderr)
            return 1

    print(f"\ndone. browse: https://huggingface.co/{repo_id}")
    return 0


# when run as a script, exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())