mumble-cleanup / scripts /push_code_to_hub.py
adikuma's picture
initial upload: cleanup code and 688-pair seed dataset
fd0b01f verified
"""push the cleanup code + seed dataset to a fresh huggingface hub repo.
does not need a trained model. uploads:
- src/, scripts/, configs/, tests/, data/, docs/
- pyproject.toml, Makefile
- docs/model_card.md mirrored to README.md at repo root
- existing cleanup/README.md renamed to DEVELOPMENT.md
reads the token from the mumble repo root .env.local. prefers
HUGGINGFACE_ACCESS_TOKEN, falls back to HF_TOKEN.
usage: python models/cleanup/scripts/push_code_to_hub.py [--repo NAME] [--private]
"""
import argparse
import shutil
import sys
import tempfile
from pathlib import Path
from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError
# path components that exclude a file from staging when they appear anywhere
# in its path relative to the source directory
SKIP_DIRS = {
    # environments and package-manager state
    ".venv",
    ".uv",
    "node_modules",
    # run and build outputs
    "runs",
    "dist",
    # interpreter and tool caches
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
}

# file suffixes that are never staged (compiled python bytecode)
SKIP_SUFFIXES = {".pyo", ".pyc"}
def find_repo_root(start: Path) -> Path:
    """return the nearest directory at or above ``start`` that holds ``.env.local``.

    ``start`` itself is checked first, then each of its parents from the
    closest to the furthest.

    raises FileNotFoundError when none of those directories contains a
    ``.env.local`` entry.
    """
    marker = ".env.local"
    candidates = (start, *start.parents)
    match = next((d for d in candidates if (d / marker).exists()), None)
    if match is None:
        raise FileNotFoundError(f".env.local not found in any parent of {start}")
    return match
def load_token(env_path: Path) -> str:
    """read the hugging face token from a ``KEY=value`` style env file.

    HUGGINGFACE_ACCESS_TOKEN is preferred; HF_TOKEN is used only when the
    preferred key is absent or empty. the preference holds regardless of the
    order in which the two keys appear in the file.

    blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. whitespace around keys and values is stripped, as are
    surrounding double and single quotes on the value. when a key occurs more
    than once, its first non-empty value is used.

    raises KeyError when neither key has a non-empty value.
    """
    primary = "HUGGINGFACE_ACCESS_TOKEN"
    fallback = "HF_TOKEN"
    fallback_value = ""
    for raw in env_path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        if not value:
            continue
        if key == primary:
            # the preferred key wins outright, even if HF_TOKEN came earlier
            return value
        if key == fallback and not fallback_value:
            # remember it, but keep scanning in case the preferred key follows
            fallback_value = value
    if fallback_value:
        return fallback_value
    raise KeyError("no HUGGINGFACE_ACCESS_TOKEN or HF_TOKEN in " + str(env_path))
def stage_upload(source: Path, staging: Path) -> int:
    """copy the uploadable files under ``source`` into ``staging``.

    every non-directory entry below ``source`` is copied to the same relative
    path, except that:
    - entries whose relative path contains a SKIP_DIRS component are skipped
    - entries whose suffix is in SKIP_SUFFIXES are skipped
    - a top-level README.md is staged as DEVELOPMENT.md
    afterwards docs/model_card.md (if present) is copied to README.md at the
    staging root, and a .gitattributes file is generated there.

    returns the number of distinct files staged. when two sources map to the
    same staged path (e.g. a source .gitattributes and the generated one, or
    a source DEVELOPMENT.md and the renamed README.md) the later one wins, a
    warning is printed to stderr, and the path is counted once.
    """
    staged = set()

    def _copy(src: Path, target_rel: Path) -> None:
        # copy one file into staging, flagging overwrites of an earlier copy
        if target_rel in staged:
            print(
                f"warn: {target_rel.as_posix()} staged more than once; later copy wins",
                file=sys.stderr,
            )
        target = staging / target_rel
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, target)
        staged.add(target_rel)

    # sorted traversal makes collisions resolve the same way on every run
    # (rglob order is otherwise unspecified); README.md sorts after
    # DEVELOPMENT.md, so the renamed README wins that collision
    for path in sorted(source.rglob("*")):
        if path.is_dir():
            continue
        rel = path.relative_to(source)
        if SKIP_DIRS & set(rel.parts):
            continue
        if path.suffix in SKIP_SUFFIXES:
            continue
        if rel == Path("README.md"):
            # the hub README slot is reserved for the model card below
            target_rel = Path("DEVELOPMENT.md")
        else:
            target_rel = rel
        _copy(path, target_rel)

    model_card = source / "docs" / "model_card.md"
    if model_card.exists():
        _copy(model_card, Path("README.md"))
    else:
        print("warn: docs/model_card.md missing; hub page will lack a README", file=sys.stderr)

    gitattributes = Path(".gitattributes")
    if gitattributes in staged:
        print("warn: source .gitattributes replaced by the generated one", file=sys.stderr)
    (staging / gitattributes).write_text(
        "*.jsonl text\n*.csv text\n*.md text\n*.py text\n*.yaml text\n*.toml text\n",
        encoding="utf-8",
    )
    staged.add(gitattributes)
    return len(staged)
def main() -> int:
    """create (or reuse) the hub repo and upload the staged cleanup directory.

    steps:
    1. parse ``--repo`` and ``--private``
    2. locate the repo root via ``.env.local`` and read the token from it
    3. resolve the hub username with ``whoami()`` and build ``user/repo``
    4. create the model repo (``exist_ok=True``, so an existing repo is reused)
    5. stage ``models/cleanup`` into a temporary directory and upload it

    returns 0 on success and 1 on any handled failure; every handled failure
    is reported as a single ``error: ...`` line on stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", default="mumble-cleanup", help="repo name under the authed user")
    parser.add_argument("--private", action="store_true", help="create as private (default public)")
    args = parser.parse_args()

    here = Path(__file__).resolve()
    try:
        repo_root = find_repo_root(here.parent)
    except FileNotFoundError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1
    cleanup_dir = repo_root / "models" / "cleanup"
    env_path = repo_root / ".env.local"
    if not cleanup_dir.exists():
        print(f"error: {cleanup_dir} does not exist", file=sys.stderr)
        return 1

    try:
        token = load_token(env_path)
    except KeyError as exc:
        # str(KeyError) wraps the message in quotes; args[0] is the raw text
        print(f"error: {exc.args[0]}", file=sys.stderr)
        return 1
    except OSError as exc:
        print(f"error: could not read {env_path}: {exc}", file=sys.stderr)
        return 1

    api = HfApi(token=token)
    try:
        me = api.whoami()
    except HfHubHTTPError as exc:
        print(f"error: whoami() failed: {exc}", file=sys.stderr)
        return 1
    # fall back to the local part of the email when no name is reported
    user = me.get("name") or (me.get("email") or "").split("@")[0]
    if not user:
        print("error: could not resolve hf username from whoami()", file=sys.stderr)
        return 1

    repo_id = f"{user}/{args.repo}"
    print(f"hf user : {user}")
    print(f"repo target : {repo_id}")
    print(f"visibility : {'private' if args.private else 'public'}")
    print(f"source : {cleanup_dir}")

    try:
        url = api.create_repo(
            repo_id=repo_id,
            private=args.private,
            repo_type="model",
            exist_ok=True,
        )
    except HfHubHTTPError as exc:
        print(f"error: failed to create repo: {exc}", file=sys.stderr)
        return 1
    print(f"repo ready : {url}")

    # the staging copy lives only for the duration of the upload
    with tempfile.TemporaryDirectory(prefix="mumble-cleanup-hub-") as tmp:
        staging = Path(tmp)
        count = stage_upload(cleanup_dir, staging)
        print(f"staged files : {count}")
        print("uploading ...")
        try:
            api.upload_folder(
                folder_path=str(staging),
                repo_id=repo_id,
                repo_type="model",
                commit_message="initial upload: cleanup code and 688-pair seed dataset",
            )
        except HfHubHTTPError as exc:
            print(f"error: upload failed: {exc}", file=sys.stderr)
            return 1

    print(f"\ndone. browse: https://huggingface.co/{repo_id}")
    return 0
if __name__ == "__main__":
    # main()'s return value becomes the process exit status
    sys.exit(main())