Spaces:

codelion
/

dhara-chat

Running

App Files Files Community

dhara-chat / app.py

codelion

Multi-turn UX: new-chat button; example click resets conversation; 1k-token sliding-window history; 4th example (sleep)

a284b48 verified 8 days ago

raw

history blame contribute delete

12.8 kB

	#!/usr/bin/env python3
	"""dhara-chat — CPU demo of the dhara-250M tri-mode model.

	"Denoiser" terminal aesthetic. Chat lets you pick the decoding mode (AR types
	left-to-right; diffusion visibly unmasks ▓▒░ blocks). The tri-mode compare tab
	streams all three modes so you can watch AR type, block-diffusion denoise, and
	self-speculation jump in accepted spans — each with tokens/sec.
	"""

	import os, re, time, threading
	import torch
	import torch.nn.functional as F
	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

	MODEL_ID = os.environ.get("DHARA_MODEL", "codelion/dhara-250m")
	TOKEN = os.environ.get("HF_TOKEN")

	tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=TOKEN)
	model = AutoModelForCausalLM.from_pretrained(
	MODEL_ID, trust_remote_code=True, torch_dtype=torch.float32, token=TOKEN).eval()
	# INT8 dynamic quant of Linear layers -> ~2x faster on CPU; quality preserved (verified across all 3 modes)
	model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8).eval()
	IM_END = tok.convert_tokens_to_ids("<\|im_end\|>")
	MASK = int(model.config.mask_token_id)
	GEN = dict(do_sample=True, temperature=0.7, top_p=0.9, repetition_penalty=1.2, no_repeat_ngram_size=3)
	GEN_GREEDY = dict(do_sample=False, repetition_penalty=1.3, no_repeat_ngram_size=3)
	REP = 1.3 # repetition penalty for diffusion/self-spec unmasking (prevents "capital capital" collapse)


	def _msg_text(c):
	# gradio 6 Chatbot stores content as a list of parts; flatten to plain text for the template
	if isinstance(c, list):
	return "".join((p.get("text") or "") if isinstance(p, dict) else str(p) for p in c)
	return c if isinstance(c, str) else str(c)


	def _enc(messages, max_tok=1024):
	msgs = [{"role": m.get("role", "user"), "content": _msg_text(m.get("content", ""))} for m in messages]
	while True: # sliding window: drop oldest turns until the prompt fits the 1k budget
	p = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
	e = tok(p, return_tensors="pt", add_special_tokens=False)
	if e.input_ids.shape[1] <= max_tok or len(msgs) <= 1:
	return e.input_ids, e.attention_mask
	msgs = msgs[1:]


	def _ntok(text):
	return len(tok(text, add_special_tokens=False).input_ids)


	def _block_mask(S, bl, dev, dt):
	idx = torch.arange(S, device=dev)
	allowed = (idx // bl).unsqueeze(0) <= (idx // bl).unsqueeze(1)
	return torch.zeros((S, S), device=dev, dtype=dt).masked_fill(~allowed, float("-inf"))[None, None]


	def _rep_pen(logits, seen, penalty=REP):
	if penalty == 1.0 or seen.numel() == 0:
	return logits
	u = torch.unique(seen)
	s = logits[:, u]
	logits[:, u] = torch.where(s > 0, s / penalty, s * penalty)
	return logits


	def _clip(text):
	"""Trim a trailing incomplete sentence so responses never end mid-word."""
	t = text.rstrip()
	ends = [m.start() for m in re.finditer(r"[.!?](?=\s\|$)", t)]
	if ends and ends[-1] >= 16:
	t = t[:ends[-1] + 1]
	return re.sub(r"\s+\d+\.$", "", t).rstrip()


	def _render(row, prompt_len):
	out = []
	for t in row[prompt_len:]:
	if t == MASK:
	out.append("▒")
	elif t == IM_END:
	break
	else:
	out.append(tok.decode([t]))
	return "".join(out)


	def _ar_stream(ids, am, max_new, gen=GEN):
	streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
	threading.Thread(target=model.generate, kwargs=dict(
	input_ids=ids, attention_mask=am, max_new_tokens=max_new, eos_token_id=IM_END,
	pad_token_id=IM_END, streamer=streamer, **gen)).start()
	out = ""
	for tk_ in streamer:
	out += tk_
	yield out


	@torch.no_grad()
	def _diffusion_stream(ids, block_len=32, threshold=0.5, max_new=64):
	dev = ids.device; dt = next(model.parameters()).dtype
	cur = ids; gen = 0
	while gen < max_new:
	seq = torch.cat([cur, torch.full((1, block_len), MASK, device=dev)], 1); S = seq.shape[1]
	bias = _block_mask(S, block_len, dev, dt)
	for _ in range(block_len):
	mp = (seq[0] == MASK).nonzero(as_tuple=True)[0]
	if mp.numel() == 0:
	break
	lg = model(input_ids=seq, trimode_bias=bias).logits[0].float()
	lgm = _rep_pen(lg[mp], seq[0][seq[0] != MASK])
	conf, pred = F.softmax(lgm, -1).max(-1)
	take = conf >= threshold
	if take.sum() == 0:
	take[conf.argmax()] = True
	seq[0, mp[take]] = pred[take]
	yield _render(seq[0].tolist(), ids.shape[1])
	cur = seq; gen += block_len
	if (cur[0, -block_len:] == IM_END).any():
	break


	@torch.no_grad()
	def _selfspec_stream(ids, k=8, block_len=32, max_new=48):
	dev = ids.device; dt = next(model.parameters()).dtype
	cur = ids; gen = 0
	while gen < max_new:
	n = cur.shape[1]
	seq = torch.cat([cur, torch.full((1, k), MASK, device=dev)], 1); S = seq.shape[1]
	seen = cur[0]
	dl = model(input_ids=seq, trimode_bias=_block_mask(S, block_len, dev, dt)).logits[0].float()
	draft = _rep_pen(dl[n:n + k], seen).argmax(-1)
	cand = torch.cat([cur, draft.unsqueeze(0)], 1)
	al = model(input_ids=cand).logits[0].float()
	ar_pred = _rep_pen(al[n - 1:n + k - 1], seen).argmax(-1)
	match = (draft == ar_pred)
	m = int((~match).float().argmax().item()) if (~match).any() else k
	new = torch.cat([draft[:m], ar_pred[m:m + 1]]) if m < k else torch.cat([draft, al[n + k - 1:n + k].argmax(-1)])
	cur = torch.cat([cur, new.unsqueeze(0)], 1); gen += new.numel()
	yield tok.decode(cur[0, ids.shape[1]:], skip_special_tokens=True)
	if IM_END in new.tolist():
	break


	def _pane(label, speed, text):
	sp = f" <span class='dh-s'>{speed}</span>" if speed else ""
	return f"<div class='dh-pane'><span class='dh-k'>{label}</span>{sp}<div class='dh-t'>{text}</div></div>"


	def _pending(label):
	return f"<div class='dh-pane dh-run'><span class='dh-k'>{label}</span> <span class='dh-mask'>▓▒░ waiting…</span></div>"


	def compare(prompt):
	ids, am = _enc([{"role": "user", "content": prompt}])
	a = d = s = ""
	t0 = time.time()
	for a in _ar_stream(ids, am, 48, GEN_GREEDY):
	yield _pane("AR", "⟳", a), _pending("BLOCK-DIFFUSION"), _pending("SELF-SPEC")
	a = _clip(a); a_s = f"{_ntok(a)/max(1e-9,time.time()-t0):.1f} tok/s"
	t0 = time.time()
	for d in _diffusion_stream(ids, max_new=48):
	yield _pane("AR", a_s, a), _pane("BLOCK-DIFFUSION", "⟳", d), _pending("SELF-SPEC")
	d = _clip(d); d_s = f"{_ntok(d)/max(1e-9,time.time()-t0):.1f} tok/s"
	t0 = time.time()
	for s in _selfspec_stream(ids, max_new=48):
	yield _pane("AR", a_s, a), _pane("BLOCK-DIFFUSION", d_s, d), _pane("SELF-SPEC", "⟳", s)
	s = _clip(s); s_s = f"{_ntok(s)/max(1e-9,time.time()-t0):.1f} tok/s · AR-quality"
	yield _pane("AR", a_s, a), _pane("BLOCK-DIFFUSION", d_s, d), _pane("SELF-SPEC", s_s, s)


	CSS = """
	@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;600&display=swap');
	body, .gradio-container, gradio-app { background:#0b0b0c !important; color:#e9e2d4 !important;
	font-family:'IBM Plex Mono', ui-monospace, monospace !important; }
	.gradio-container { max-width:920px !important; margin:0 auto !important; }
	.dhara-hero { text-align:center; padding:30px 12px 14px; border-bottom:1px solid #26261b; }
	.dhara-glyphs { color:#ffb000; opacity:.3; letter-spacing:.55em; font-size:13px; animation:dh-diffuse 3.4s ease-in-out infinite; }
	.dhara-title { font-size:56px; font-weight:600; letter-spacing:.2em; color:#ffb000; text-shadow:0 0 22px rgba(255,176,0,.4); margin:4px 0 2px; }
	.dhara-tag { color:#8f8a78; font-size:12.5px; letter-spacing:.14em; text-transform:uppercase; }
	@keyframes dh-diffuse { 0%,100%{opacity:.14; filter:blur(.7px)} 50%{opacity:.46; filter:blur(0)} }
	button.primary, button[variant="primary"], .primary { background:#1c1609 !important; color:#ffb000 !important;
	border:1px solid #ffb000 !important; box-shadow:none !important; text-transform:uppercase; letter-spacing:.08em; }
	.dh-pane { border:1px solid #26261b; border-left:2px solid #ffb000; border-radius:6px; padding:12px 15px; background:#0f0f10; margin-bottom:10px; }
	.dh-k { color:#ffb000; font-weight:600; letter-spacing:.1em; }
	.dh-s { color:#8f8a78; font-size:12px; }
	.dh-t { white-space:pre-wrap; margin-top:8px; color:#e9e2d4; }
	.dh-mask { color:#7a7568; animation:dh-diffuse 1.1s ease-in-out infinite; }
	.dh-chat { height:400px !important; max-height:400px !important; overflow-y:auto !important; }
	.dh-ctl { gap:14px; align-items:flex-end; margin-bottom:6px; }
	.dhara-link { margin-top:8px; font-size:12px; }
	.dhara-link a { color:#ffb000; text-decoration:none; border-bottom:1px dotted #ffb000; }
	footer { display:none !important; }
	"""

	HEADER = """
	<div class="dhara-hero">
	<div class="dhara-glyphs">▓▒░ ▒▓░ ░▒▓ ▒░▓ ▓░▒ ░▓▒</div>
	<div class="dhara-title">dhara</div>
	<div class="dhara-tag">tri-mode · 250M · ar / block-diffusion / self-speculation</div>
	<div class="dhara-link">model: <a href="https://huggingface.co/codelion/dhara-250m" target="_blank">codelion/dhara-250m ↗</a></div>
	</div>
	"""

	with gr.Blocks(title="dhara-chat") as demo:
	gr.HTML(HEADER)
	with gr.Tab("chat"):
	with gr.Row(elem_classes="dh-ctl"):
	mode = gr.Radio(["AR", "diffusion", "self-spec"], value="AR", label="mode (diffusion shows ▓▒░ denoising; self-spec = AR-quality)", scale=3)
	max_new = gr.Number(value=80, label="max tokens", precision=0, minimum=16, maximum=256, scale=1)
	temp = gr.Number(value=0.7, label="temp (AR)", minimum=0.1, maximum=1.5, scale=1)
	chatbot = gr.Chatbot(height=400, show_label=False, elem_classes="dh-chat")
	with gr.Row():
	msg = gr.Textbox(placeholder="Message dhara…", show_label=False, container=False, scale=7)
	send = gr.Button("send", variant="primary", scale=1)
	clear = gr.Button("new chat", scale=1)
	gr.Examples(["Write a short paragraph about the ocean.",
	"Explain what a neural network is in simple terms.",
	"What is exercise good for?",
	"Why is sleep important?"],
	inputs=msg, outputs=chatbot, fn=lambda _q: [], run_on_click=True,
	cache_examples=False, label="try a prompt (starts a new chat)")

	def _user(message, history):
	return "", (history or []) + [{"role": "user", "content": message}]

	def _bot(history, mode, max_new, temperature):
	ids, am = _enc(history)
	last = ""
	if mode == "diffusion":
	for last in _diffusion_stream(ids, max_new=int(max_new)):
	yield history + [{"role": "assistant", "content": last}]
	elif mode == "self-spec":
	for last in _selfspec_stream(ids, max_new=int(max_new)):
	yield history + [{"role": "assistant", "content": last}]
	else:
	for last in _ar_stream(ids, am, int(max_new), dict(GEN, temperature=float(temperature))):
	yield history + [{"role": "assistant", "content": last}]
	yield history + [{"role": "assistant", "content": _clip(last)}]

	msg.submit(_user, [msg, chatbot], [msg, chatbot]).then(_bot, [chatbot, mode, max_new, temp], chatbot)
	send.click(_user, [msg, chatbot], [msg, chatbot]).then(_bot, [chatbot, mode, max_new, temp], chatbot)
	clear.click(lambda: ([], ""), None, [chatbot, msg])
	with gr.Tab("tri-mode compare"):
	gr.Markdown("Same prompt, three decoding modes — watch AR type, block-diffusion denoise `▓▒░`, and self-spec jump in accepted spans. \n<span style='color:#8f8a78'>config: greedy · repetition-penalty 1.3 · 48 tokens/mode · clipped to last full sentence</span>")
	inp = gr.Textbox(label="prompt", value="Write a short paragraph about the ocean.")
	gr.Examples(["Write a short paragraph about the ocean.",
	"Explain what a neural network is in simple terms.",
	"What is exercise good for?"],
	inputs=inp)
	btn = gr.Button("run all 3 modes", variant="primary")
	o1 = gr.HTML(); o2 = gr.HTML(); o3 = gr.HTML()
	btn.click(compare, inp, [o1, o2, o3])

	if __name__ == "__main__":
	demo.queue().launch(server_name="0.0.0.0", server_port=7860,
	theme=gr.themes.Base(font=gr.themes.GoogleFont("IBM Plex Mono")), css=CSS)