Displaying attention image

- app.py                 +72 -27
- inference.py           +56 -1
- vg_token_attention.py  +396 -0
- vicca_api.py           +28 -0
app.py (CHANGED)

@@ -5,45 +5,90 @@ import tempfile
 import gradio as gr
 from vicca_api import run_vicca
 
-def vicca_interface(image, text_prompt):
-    """
-    image: file from Gradio, we'll use its temp path
-    text_prompt: report / description
-    """
-    # Gradio passes a PIL image or a file path depending on type
-    # We'll request type='filepath' so this is already a path
-    image_path = image
+def vicca_interface(image, text_prompt, box_threshold=0.2, text_threshold=0.2, num_samples=4):
+    os.makedirs("uploads", exist_ok=True)
+    input_path = os.path.join("uploads", "input.png")
+    image.save(input_path)
 
     result = run_vicca(
-        image_path=image_path,
+        image_path=input_path,
         text_prompt=text_prompt,
+        box_threshold=box_threshold,
+        text_threshold=text_threshold,
+        num_samples=num_samples,
     )
 
-    # You could also return the best generated image as an image output
-    # For now, we expose the dict as JSON
-    return result
+    best_gen = result.get("best_generated_image_path")
+
+    attn = result.get("attention_overlays") or {}
+    combined = attn.get("combined")
+    per_term_dict = attn.get("per_term") or {}
+
+    gallery_items = [(p, term) for term, p in per_term_dict.items()]
+
+    return best_gen, combined, gallery_items, result
 
 demo = gr.Interface(
     fn=vicca_interface,
     inputs=[
-        gr.Image(type="filepath", label="Chest X-ray"),
-        gr.Textbox(label="Report / pathology description", lines=3),
+        gr.Image(type="pil", label="Input CXR"),
+        gr.Textbox(lines=3, label="Text prompt"),
+        gr.Slider(0.0, 1.0, value=0.2, label="Box threshold"),
+        gr.Slider(0.0, 1.0, value=0.2, label="Text threshold"),
+        gr.Slider(1, 8, step=1, value=4, label="Number of samples"),
     ],
-    outputs=gr.JSON(label="VICCA output"),
-    title="VICCA – Visual Interpretation & Comprehension",
-    description=(
-        "Upload a chest X-ray and provide a text report / pathology description. "
-        "The VICCA pipeline will run CXR generation, visual grounding, "
-        "and ROI-level similarity scoring."
-    ),
+    outputs=[
+        gr.Image(label="Best generated CXR"),
+        gr.Image(label="Combined attention heatmap"),
+        gr.Gallery(label="Per-term overlays").style(grid=[3], height=400),
+        gr.JSON(label="Raw VICCA output"),
+    ],
+    title="VICCA",
 )
 
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, debug=False)
+
+
+# def vicca_interface(image, text_prompt):
+#     """
+#     image: file from Gradio, we'll use its temp path
+#     text_prompt: report / description
+#     """
+#     # Gradio passes a PIL image or a file path depending on type
+#     # We'll request type='filepath' so this is already a path
+#     image_path = image
+
+#     result = run_vicca(
+#         image_path=image_path,
+#         text_prompt=text_prompt,
+#     )
+
+#     # You could also return the best generated image as an image output
+#     # For now, we expose the dict as JSON
+#     return result
+
+# demo = gr.Interface(
+#     fn=vicca_interface,
+#     inputs=[
+#         gr.Image(type="filepath", label="Chest X-ray"),
+#         gr.Textbox(label="Report / pathology description", lines=3),
+#     ],
+#     outputs=gr.JSON(label="VICCA output"),
+#     title="VICCA – Visual Interpretation & Comprehension",
+#     description=(
+#         "Upload a chest X-ray and provide a text report / pathology description. "
+#         "The VICCA pipeline will run CXR generation, visual grounding, "
+#         "and ROI-level similarity scoring."
+#     ),
+# )
+
 # if __name__ == "__main__":
 #     demo.launch()
-if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        debug=False
-    )
+# if __name__ == "__main__":
+#     demo.launch(
+#         server_name="0.0.0.0",
+#         server_port=7860,
+#         debug=False
+#     )
 
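A note on the Gradio API used above: `gr.Gallery(...).style(grid=[3], height=400)` is the Gradio 3.x idiom; on Gradio 4+ the `.style()` method is gone and the layout options move onto the constructor. A minimal sketch of the equivalent component under that assumption (parameter names as in Gradio 4):

    import gradio as gr

    # Gradio 4.x: layout options are constructor arguments instead of .style()
    per_term_gallery = gr.Gallery(label="Per-term overlays", columns=3, height=400)
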
inference.py (CHANGED)

@@ -9,11 +9,13 @@ python inference.py \
 
 """
 import pandas as pd
+import numpy as np
 import time
 import cv2
 import sys
 import argparse
 from ast import literal_eval
+from nltk import tokenize
 
 # sys.path.append('/home/gholipos-admin/Desktop/Thesis/Training_Code/VICCA')
 from pathlib import Path
@@ -101,7 +103,7 @@ from DETR import svc
 from DETR.arguments import get_args_parser as get_detr_args_parser
 from VG import localization
 from ssim import ssim
-
+from CheXbert.src.label import label
 
 def get_args_parser():
     parser = argparse.ArgumentParser('Set the Input', add_help=True)
@@ -120,6 +122,59 @@ def get_args_parser():
                         help="Path to save generated files.")
     return parser
 
+path_list = ['Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
+             'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
+             'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
+             'Support Devices', 'No Finding']
+
+# Cache CheXbert weights once at import time
+CHEXBERT_WEIGHTS = get_weight("CheXbert/checkpoint/chexbert.pth")
+
+# def chexbert_pathology(text):
+#     sentences = list(set(tokenize.sent_tokenize(text)))
+#     path_dict = []
+#     for sentence in sentences:
+#         sentence = sentence.replace('\n',' ')
+#         sentence = sentence.replace('\s+',' ')
+#         chexbert_weight_path = get_weight("CheXbert/checkpoint/chexbert.pth")
+#         # pathology = np.array(label("CheXbert/checkpoint/chexbert.pth", sentence)).T[0]
+#         pathology = np.array(label(chexbert_weight_path, sentence)).T[0]
+#         if pathology[-1]==1 or len(list(set(pathology)))==1 or not any(e==1 for e in pathology):
+#             pass
+#         else:
+#             indice = [i for i, e in enumerate(pathology) if e==1]
+#             for ind in indice:
+#                 path_dict.append(path_list[ind])
+#     return path_dict
+def chexbert_pathology(text: str):
+    """
+    Run CheXbert on the text and return a list of *positive* pathology labels,
+    deduplicated.
+    """
+    # If NLTK punkt ever becomes a problem on Spaces, replace this with a simple split.
+    # sentences = list(set(tokenize.sent_tokenize(text)))
+    sentences = [s.strip() for s in text.split(".") if s.strip()]
+
+    path_terms = set()
+
+    for sentence in sentences:
+        sentence = sentence.replace("\n", " ")
+        sentence = sentence.replace("\s+", " ")
+
+        # Run CheXbert
+        pathology = np.array(label(CHEXBERT_WEIGHTS, sentence)).T[0]
+
+        # Skip if: "No Finding" active, or all labels same, or no positives
+        if pathology[-1] == 1 or len(set(pathology)) == 1 or not any(e == 1 for e in pathology):
+            continue
+
+        # Collect positive indices
+        indices = [i for i, e in enumerate(pathology) if e == 1]
+        for ind in indices:
+            path_terms.add(path_list[ind])
+
+    return sorted(path_terms)
+
 def extract_tensor(value):
     cleaned_value = value.replace('tensor(', '').replace(')', '')
     return literal_eval(cleaned_value)
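To make the intended behavior of the new `chexbert_pathology` concrete, here is a small illustrative call. It assumes the inference.py environment (CheXbert checkpoint downloaded); the report text is invented and the returned labels depend entirely on the CheXbert model, so the printed list is only an example of the expected shape, a sorted, deduplicated subset of `path_list`:

    # Hypothetical report; the actual output depends on the CheXbert model's predictions.
    report = "There is a small right pleural effusion. Mild cardiomegaly is noted."
    terms = chexbert_pathology(report)
    print(terms)   # e.g. ['Cardiomegaly', 'Pleural Effusion']
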
vg_token_attention.py (ADDED, new file, +396 lines)

# vg_token_attention.py
# -*- coding: utf-8 -*-
"""
Token→region cross-attention visualization for GroundingDINO integrated as a helper.

Usage from other modules:
    from vg_token_attention import run_token_ca_visualization

    paths = run_token_ca_visualization(
        cfg_path="VG/config/GroundingDINO_SwinT_OGC_2.py",
        ckpt_path="VG/weights/checkpoint0399_log4.pth",
        image_path=image_path,
        prompt=text_prompt,
        terms=chexbert_terms,          # e.g. ["edema", "effusion"]
        out_dir="outputs/attn_overlays",
        device="cuda" or "cpu",
    )
"""

import os
import math
import re
import cv2
import torch
import numpy as np
import torch.nn.functional as F
from torch import nn
from PIL import Image
import torchvision.transforms as T

from VG.groundingdino.util.inference import load_model
from VG.groundingdino.util.misc import NestedTensor

from transformers import AutoTokenizer

DEVICE_DEFAULT = "cuda" if torch.cuda.is_available() else "cpu"
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


# -----------------------------
# Preprocess: PIL -> (tensor, mask)
# -----------------------------
def preprocess_image_fn_factory(device=DEVICE_DEFAULT, longest=1024, pad_divisor=32):
    to_tensor = T.ToTensor()
    normalize = T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)

    def _resize_longest(pil_img: Image.Image, longest_side=1024):
        w, h = pil_img.size
        scale = float(longest_side) / max(w, h)
        new_w, new_h = int(round(w * scale)), int(round(h * scale))
        return pil_img.resize((new_w, new_h), Image.BICUBIC)

    def preprocess_image_fn(pil_img: Image.Image):
        img_resized = _resize_longest(pil_img, longest_side=longest)
        x = normalize(to_tensor(img_resized))  # [3,H,W]
        _, H, W = x.shape

        # pad to /32 for backbone
        H_pad = math.ceil(H / pad_divisor) * pad_divisor
        W_pad = math.ceil(W / pad_divisor) * pad_divisor
        pad_h, pad_w = H_pad - H, W_pad - W
        x = F.pad(x, (0, pad_w, 0, pad_h), value=0.0)  # [3,Hp,Wp]

        # mask: True on padded pixels
        mask = torch.zeros((H_pad, W_pad), dtype=torch.bool)
        if pad_h > 0:
            mask[H:, :] = True
        if pad_w > 0:
            mask[:, W:] = True

        return x.unsqueeze(0).to(device), mask.unsqueeze(0).to(device)

    return preprocess_image_fn


# -----------------------------
# Tokenizer (BiomedVLP-CXR-BERT)
# -----------------------------
BIOMEDVLP_TOKENIZER_PATH = "VG/weights/BiomedVLP-CXR-BERT/"

_tokenizer = AutoTokenizer.from_pretrained(BIOMEDVLP_TOKENIZER_PATH)


def tokenize_with_offsets(prompt: str, device=DEVICE_DEFAULT):
    enc = _tokenizer(
        prompt,
        return_tensors="pt",
        return_offsets_mapping=True,
        add_special_tokens=True,
        truncation=True,
    )
    tokens = _tokenizer.convert_ids_to_tokens(enc["input_ids"][0])
    offsets = enc["offset_mapping"][0].tolist()
    return {
        "input_ids": enc["input_ids"].to(device),
        "attention_mask": enc["attention_mask"].to(device),
        "tokens": tokens,
        "offsets": offsets,
    }


def find_token_span_by_offsets(prompt: str, offsets, term: str):
    s = prompt.lower()
    t = term.lower()
    m = re.search(r'\b' + re.escape(t) + r'\b', s) or re.search(re.escape(t), s)
    if not m:
        return []
    a, b = m.start(), m.end()
    idxs = []
    for i, (u, v) in enumerate(offsets):
        if (
            u is None or v is None or
            u < 0 or v < 0 or
            (u == 0 and v == 0)
        ):
            continue
        if not (v <= a or u >= b):  # overlap with [a,b)
            idxs.append(i)
    return idxs


def model_span_indices_for_term(tokens, offsets, attn_T, term: str):
    # 1) HF indices by offsets
    raw_hf_idxs = find_token_span_by_offsets(
        "".join(t if t != "[PAD]" else " " for t in tokens),
        offsets,
        term
    )
    if not raw_hf_idxs:
        low = term.lower()
        raw_hf_idxs = [i for i, t in enumerate(tokens) if low in t.lower()]

    # 2) Map HF non-special → model positions 0..T-1
    non_special_hf = []
    for i, (tok_i, (u, v)) in enumerate(zip(tokens, offsets)):
        if tok_i in ("[CLS]", "[SEP]", "[PAD]"):
            continue
        if u is None or v is None or u < 0 or v < 0 or (u == 0 and v == 0):
            continue
        non_special_hf.append(i)

    non_special_hf = non_special_hf[:attn_T]
    hf2model = {hf_idx: j for j, hf_idx in enumerate(non_special_hf)}
    model_term_idxs = [hf2model[i] for i in raw_hf_idxs if i in hf2model]

    return torch.tensor(model_term_idxs, dtype=torch.long)


# -----------------------------
# Cross-attention recorder
# -----------------------------
class CrossAttnRecorder:
    def __init__(self, decoder_layers, attn_attr_name='ca_text'):
        self.attn_weights = []  # list of [B, heads, Q, T]
        self.handles = []
        self._register(decoder_layers, attn_attr_name)

    def _hook(self, module, input, output):
        if isinstance(output, tuple) and len(output) >= 2:
            attn_w = output[1]
        elif hasattr(module, 'attn_output_weights'):
            attn_w = module.attn_output_weights
        else:
            attn_w = None
        if attn_w is not None:
            self.attn_weights.append(attn_w.detach().to('cpu', dtype=torch.float32))

    def _wrap_forward(self, mha_module: nn.MultiheadAttention):
        orig_forward = mha_module.forward

        def wrapped_forward(*args, **kwargs):
            kwargs['need_weights'] = True
            kwargs['average_attn_weights'] = False
            return orig_forward(*args, **kwargs)

        return orig_forward, wrapped_forward

    def _register(self, decoder_layers, attn_attr_name):
        for layer in decoder_layers:
            attn_module = getattr(layer, attn_attr_name, None)
            if attn_module is None:
                continue
            if isinstance(attn_module, nn.MultiheadAttention):
                orig_fwd, wrapped = self._wrap_forward(attn_module)
                attn_module.forward = wrapped
                handle = attn_module.register_forward_hook(self._hook)
                self.handles.append((attn_module, handle, orig_fwd))
            else:
                handle = attn_module.register_forward_hook(self._hook)
                self.handles.append((attn_module, handle, None))

    def close(self):
        for attn_module, handle, orig_fwd in self.handles:
            handle.remove()
            if (orig_fwd is not None) and isinstance(attn_module, nn.MultiheadAttention):
                attn_module.forward = orig_fwd


# -----------------------------
# Heatmap helpers
# -----------------------------
def boxes_to_heatmap(boxes_xyxy, weights, hw, score_scale=None, blur_ksize=51, blur_sigma=0):
    H, W = hw
    heat = np.zeros((H, W), dtype=np.float32)

    w = weights.detach().cpu().numpy()
    if score_scale is not None:
        s = score_scale.detach().cpu().numpy()
        w = w * s

    for i, box in enumerate(boxes_xyxy):
        x1, y1, x2, y2 = map(int, box.tolist())
        x1 = max(0, min(W - 1, x1)); x2 = max(0, min(W - 1, x2))
        y1 = max(0, min(H - 1, y1)); y2 = max(0, min(H - 1, y2))
        if x2 <= x1 or y2 <= y1:
            continue
        heat[y1:y2, x1:x2] += float(w[i])

    if blur_ksize is not None and blur_ksize >= 3 and blur_ksize % 2 == 1:
        heat = cv2.GaussianBlur(heat, (blur_ksize, blur_ksize), blur_sigma)

    mx = heat.max()
    if mx > 1e-6:
        heat /= mx
    return heat


def overlay_heatmap(img_pil: Image.Image, heatmap, alpha=0.45, cmap=cv2.COLORMAP_JET):
    img = np.array(img_pil.convert("RGB"))
    H, W = img.shape[:2]
    h = (np.clip(heatmap, 0, 1) * 255).astype(np.uint8)
    h_color = cv2.applyColorMap(h, cmap)[:, :, ::-1]
    blended = cv2.addWeighted(h_color, alpha, img, 1 - alpha, 0)
    return Image.fromarray(blended)


def load_image_keep_longest(path, longest=1024):
    img = Image.open(path).convert("RGB")
    w, h = img.size
    s = float(longest) / max(w, h)
    new_w, new_h = int(round(w * s)), int(round(h * s))
    return img.resize((new_w, new_h), Image.BICUBIC)


# -----------------------------
# Main helper: one call from API
# -----------------------------
@torch.no_grad()
def run_token_ca_visualization(
    cfg_path: str,
    ckpt_path: str,
    image_path: str,
    prompt: str,
    terms,
    out_dir: str,
    device: str = DEVICE_DEFAULT,
    score_thresh: float = 0.25,
    topk: int = 100,
    term_agg: str = "mean",  # "mean" | "max" | "sum"
    save_per_term: bool = True,
):
    """
    Returns:
        {
            "combined": <path_to_combined_overlay>,
            "per_term": { term: path_to_overlay, ... }
        }
    """
    if isinstance(terms, str):
        terms = [terms]
    terms = [t.strip() for t in terms if t and t.strip()]
    if not terms:
        raise ValueError("No terms provided for attention visualization.")

    device = device or DEVICE_DEFAULT
    model = load_model(cfg_path, ckpt_path).to(device).eval()
    preprocess_image_fn = preprocess_image_fn_factory(device=device, longest=1024, pad_divisor=32)

    img_pil = load_image_keep_longest(image_path, longest=1024)

    os.makedirs(out_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    combined_path = os.path.join(out_dir, f"{base_name}__attn_combined.png")

    # ---- hook cross-attn
    decoder_layers = model.transformer.decoder.layers
    recorder = CrossAttnRecorder(decoder_layers, attn_attr_name="ca_text")

    # preprocess → NestedTensor
    img_tensor, mask = preprocess_image_fn(img_pil)
    samples = NestedTensor(img_tensor, mask)

    outputs = model(samples, captions=[prompt])

    # decode boxes
    pred_logits = outputs["pred_logits"]
    pred_boxes = outputs["pred_boxes"]
    logits = pred_logits[0].sigmoid()
    scores, _ = logits.max(dim=1)

    keep = torch.nonzero(scores > score_thresh).squeeze(1)
    if keep.numel() == 0:
        keep = torch.argsort(scores, descending=True)[:min(topk, scores.numel())]
    else:
        keep = keep[:topk]

    W, H = img_pil.size
    boxes_cxcywh = pred_boxes[0][keep]
    cx, cy, w, h = boxes_cxcywh.unbind(-1)
    x1 = (cx - 0.5 * w) * W
    y1 = (cy - 0.5 * h) * H
    x2 = (cx + 0.5 * w) * W
    y2 = (cy + 0.5 * h) * H
    boxes_xyxy = torch.stack([x1, y1, x2, y2], dim=-1)
    kept_scores = scores[keep]

    keep_cpu = keep.cpu()

    if len(recorder.attn_weights) == 0:
        recorder.close()
        raise RuntimeError("No attention weights captured. Check that 'ca_text' exists.")
    attn_qt_layers = []
    for w_att in recorder.attn_weights:
        w_att = w_att.squeeze(0).mean(0)  # [Q,T]
        attn_qt_layers.append(w_att)
    attn_qt = torch.stack(attn_qt_layers, 0).mean(0)  # [Q,T]
    recorder.close()

    # tokenize prompt
    tok = tokenize_with_offsets(prompt, device="cpu")
    tokens, offsets = tok["tokens"], tok["offsets"]
    T_text = attn_qt.shape[1]

    per_term_attn_kept = {}
    per_term_attn_full = {}

    for t in terms:
        model_idxs = model_span_indices_for_term(tokens, offsets, T_text, t)
        if model_idxs.numel() == 0:
            continue
        attn_per_query = attn_qt[:, model_idxs].mean(1)  # [Q]
        attn_kept = attn_per_query[keep_cpu]
        attn_kept = (attn_kept - attn_kept.min()) / (attn_kept.max() - attn_kept.min() + 1e-6)
        per_term_attn_kept[t] = attn_kept
        per_term_attn_full[t] = attn_per_query

    if not per_term_attn_kept:
        raise ValueError(f"None of the terms were found in the first T tokens: {terms}")

    # aggregate terms
    agg = None
    for t, v in per_term_attn_full.items():
        agg = v if agg is None else (
            agg + v if term_agg == "sum"
            else torch.maximum(agg, v) if term_agg == "max"
            else (agg + v)
        )
    if term_agg == "mean":
        agg = agg / float(len(per_term_attn_full))

    agg_kept = agg[keep_cpu]
    agg_kept = (agg_kept - agg_kept.min()) / (agg_kept.max() - agg_kept.min() + 1e-6)

    heat = boxes_to_heatmap(
        boxes_xyxy=boxes_xyxy,
        weights=agg_kept,
        hw=(H, W),
        score_scale=kept_scores,
        blur_ksize=61,
        blur_sigma=0,
    )
    overlay = overlay_heatmap(img_pil, heat, alpha=0.45)
    overlay.save(combined_path)

    per_term_paths = {}
    if save_per_term and len(per_term_attn_kept) > 1:
        for t, v in per_term_attn_kept.items():
            heat_t = boxes_to_heatmap(
                boxes_xyxy=boxes_xyxy,
                weights=v,
                hw=(H, W),
                score_scale=kept_scores,
                blur_ksize=61,
                blur_sigma=0,
            )
            ov_t = overlay_heatmap(img_pil, heat_t, alpha=0.45)
            term_tag = re.sub(r"[^a-zA-Z0-9]+", "_", t.lower())[:32]
            p = os.path.join(out_dir, f"{base_name}__{term_tag}.png")
            ov_t.save(p)
            per_term_paths[t] = p

    return {
        "combined": combined_path,
        "per_term": per_term_paths,
    }
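The heatmap construction in `boxes_to_heatmap` above is plain accumulation: each kept box contributes its (optionally score-scaled) attention weight uniformly over its rectangle, the sum is Gaussian-blurred, and the result is max-normalized to [0, 1]. A minimal self-contained sketch of that idea with synthetic boxes and weights (standalone NumPy/OpenCV, so it runs without the GroundingDINO checkpoint and tokenizer that importing this module requires):

    import cv2
    import numpy as np

    H, W = 256, 256
    boxes = [(20, 30, 120, 140), (100, 90, 200, 220)]   # synthetic (x1, y1, x2, y2) boxes
    weights = [0.9, 0.4]                                 # synthetic per-box attention weights

    heat = np.zeros((H, W), dtype=np.float32)
    for (x1, y1, x2, y2), w in zip(boxes, weights):
        heat[y1:y2, x1:x2] += w                          # uniform contribution inside each box

    heat = cv2.GaussianBlur(heat, (61, 61), 0)           # soften box edges
    heat /= max(float(heat.max()), 1e-6)                 # max-normalize to [0, 1]
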
vicca_api.py (CHANGED)

@@ -3,8 +3,11 @@ import os
 import time
 import cv2
 import pandas as pd
+import torch
 
 from weights_utils import ensure_all_vicca_weights, get_weight
+from vg_token_attention import run_token_ca_visualization
+
 
 # Make sure all heavy weights are present once per container
 ensure_all_vicca_weights()
@@ -14,6 +17,7 @@ from inference import (
     cal_shift,
     get_local_bbox,
     extract_tensor,
+    chexbert_pathology,
 )
 
 def run_vicca(
@@ -23,6 +27,7 @@ def run_vicca(
     text_threshold: float = 0.2,
     num_samples: int = 4,
     output_path: str = "CXRGen/test/samples/output/",
+    attn_terms=None,
 ):
     """
     Top-level VICCA API used by app.py / Gradio.
@@ -87,6 +92,28 @@ def run_vicca(
         score = ssim(roi_org, roi_gen)
         ssim_scores.append(score)
 
+    # Optional: attention visualization for terms (e.g. from CheXbert)
+    attn_paths = None
+    attn_terms = chexbert_pathology(text_prompt)
+    if attn_terms:
+        cfg_path = "VG/config/GroundingDINO_SwinT_OGC_2.py"
+        vg_ckpt_path = get_weight("VG/weights/checkpoint0399_log4.pth")
+        attn_out_dir = os.path.join(output_path, "attn_overlays")
+
+        attn_paths = run_token_ca_visualization(
+            cfg_path=cfg_path,
+            ckpt_path=vg_ckpt_path,
+            image_path=image_path,   # or max_sim_gen_path if you prefer the generated CXR
+            prompt=text_prompt,
+            terms=attn_terms,
+            out_dir=attn_out_dir,
+            device="cuda" if torch.cuda.is_available() else "cpu",
+            score_thresh=0.25,
+            topk=100,
+            term_agg="mean",
+            save_per_term=True,
+        )
+
     return {
         "boxes": boxes,
         "logits": logits,
@@ -95,4 +122,5 @@ def run_vicca(
         "shift_x": sx,
         "shift_y": sy,
         "best_generated_image_path": max_sim_gen_path,
+        "attention_overlays": attn_paths,
     }
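Downstream, app.py only relies on the shape of the new `attention_overlays` entry: a dict with a `combined` overlay path and a `per_term` mapping from pathology term to overlay path (or `None` when no terms are found and the visualization is skipped). A rough sketch of consuming it, with a made-up image path and prompt purely to illustrate the structure (it assumes vicca_api is importable and the model weights are present):

    # Hypothetical inputs; shown only to illustrate the returned structure.
    result = run_vicca(
        image_path="uploads/input.png",
        text_prompt="There is a small right pleural effusion.",
    )

    attn = result.get("attention_overlays") or {}
    combined_overlay = attn.get("combined")      # path to the combined heatmap PNG
    per_term = attn.get("per_term") or {}        # {term: overlay_path}
    gallery_items = [(path, term) for term, path in per_term.items()]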