I can't run it on Electron

#7
by shadowT - opened

An error occurred during model execution: "Error: invalid data location: undefined for input \"input_ids\""

export async function inferenceVision(
prompt: string,
inferenceLock: React.RefObject,
processorRef: React.RefObject<LlavaProcessor | null>,
modelRef: React.RefObject<PreTrainedModel | null>,
videoRef: React.RefObject<HTMLVideoElement | null>,
canvasRef: React.RefObject<HTMLCanvasElement | null>,
onTextUpdate?: (text: string) => void
): Promise {
if (inferenceLock.current) {
console.log('Inference already running, skipping frame')
return '' // Return empty string to signal a skip
}
inferenceLock.current = true
if (!processorRef.current || !modelRef.current) {
throw new Error('Model/processor not loaded')
}
if (!canvasRef.current) {
canvasRef.current = document.createElement('canvas')
}
if (!videoRef.current) {
videoRef.current = document.createElement('video')
}
const canvas = canvasRef.current
const video = videoRef.current

const stream = await navigator.mediaDevices.getDisplayMedia({
audio: true,
video: {
width: 1024,
height: 1024,
frameRate: 30
}
})
video.srcObject = stream
video.playsInline = true
video.muted = true
await video.play()

await new Promise((resolve) => {
if (video.readyState >= 2) resolve(null)
video.onloadeddata = resolve
})

if (video.videoWidth === 0 || video.videoHeight === 0) {
inferenceLock.current = false
throw new Error('Video frame not ready')
}

canvas.width = video.videoWidth
canvas.height = video.videoHeight
const ctx = canvas.getContext('2d', { willReadFrequently: true })
if (!ctx) throw new Error('Could not get canvas context')
ctx.drawImage(video, 0, 0)
const frame = ctx.getImageData(0, 0, canvas.width, canvas.height)
console.log('Is valid typed array:', frame.data instanceof Uint8ClampedArray)

const rawImg = new RawImage(frame.data, frame.width, frame.height, 4)
console.log(prompt)

const messages: Message[] = [
{
role: 'system',
content: You are a helpful visual AI assistant. Respond concisely and accurately to the user's query in one sentence.
},
{ role: 'user', content: <image>${prompt} }
]
const _messages = processorRef.current.apply_chat_template(messages, {
add_generation_prompt: true
})
console.log(rawImg, _messages)

const inputs = await processorRef.current(rawImg, _messages, {
add_special_tokens: false
})
console.log(inputs)

let streamed = ''
const streamer = new TextStreamer(processorRef.current.tokenizer!, {
skip_prompt: true,
skip_special_tokens: true,
callback_function: (t: string) => {
streamed += t
onTextUpdate?.(streamed.trim())
}
})
const outputs = (await modelRef.current.generate({
...inputs,
max_new_tokens: 512,
do_sample: false,
streamer,
repetition_penalty: 1.2
})) as Tensor
const decoded = processorRef.current.batch_decode(
outputs.slice(null, [inputs.input_ids.dims?.at(-1), null]),
{
skip_special_tokens: true
}
)
inferenceLock.current = false
return decoded[0].trim()
}

Sign up or log in to comment