# Multimodal video RAG app: extract frames and transcripts from local videos,
# index them with LlamaIndex + LanceDB, and answer questions through a Gradio
# UI backed by OpenRouter vision models.
import base64
import json
import os
from io import BytesIO
from pathlib import Path

import gradio as gr
import requests
import speech_recognition as sr
import yt_dlp
from moviepy import VideoFileClip
from PIL import Image

from llama_index.core import Settings, SimpleDirectoryReader, StorageContext
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core.schema import ImageNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.lancedb import LanceDBVectorStore

VIDEO_FOLDER = './video_data/'
OUTPUT_FOLDER = './mixed_data/'


def plot_images(image_paths):
    """Return up to seven of the given paths that exist on disk."""
    existing = []
    for path in image_paths:
        if os.path.isfile(path):
            existing.append(path)
            if len(existing) >= 7:
                break
    return existing


def download_video(video_url, output_video_path=VIDEO_FOLDER):
    """Download a video with yt-dlp and return its basic metadata."""
    ydl_opts = {
        'format': 'bestvideo+bestaudio/best',
        'merge_output_format': 'mp4',
        'outtmpl': f'{output_video_path}/input_vid.mp4',
        'noplaylist': True,
        'quiet': False,
    }
    Path(output_video_path).mkdir(parents=True, exist_ok=True)
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.sanitize_info(ydl.extract_info(video_url, download=True))
        return {
            'title': info.get('title'),
            'uploader': info.get('uploader'),
            'views': info.get('view_count'),
        }


def video_to_images(video_path, output_folder):
    """Extract one frame every five seconds (0.2 fps) as numbered PNGs."""
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    clip = VideoFileClip(video_path)
    clip.write_images_sequence(os.path.join(output_folder, 'frame%04d.png'), fps=0.2)


def video_to_audio(video_path, output_audio_path):
    """Extract the audio track of a video into a standalone audio file."""
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(output_audio_path)


def audio_to_text(audio_path):
    """Transcribe an audio file with Google's free speech recognition API.

    Returns the transcript, or None if recognition fails.
    """
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            audio = recognizer.record(source)
            return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print('Google Speech Recognition could not understand the audio.')
    except sr.RequestError as e:
        print(f'Could not request results: {e}')


def prepare_all_videos(video_folder=VIDEO_FOLDER, output_folder=OUTPUT_FOLDER):
    """
    Processes all video files in video_folder, extracting images and text for
    each, and stores them in unique subfolders under output_folder.
    Returns a list of metadata dicts for all videos.
    """
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    video_files = [
        name for name in os.listdir(video_folder)
        if name.lower().endswith(('.mp4', '.mov', '.avi', '.mkv'))
    ]
    results = []
    for video_file in video_files:
        video_path = os.path.join(video_folder, video_file)
        stem = Path(video_file).stem
        subfolder = os.path.join(output_folder, stem)
        Path(subfolder).mkdir(parents=True, exist_ok=True)
        audio_path = os.path.join(subfolder, 'output_audio.wav')
        video_to_images(video_path, subfolder)
        video_to_audio(video_path, audio_path)
        text = audio_to_text(audio_path)
        with open(os.path.join(subfolder, 'output_text.txt'), 'w') as f:
            f.write(text if text else '')
        os.remove(audio_path)  # the transcript replaces the raw audio
        meta = {'title': stem, 'uploader': 'unknown', 'views': 'unknown', 'file': video_file}
        results.append({'meta': meta, 'text': text, 'folder': subfolder})
    return results


def create_vector_db_for_all(image_txt_root_folder):
    """
    Loads all subfolders in image_txt_root_folder as documents for the vector DB.
    """
    text_store = LanceDBVectorStore(uri='lancedb', table_name='text_collection')
    image_store = LanceDBVectorStore(uri='lancedb', table_name='image_collection')
    storage_context = StorageContext.from_defaults(
        vector_store=text_store, image_store=image_store
    )
    Settings.embed_model = HuggingFaceEmbedding(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )
    documents = []
    for subfolder in Path(image_txt_root_folder).iterdir():
        if subfolder.is_dir():
            documents.extend(SimpleDirectoryReader(str(subfolder)).load_data())
    index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)
    return index.as_retriever(similarity_top_k=2, image_similarity_top_k=3)


def retrieve(retriever_engine, query_str):
    """Run retrieval and split the results into image paths and text chunks."""
    results = retriever_engine.retrieve(query_str)
    retrieved_images = []
    retrieved_texts = []
    for result in results:
        if isinstance(result.node, ImageNode):
            retrieved_images.append(result.node.metadata['file_path'])
        else:
            retrieved_texts.append(result.text)
    return retrieved_images, retrieved_texts
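
# A minimal sketch of the retrieval pipeline outside the Gradio UI, left
# commented out so importing this module stays side-effect free. The URL is a
# placeholder assumption; any public video yt-dlp can fetch would work.
#
#   meta = download_video('https://www.youtube.com/watch?v=<VIDEO_ID>')
#   docs = prepare_all_videos('./video_data/', './mixed_data/')
#   retriever = create_vector_db_for_all('./mixed_data/')
#   image_paths, text_chunks = retrieve(retriever, 'What is this video about?')
#   print(meta, len(image_paths), len(text_chunks))
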
qa_tmpl_str = (
    'Given the provided information, including relevant images and retrieved '
    'context from the video, accurately and precisely answer the query without '
    'any additional prior knowledge.\n'
    'Please ensure honesty and responsibility, refraining from any racist or '
    'sexist remarks.\n'
    '---------------------\n'
    'Context: {context_str}\n'
    'Metadata for video: {metadata_str}\n'
    '---------------------\n'
    'Query: {query_str}\n'
    'Answer: '
)

# OpenRouter model IDs exposed in the UI, keyed by display label.
available_models = [
    {'value': 'openai/gpt-oss-20b:free', 'label': 'Llama'},
    {'value': 'qwen/qwen2.5-vl-72b-instruct:free', 'label': 'Qwen'},
    {'value': 'google/gemma-3-27b-it:free', 'label': 'Gemma'},
    {'value': 'moonshotai/kimi-vl-a3b-thinking:free', 'label': 'Kimi'},
    {'value': 'google/gemini-2.0-flash-exp:free', 'label': 'Gemini'},
]
model_value_to_label = {m['value']: m['label'] for m in available_models}
model_label_to_value = {m['label']: m['value'] for m in available_models}


def gradio_chat(query, model_label):
    """Answer a query over the local videos; returns (answer, image paths)."""
    try:
        # Rebuild the corpus and index on every request: fine for a small demo,
        # but heavy work that could be cached across repeated queries.
        processed = prepare_all_videos(VIDEO_FOLDER, OUTPUT_FOLDER)
        metadata_str = json.dumps([item['meta'] for item in processed])
        retriever_engine = create_vector_db_for_all(OUTPUT_FOLDER)
        image_paths, text_chunks = retrieve(retriever_engine=retriever_engine, query_str=query)
        prompt = qa_tmpl_str.format(
            context_str=''.join(text_chunks),
            query_str=query,
            metadata_str=metadata_str,
        )
        headers = {
            'Authorization': f"Bearer {os.environ['API_KEY']}",
            'Content-Type': 'application/json',
            'HTTP-Referer': '',
            'X-Title': '',
        }
        model = model_label_to_value.get(model_label, available_models[0]['value'])
        messages = [{'role': 'user', 'content': [{'type': 'text', 'text': prompt}]}]
        used_images = []
        for image_path in image_paths:
            try:
                # Convert to RGB first: the extracted PNG frames may carry an
                # alpha channel, which JPEG cannot encode.
                image = Image.open(image_path).convert('RGB')
                buffer = BytesIO()
                image.save(buffer, format='JPEG')
                encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
                messages[0]['content'].append({
                    'type': 'image_url',
                    'image_url': {'url': f'data:image/jpeg;base64,{encoded}'},
                })
                used_images.append(image_path)
            except Exception as e:
                print(f'Error loading image {image_path}: {e}')
        response = requests.post(
            url='https://openrouter.ai/api/v1/chat/completions',
            headers=headers,
            data=json.dumps({'model': model, 'messages': messages}),
        )
        response.raise_for_status()
        answer = response.json()['choices'][0]['message']['content']
        return answer, used_images
    except Exception as e:
        return f'Error: {e}', []


gradio_ui = gr.Interface(
    fn=gradio_chat,
    inputs=[
        gr.Textbox(label='', placeholder='Try: Best island in Maldives'),
        gr.Dropdown(
            choices=[m['label'] for m in available_models],
            value=available_models[0]['label'],
            label='Select Model:',
        ),
    ],
    outputs=[
        gr.Textbox(label='Vega Response:'),
        gr.Gallery(label='Relevant Images', allow_preview=True),
    ],
    title='',
    description='',
    theme=gr.themes.Default(primary_hue='sky'),
    css='footer {visibility: hidden}',
    deep_link=False,
)

if __name__ == '__main__':
    gradio_ui.launch(share=False)
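
# Usage notes (a sketch; the pip package names are assumptions inferred from
# the imports above, and app.py is a hypothetical filename):
#
#   pip install gradio moviepy yt-dlp SpeechRecognition pillow requests \
#       llama-index llama-index-vector-stores-lancedb \
#       llama-index-embeddings-huggingface lancedb
#   export API_KEY=<your OpenRouter key>   # read by gradio_chat()
#   python app.py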