Final_Assignment_Template

Sleeping

File size: 4,267 Bytes

# tools/youtube_video_tool.py
import base64
import os
import re
import requests
import subprocess
import tempfile
from io import BytesIO
from tools.base_tool import BaseTool

import av
import yt_dlp
from tools.speech_recognition_tool import SpeechRecognitionTool

class YouTubeVideoTool(BaseTool):
    """Answer a question about a YouTube video from its transcript.

    The transcript comes from the video's English subtitles (manual first,
    then automatic captions, VTT only).  When none are usable and a speech
    tool was supplied, the audio track is transcribed instead.  The
    transcript is split into small chunks and each chunk is sent to the LLM
    together with the previous answer.

    Attributes:
        speech_tool: Optional fallback transcriber; its ``forward`` is called
            with ``audio=<wav path>`` and ``with_time_markers=True``.
        quality: Maximum video height used in the yt-dlp format selector.
        frame_interval: Stored as given; not read by the code in this class.
        chunk_duration: Stored as given; not read by the code in this class.
        debug: Stored as given; not read by the code in this class.
    """

    name = 'youtube_video'
    description = 'Process a YouTube video and answer questions based on content.'

    # Seconds to wait on the caption download before giving up.
    _CAPTION_TIMEOUT = 30
    # Inline markup inside a caption line, e.g. <c> or <00:00:01.000>.
    _TAG_RE = re.compile(r'<[^>]*>')
    # "[start]\ntext" followed by another "[time]" marker.  The closing marker
    # is a lookahead so that it can also open the next segment.
    _WHISPER_SEGMENT_RE = re.compile(
        r'\[\d+(?:\.\d+)?]\n(.+?)(?=\n\[\d+(?:\.\d+)?])'
    )

    def __init__(
        self,
        speech_tool: SpeechRecognitionTool = None,
        quality: int = 360,
        frame_interval: float = 2.0,
        chunk_duration: float = 2.0,
        debug: bool = False,
    ):
        """Store the configuration; no network or file access happens here."""
        # NOTE(review): BaseTool.__init__ is not invoked; confirm whether the
        # base class expects to be initialised.
        self.speech_tool = speech_tool
        self.quality = quality
        self.frame_interval = frame_interval
        self.chunk_duration = chunk_duration
        self.debug = debug

    def forward(self, url: str, query: str) -> str:
        """Return the answer to *query* for the video at *url*.

        Args:
            url: Address of the video, passed to yt-dlp.
            query: The question to answer.

        Returns:
            The stripped LLM response for the last transcript chunk, or an
            empty string when no transcript could be obtained.
        """
        video = self._download_video_info(url)
        captions = self._get_captions(video)
        # Missing or None metadata must not show up as the text "None".
        title = video.get('title') or ''
        description = video.get('description') or ''

        answer = ""
        for chunk in self._split_captions(captions):
            prompt = self._build_prompt(title, description, chunk, query, answer)
            response = self._mock_llm(prompt)  # replace with real call to your LLM
            answer = response.strip()

        return answer

    def _download_video_info(self, url: str):
        """Fetch the yt-dlp info dict for *url* without downloading media."""
        opts = {
            'quiet': True,
            'skip_download': True,
            'format': f'bestvideo[height<={self.quality}]+bestaudio/best',
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=False)

    def _get_captions(self, info: dict):
        """Return the transcript as a list of ``{"text": ...}`` segments.

        English VTT subtitles are preferred.  If they are absent, cannot be
        fetched, or contain no text, the speech tool (when configured)
        transcribes the audio track instead.

        Args:
            info: A yt-dlp info dict.

        Returns:
            A list of segment dicts; empty when no transcript source exists.

        Raises:
            requests.RequestException: The caption download failed and there
                is no speech tool to fall back on.
            IndexError: The speech fallback found no audio-only format.
            RuntimeError: ffmpeg produced no audio for the speech fallback.
        """
        lang = 'en'
        subs = (
            (info.get('subtitles') or {}).get(lang)
            or (info.get('automatic_captions') or {}).get(lang)
        )
        vtt_track = next((s for s in subs or [] if s.get('ext') == 'vtt'), None)

        if vtt_track is not None:
            try:
                response = requests.get(
                    vtt_track['url'], timeout=self._CAPTION_TIMEOUT
                )
            except requests.RequestException:
                if not self.speech_tool:
                    raise
                segments = []
            else:
                # An error page is not a transcript; treat it as "no captions".
                segments = self._parse_vtt(response.text) if response.ok else []
            if segments:
                return segments

        # fallback to Whisper-based transcription
        if not self.speech_tool:
            return []

        audio_url = self._select_audio_format(info.get('formats') or [])
        audio = self._download_audio(audio_url)
        wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        try:
            # Close the file before handing its path over so the data is
            # fully flushed and the file can be reopened on every platform.
            with wav_file:
                wav_file.write(audio.read())
            transcription = self.speech_tool.forward(
                audio=wav_file.name, with_time_markers=True
            )
        finally:
            try:
                os.unlink(wav_file.name)
            except OSError:
                # Already gone; nothing left to clean up.
                pass
        return self._parse_whisper_transcription(transcription)

    def _select_audio_format(self, formats):
        """Return the URL of the audio-only format with the highest bitrate.

        Args:
            formats: The ``formats`` list of a yt-dlp info dict.

        Raises:
            IndexError: No audio-only format is present.
        """
        audio_only = [
            f for f in formats
            if f.get('vcodec') == 'none' and f.get('acodec') != 'none'
        ]
        if not audio_only:
            raise IndexError("no audio-only format available")
        # 'abr' may be present but None, so .get('abr', 0) is not enough.
        return max(audio_only, key=lambda f: f.get('abr') or 0)['url']

    def _download_audio(self, audio_url: str) -> BytesIO:
        """Convert *audio_url* to 16 kHz mono WAV with ffmpeg, in memory.

        Raises:
            RuntimeError: ffmpeg exited with an error and wrote no output.
        """
        cmd = ["ffmpeg", "-i", audio_url, "-f", "wav", "-ac", "1", "-ar", "16000", "-"]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
        if proc.returncode != 0 and not proc.stdout:
            raise RuntimeError(
                f"ffmpeg failed with exit code {proc.returncode} and produced no audio"
            )
        return BytesIO(proc.stdout)

    def _parse_vtt(self, vtt_data: str):
        """Extract the caption text of every cue in a WebVTT document.

        A cue starts at a line containing ``-->`` and ends at the next blank
        line, the next timing line, or the end of the input.  Everything on
        the timing line itself (timestamps and cue settings) is ignored, and
        inline ``<...>`` tags are removed.  Cues without text are skipped.

        Returns:
            A list of ``{"text": ...}`` dicts, one per non-empty cue.
        """
        segments = []
        cue_lines = None  # None while outside a cue

        def flush():
            if cue_lines:
                text = " ".join(" ".join(cue_lines).split())
                if text:
                    segments.append({"text": text})

        for line in vtt_data.splitlines():
            if '-->' in line:
                flush()
                cue_lines = []
            elif not line.strip():
                flush()
                cue_lines = None
            elif cue_lines is not None:
                cue_lines.append(self._TAG_RE.sub('', line))
        flush()
        return segments

    def _parse_whisper_transcription(self, text: str):
        """Extract text segments from a time-marked transcription.

        A segment is a ``[seconds]`` marker line, one line of text, and a
        following ``[seconds]`` marker line.  The closing marker is not
        consumed, so it may double as the opening marker of the next segment.

        Returns:
            A list of ``{"text": ...}`` dicts in order of appearance.
        """
        found = self._WHISPER_SEGMENT_RE.findall(text or "")
        return [{"text": segment} for segment in found]

    def _split_captions(self, captions, chunk_size: int = 3):
        """Group consecutive captions into chunks of *chunk_size* segments.

        Args:
            captions: A list of ``{"text": ...}`` dicts.
            chunk_size: Number of captions per chunk; must be at least 1.

        Returns:
            A list of ``{"text": ...}`` dicts whose text is the space-joined
            text of each group.

        Raises:
            ValueError: *chunk_size* is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")
        # Simple fixed-length chunking
        return [
            {"text": " ".join(c["text"] for c in captions[i:i + chunk_size])}
            for i in range(0, len(captions), chunk_size)
        ]

    def _build_prompt(self, title, desc, chunk, query, prev):
        """Compose the LLM prompt for one transcript chunk.

        The previous answer is included only when *prev* is non-empty.
        Leading and trailing whitespace is stripped from the result.
        """
        base = (
            f"\nVideo Title: {title}\n"
            f"Video Description: {desc}\n"
            f"Transcript:\n{chunk['text']}\n"
        )
        if prev:
            base += f"\nPrevious answer: {prev}\n"
        base += f"Question: {query}"
        return base.strip()

    def _mock_llm(self, prompt: str):
        """Placeholder LLM: ignores *prompt* and returns a fixed reply."""
        # Replace this with call to your real LLM
        return "I need to keep watching."