{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8locV9sOyhkn", "outputId": "4842cac9-3a7f-4fc5-edeb-3ca5452fec53" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Fri Jun 13 10:31:53 2025 \n", "+-----------------------------------------------------------------------------------------+\n", "| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n", "|-----------------------------------------+------------------------+----------------------+\n", "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|=========================================+========================+======================|\n", "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 49C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |\n", "| | | N/A |\n", "+-----------------------------------------+------------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=========================================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------------------+\n" ] } ], "source": [ "!nvidia-smi" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NnO-D2woRdMR", "outputId": "d6ca31d7-269e-4995-991d-7b515d9e6154" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cloning into 'Voice2VoiceTranslate'...\n", "remote: Enumerating objects: 88, done.\u001b[K\n", "remote: Counting objects: 100% (88/88), done.\u001b[K\n", "remote: Compressing objects: 100% (74/74), done.\u001b[K\n", "remote: Total 88 (delta 39), reused 39 (delta 12), pack-reused 0 (from 0)\u001b[K\n", "Receiving objects: 100% (88/88), 7.91 MiB | 11.15 MiB/s, done.\n", "Resolving deltas: 100% (39/39), done.\n", "/content/Voice2VoiceTranslate/Voice2VoiceTranslate/Voice2VoiceTranslate\n" ] } ], "source": [ "!git clone https://github.com/rohitptnk/Voice2VoiceTranslate.git\n", "%cd Voice2VoiceTranslate" ] }, { "cell_type": "markdown", "metadata": { "id": "UCVhY-RBkP_Q" }, "source": [ "# Install and Import" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "collapsed": true, "id": "poBUySdreSA9", "outputId": "575cfee3-ff94-4880-b5c3-6f3df8caa0e6" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n" ] } ], "source": [ "!pip install --quiet -r requirements.txt" ] }, { "cell_type": "markdown", "metadata": { "id": "BieMqbzN2GnY" }, "source": [ "# Using each function separately" ] }, { "cell_type": "markdown", "metadata": { "id": "HBzVTrKIjCFz" }, "source": [ "## Convert Speech to Text using Whisper" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QIlmE2rffVCc", "outputId": "1c313b3e-a79b-4cc3-f9e8-6a55f891526f" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████| 139M/139M [00:06<00:00, 20.9MiB/s]\n" ] } ], "source": [ "# Transcribe audio\n", "from my_transcribe import transcribe_audio_locally\n", "audio_file = \"Input Audio Sample.wav\" # Supports many audio formats\n", "result = transcribe_audio_locally(audio_file, \"base\") # Using base model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "czqyT0rziVZz", "outputId": "fdac7ad1-b51d-4ab4-d556-8477494206e4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Not the best because eventually see the thing is for us everybody shines when a film shines. For us the film is the hero of the film. So we are always hoping and praying that the film is the thing that people take back the most. But this is also fine. It's like a good second prize.\n" ] } ], "source": [ "text = result[\"text\"]\n", "print(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mSsUsAS-UdSw", "outputId": "5033abbe-4d7f-42bd-ffcd-7f2290690fd2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saved to transcribed_text.txt\n" ] } ], "source": [ "# Save to a text file\n", "with open(\"transcribed_text.txt\", \"w\", encoding=\"utf-8\") as f:\n", " f.write(text)\n", "\n", "print(\"Saved to transcribed_text.txt\")" ] }, { "cell_type": "markdown", "metadata": { "id": "V7ipQTRbjPXq" }, "source": [ "## Translate text-to-text using Argos Translate" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fR_Q7Bb8w2ho" }, "outputs": [], "source": [ "# Read the transcribed text from the file\n", "with open(\"transcribed_text.txt\", \"r\", encoding=\"utf-8\") as f:\n", " text = f.read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ksXnYJNTkSji", "outputId": "7e5ef285-d3aa-4a0b-89f5-0f0a81efda8f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "English: Not the best because eventually see the thing is for us everybody shines when a film shines. For us the film is the hero of the film. So we are always hoping and praying that the film is the thing that people take back the most. But this is also fine. It's like a good second prize.\n", "Hindi: क्योंकि अंततः यह देखने की बात हमारे लिए है, हर कोई चमकता है जब एक फिल्म चमकती है। हमारे लिए फिल्म फिल्म का हीरो है। इसलिए हम हमेशा उम्मीद करते हैं और प्रार्थना करते हैं कि फिल्म वह चीज है जिसे लोग वापस लेते हैं। लेकिन यह भी ठीक है। यह एक अच्छा दूसरा पुरस्कार है।\n" ] } ], "source": [ "# Translate the text to Hindi\n", "from my_translate import translate_text\n", "\n", "hindi_translation = translate_text(text, \"en\", \"hi\")\n", "print(\"English:\", text)\n", "print(\"Hindi:\", hindi_translation)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f6YatTXjT5EH", "outputId": "459cc759-e54e-4376-eae5-ba6dba2a67e7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saved to hindi_translation.txt\n" ] } ], "source": [ "# Save to a text file\n", "with open(\"hindi_translation.txt\", \"w\", encoding=\"utf-8\") as f:\n", " f.write(hindi_translation)\n", "\n", "print(\"Saved to hindi_translation.txt\")" ] }, { "cell_type": "markdown", "metadata": { "id": "1BECaz-clOJB" }, "source": [ "## Text to Speech using Suno-Bark" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 55 }, "id": "Qho3YgeEZHJx", "outputId": "707a7092-cc95-4680-91d5-44367fd7b092" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "क्योंकि अंततः यह देखने की बात हमारे लिए है, हर कोई चमकता है जब एक फिल्म चमकती है। हमारे लिए फिल्म फिल्म का हीरो है। इसलिए हम हमेशा उम्मीद करते हैं और प्रार्थना करते हैं कि फिल्म वह चीज है जिसे लोग वापस लेते हैं। लेकिन यह भी ठीक है। यह एक अच्छा दूसरा पुरस्कार है।\n" ] } ], "source": [ "# Read the Hindi translation from the file\n", "with open(\"hindi_translation.txt\", \"r\", encoding=\"utf-8\") as f:\n", " hindi_translation = f.read()\n", " print(hindi_translation)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9WnxxN8oOGk0", "outputId": "6d028afd-25ca-40e4-ca61-888e066fd6a1" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\rohit\\OneDrive\\Desktop\\Files\\Vaanee\\Audio-Video Data\\Individual Audio\\Prepared\\V2V\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:143: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\rohit\\.cache\\huggingface\\hub\\models--suno--bark-small. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n", "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n", " warnings.warn(message)\n", "c:\\Users\\rohit\\OneDrive\\Desktop\\Files\\Vaanee\\Audio-Video Data\\Individual Audio\\Prepared\\V2V\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:143: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\rohit\\.cache\\huggingface\\hub\\models--suno--bark. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n", "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n", " warnings.warn(message)\n", "c:\\Users\\rohit\\OneDrive\\Desktop\\Files\\Vaanee\\Audio-Video Data\\Individual Audio\\Prepared\\V2V\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:143: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\rohit\\.cache\\huggingface\\hub\\models--ylacombe--bark-large. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n", "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n", " warnings.warn(message)\n", "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n" ] } ], "source": [ "# Convert the Hindi text to speech\n", "from my_tts import text_to_speech\n", "output_path = text_to_speech(hindi_translation, \"v2/hi_speaker_2\")" ] }, { "cell_type": "markdown", "metadata": { "id": "imIkrAB62Gnd" }, "source": [ "# Using App" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "X6yv9liw2Gnd" }, "outputs": [], "source": [ "from app import voice_to_voice\n", "\n", "input_path = \"Input Audio Sample.wav\"\n", "output_path = voice_to_voice(input_path)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "JczUoimK2Gnd", "outputId": "2ca1947a-3597-4924-ae4c-b16f6ee612be", "colab": { "base_uri": "https://localhost:8080/", "height": 75 } }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " " ] }, "metadata": {}, "execution_count": 11 } ], "source": [ "from IPython.display import Audio\n", "\n", "Audio(output_path)\n" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [], "include_colab_link": true }, "kernelspec": { "display_name": "vtv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 0 }