{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [ "9pWjsoRu0NP1" ], "toc_visible": true, "gpuType": "T4", "authorship_tag": "ABX9TyNwUGOJJmdoUhSvbivS88h8", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "# Tiny test of recent text-to-music (TTM) models\n", "\n", "To run this notebook you need to do three things:\n", "1. Make sure the Colab runtime has an NVIDIA GPU available because CUDA is assumed.\n", "1. Request access to [Stable Audio Open](https://huggingface.co/stabilityai/stable-audio-open-1.0) and create a corresponding [access token](https://huggingface.co/settings/tokens) to paste into the Hugging Face login screen below.\n", "1. Pray to the software dependency gods that the `pip` install below still works[.](https://nixos.org/)" ], "metadata": { "id": "BqyjOuyQl2Un" } }, { "cell_type": "markdown", "source": [ "## Setup" ], "metadata": { "id": "9pWjsoRu0NP1" } }, { "cell_type": "code", "source": [ "# Versions are pinned to the ones shown in this cell's recorded output.\n", "# `%pip` installs into the running kernel's environment without relying on automagic.\n", "%pip install diffusers==0.30.2 transformers==4.44.2 torchsde==0.2.6" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "E__6vJXkf6GE", "outputId": "563a70f6-7ecb-4b84-9308-5271c1ff1c4b" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: diffusers in /usr/local/lib/python3.10/dist-packages (0.30.2)\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n", "Requirement already satisfied: torchsde in /usr/local/lib/python3.10/dist-packages (0.2.6)\n", "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.10/dist-packages (from diffusers) (8.4.0)\n", "Requirement already satisfied: filelock in 
/usr/local/lib/python3.10/dist-packages (from diffusers) (3.15.4)\n", "Requirement already satisfied: huggingface-hub>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from diffusers) (0.24.6)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from diffusers) (1.26.4)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from diffusers) (2024.5.15)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from diffusers) (2.32.3)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from diffusers) (0.4.4)\n", "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from diffusers) (9.4.0)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.5)\n", "Requirement already satisfied: scipy>=1.5 in /usr/local/lib/python3.10/dist-packages (from torchsde) (1.13.1)\n", "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from torchsde) (2.4.0+cu121)\n", "Requirement already satisfied: trampoline>=0.1.2 in /usr/local/lib/python3.10/dist-packages (from torchsde) (0.1.2)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.2->diffusers) (2024.6.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.2->diffusers) (4.12.2)\n", "Requirement already satisfied: sympy in 
/usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->torchsde) (1.13.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->torchsde) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->torchsde) (3.1.4)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata->diffusers) (3.20.1)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->diffusers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->diffusers) (3.8)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->diffusers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->diffusers) (2024.8.30)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->torchsde) (2.1.5)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->torchsde) (1.3.0)\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy\n", "import scipy\n", "import torch\n", "import pandas as pd\n", "import soundfile as sf\n", "import IPython.display as ipd" ], "metadata": { "id": "hE4uAUp7hH8j" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "from huggingface_hub import login\n", "\n", "login()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 359, "referenced_widgets": [ "b46758b315f24421afc14872a2828c7e", "e6d345ae656840d7b70a15a999b27020", "ad3929abedc84051baeeb9c13aa593bc", "9ad8999e8f6a4ea0b1a8a2372163f371", "086ef5823b174a198aff1797b1c7f8c6", "8fc8fa84f8054d008e7ff466bc5f7c62", "1b0db3a6526a4d6795356f1361cf8c0d", 
"b93bb18bb1c344b68049a200db7abfbc", "a59a510968e44ee19c780d5950ce56fe", "35003bb73f2341f593e38482fc1802bd", "e468da573da4422da44d4cd43cd3a958", "e6d83d6d97b04196b79ddf65368963ac", "fa517e7e214b4866ad951fc5a3df5c55", "c97d6f5c94a74eaeb6ec5aee3a555bf0", "9206a67e399a405ea60f3c3975ccf348", "3ca58b479dcd4f3f81869ef52070478b", "4883a5dd4b61450682f765ae6d8cf231" ] }, "id": "4yD38XImg_JL", "outputId": "e1ed8bf0-6cfc-40c4-827e-a8a1b610c72a" }, "execution_count": 3, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='
" ], "text/html": [ "\n", " \n", " " ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "markdown", "source": [ "### MusicGen" ], "metadata": { "id": "DshIv2OSljPh" } }, { "cell_type": "code", "source": [ "from transformers import AutoProcessor, MusicgenForConditionalGeneration\n", "\n", "# Load the text processor and the small MusicGen checkpoint, then move the model to the GPU.\n", "processor = AutoProcessor.from_pretrained(\"facebook/musicgen-small\")\n", "musicgen = MusicgenForConditionalGeneration.from_pretrained(\"facebook/musicgen-small\")\n", "musicgen = musicgen.to(\"cuda\")\n", "\n", "# `prompt` must already be defined by an earlier cell.\n", "inputs = processor(text=[prompt], padding=True, return_tensors=\"pt\").to(\"cuda\")\n", "# Keep the first (only) item of the generated batch, transposed before it is written to disk.\n", "audio = musicgen.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=1503)[0].T\n", "\n", "# Read the sampling rate from the model's audio codec config instead of hard-coding 32000,\n", "# so the written file stays correct if a different MusicGen checkpoint is loaded.\n", "musicgen_sampling_rate = musicgen.config.audio_encoder.sampling_rate\n", "sf.write(\"musicgen.ogg\", audio.numpy(force=True), musicgen_sampling_rate)\n", "ipd.Audio(\"musicgen.ogg\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 184 }, "id": "amAokV8HgWyp", "outputId": "11377714-3b00-4882-a922-9435692a13c0" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n", " WeightNorm.apply(module, name, dim)\n", "/usr/local/lib/python3.10/dist-packages/transformers/models/encodec/modeling_encodec.py:120: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n", "`torch.nn.functional.scaled_dot_product_attention` does not support having an empty attention mask. Falling back to the manual attention implementation. 
This warning can be removed using the argument `attn_implementation=\"eager\"` when loading the model.Note that this probably happens because `guidance_scale>1` or because you used `get_unconditional_inputs`. See https://github.com/huggingface/transformers/issues/31189 for more information.\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " " ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "markdown", "source": [ "### Stable Audio Open" ], "metadata": { "id": "0PZQK1Rzlg-K" } }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 302, "referenced_widgets": [ "8665466a5a2a4e6c9be6d102a39ca995", "a7ee5f1e013d477da3e05ed43339532e", "3f50e04c8deb4aafb48ede0e7d737dd7", "1071d64ee174418b894aa819fbc36501", "0e511d259f5b4e3fa3aba98e20440edd", "6b8fe6af41d74a389068db086b169eb0", "373c45a4198046d19d2cb9b97c7af7a2", "0ed4b92f2349490cb433e84181aafce0", "1d245dea3b4742958408ea480ee26f8f", "f23675375a134a889ab66b9acd129223", "b936a942e40e423498466fa80317cbf3", "03c5d3c2c4324d3ba781f9c3783f0a85", "32f7505cede14794ad9afab8a59940eb", "4a5ecbc9172e4919b0cc6db00a46e33e", "778bde1a2daf488b88f32690b99cebc0", "6e2a72d9917a48eaa36c6b6ff677e94e", "89b3f8cbf2ff4e5f899a4d6af1df8ee0", "a0a352f3becd4d6aad2cc0d78c6b2213", "abe214179ed64573ac1cd79b3eda8f40", "a75d47c9c64f470e8d8a2aa56ea5307d", "9a9969635be6431ca2786594e7435a46", "2100f38f1a2c4c6cbaa0cf1611e591b5" ] }, "id": "Xdvcq7tOfkGe", "outputId": "e185c720-c9cf-4c82-dbb0-a71389fd30e6" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Loading pipeline components...: 0%| | 0/6 [00:00=t0 but got ta=0.29999998211860657 and t0=0.3.\n", " warnings.warn(f\"Should have ta>=t0 but got ta={ta} and t0={self._start}.\")\n", "/usr/local/lib/python3.10/dist-packages/torchsde/_brownian/brownian_interval.py:599: UserWarning: Should have ta>=t0 but got ta=0.0 and t0=0.3.\n", " 
warnings.warn(f\"Should have ta>=t0 but got ta={ta} and t0={self._start}.\")\n", "/usr/local/lib/python3.10/dist-packages/torchsde/_brownian/brownian_interval.py:602: UserWarning: Should have tb>=t0 but got tb=0.29999998211860657 and t0=0.3.\n", " warnings.warn(f\"Should have {tb_name}>=t0 but got {tb_name}={tb} and t0={self._start}.\")\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " " ] }, "metadata": {}, "execution_count": 7 } ], "source": [ "from diffusers import StableAudioPipeline\n", "\n", "# Load Stable Audio Open in half precision and move the pipeline to the GPU.\n", "stableaudio = StableAudioPipeline.from_pretrained(\"stabilityai/stable-audio-open-1.0\", torch_dtype=torch.float16)\n", "stableaudio = stableaudio.to(\"cuda\")\n", "\n", "# Seeded CUDA generator that is handed to the pipeline call below.\n", "generator = torch.Generator(\"cuda\").manual_seed(0)\n", "\n", "# `prompt` must already be defined by an earlier cell.\n", "# NOTE(review): three waveforms are requested, but only the first one is saved below.\n", "audio = stableaudio(\n", "    prompt,\n", "    negative_prompt=\"Low quality.\",\n", "    num_inference_steps=200,\n", "    audio_end_in_s=30.0,\n", "    num_waveforms_per_prompt=3,\n", "    generator=generator,\n", ").audios\n", "\n", "# Take the first waveform, transpose it and cast it to float32 before converting to NumPy.\n", "output = audio[0].T.float().numpy(force=True)\n", "# Ask the pipeline's VAE for the sampling rate instead of hard-coding 44100.\n", "sf.write(\"stableaudio.ogg\", output, stableaudio.vae.sampling_rate)\n", "ipd.Audio(\"stableaudio.ogg\")" ] }, { "cell_type": "markdown", "source": [ "## Comparison table\n", "\n" ], "metadata": { "id": "vS37O57apvtw" } }, { "cell_type": "code", "source": [ "import base64\n", "\n", "\n", "def embed_audio(src):\n", " with open(src, \"rb\") as f:\n", " data = f.read()\n", " code = base64.b64encode(data).decode()\n", " html = f'