diff --git a/PulumiWebServer/.gitignore b/PulumiWebServer/.gitignore new file mode 100644 index 0000000..f7275bb --- /dev/null +++ b/PulumiWebServer/.gitignore @@ -0,0 +1 @@ +venv/ diff --git a/PulumiWebServer/Domain.fs b/PulumiWebServer/Domain.fs index b463e4b..86a0ed7 100644 --- a/PulumiWebServer/Domain.fs +++ b/PulumiWebServer/Domain.fs @@ -102,6 +102,7 @@ type WellKnownSubdomain = | WoodpeckerAgent | Grafana | PureGym + | Whisper override this.ToString () = match this with @@ -113,6 +114,7 @@ type WellKnownSubdomain = | Woodpecker -> "woodpecker" | WoodpeckerAgent -> "woodpecker-agent" | PureGym -> "puregym" + | Whisper -> "whisper" static member Parse (s : string) = match s with @@ -124,6 +126,7 @@ type WellKnownSubdomain = | "woodpecker-agent" -> WellKnownSubdomain.WoodpeckerAgent | "grafana" -> WellKnownSubdomain.Grafana | "puregym" -> WellKnownSubdomain.PureGym + | "whisper" -> WellKnownSubdomain.Whisper | _ -> failwith $"Failed to deserialise: {s}" diff --git a/PulumiWebServer/Nix/configuration.nix b/PulumiWebServer/Nix/configuration.nix index 33e73b4..472c985 100644 --- a/PulumiWebServer/Nix/configuration.nix +++ b/PulumiWebServer/Nix/configuration.nix @@ -2,6 +2,7 @@ nixpkgs, website, puregym-client, + whisper-packages, ... }: let lib = nixpkgs.lib; @@ -24,6 +25,7 @@ in { # generated at runtime by nixos-infect and copied here ./hardware-configuration.nix ./networking.nix + ./whisper/whisper.nix ]; services.radicale-config.domain = userConfig.domain; @@ -47,6 +49,8 @@ in { services.prometheus-config.domain-exporter-domains = [userConfig.domain]; services.puregym-config.domain = userConfig.domain; services.puregym-config.subdomain = "puregym"; + services.whisper-config.domain = userConfig.domain; + services.whisper-config.subdomain = "whisper"; services.journald.extraConfig = "SystemMaxUse=100M"; diff --git a/PulumiWebServer/Nix/flake.lock b/PulumiWebServer/Nix/flake.lock index ec2849e..c719447 100644 --- a/PulumiWebServer/Nix/flake.lock +++ b/PulumiWebServer/Nix/flake.lock @@ -99,6 +99,24 @@ "inputs": { "systems": "systems_3" }, + "locked": { + "lastModified": 1701680307, + "narHash": "sha256-kAuep2h5ajznlPMD9rnQyffWG8EM/C73lejGofXvdM8=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "4022d587cbbfd70fe950c1e2083a02621806a725", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "flake-utils_4": { + "inputs": { + "systems": "systems_4" + }, "locked": { "lastModified": 1694529238, "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", @@ -113,6 +131,24 @@ "type": "github" } }, + "flake-utils_5": { + "inputs": { + "systems": "systems_5" + }, + "locked": { + "lastModified": 1701680307, + "narHash": "sha256-kAuep2h5ajznlPMD9rnQyffWG8EM/C73lejGofXvdM8=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "4022d587cbbfd70fe950c1e2083a02621806a725", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, "home-manager": { "inputs": { "nixpkgs": [ @@ -120,11 +156,11 @@ ] }, "locked": { - "lastModified": 1703795120, - "narHash": "sha256-Scr4fwfGn03zwFgM7IltT8hqbFDkHvymnF5AaR4eDAg=", + "lastModified": 1705660020, + "narHash": "sha256-1tOuNh+UbiZlaC8RrpQzzypgnLBC67eRlBunfkE4sbQ=", "owner": "nix-community", "repo": "home-manager", - "rev": "ba6b75011b44e85b1b755b6c423f85d0817645f7", + "rev": "2064348e555b6aa963da6372a8f14e6acb80a176", "type": "github" }, "original": { @@ -139,17 +175,15 @@ "website", "flake-utils" ], - "nixpkgs": [ - "website", - "nixpkgs" - ], + "nixpkgs": "nixpkgs_5", "scripts": "scripts_2" }, "locked": { - "dirtyRev": "9e2f5603f1e4e263e73ae0d0ca7c86ae14427c73-dirty", - "dirtyShortRev": "9e2f560-dirty", - "lastModified": 1701513782, - "narHash": "sha256-dDym75Eq6TIw9IrokBWwSoto0/l3nxFGpH4/VZkeqrQ=", + "lastModified": 1704152342, + "narHash": "sha256-9ntmhbkkmZSoaVMYPmZ/HkzYphpIHIBrWv5viO2Ee48=", + "ref": "refs/heads/main", + "rev": "882c5d5703e639a1318ea4e69f3b8cbbfacfb3a0", + "revCount": 19, "type": "git", "url": "file:/Users/patrick/Desktop/website/static-site-images" }, @@ -160,21 +194,15 @@ }, "katex": { "inputs": { - "flake-utils": [ - "website", - "flake-utils" - ], - "nixpkgs": [ - "website", - "nixpkgs" - ] + "flake-utils": "flake-utils_4", + "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1696151934, - "narHash": "sha256-8kihcqdgYjoVuGozfgfcWh81yqMUvns4+C/fgkn+RNQ=", + "lastModified": 1704150937, + "narHash": "sha256-G6uJKkY5VErgobe51IIbp/ugHDIhVx5e0xNjJ90JEOk=", "owner": "Smaug123", "repo": "KaTeX", - "rev": "ac1f9b30441f63ea20216a36ffa7148dc0e9a9b3", + "rev": "b74ed701beec2bebd161a0b5ea30c496c5206b96", "type": "github" }, "original": { @@ -184,13 +212,25 @@ "type": "github" } }, + "model": { + "flake": false, + "locked": { + "narHash": "sha256-CSrDFPoQgkvBW36keGkYpjxz740TBrCKVSxwSnfYvV8=", + "type": "file", + "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin?download=true" + }, + "original": { + "type": "file", + "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin?download=true" + } + }, "nixpkgs": { "locked": { - "lastModified": 1703467016, - "narHash": "sha256-/5A/dNPhbQx/Oa2d+Get174eNI3LERQ7u6WTWOlR1eQ=", + "lastModified": 1705458851, + "narHash": "sha256-uQvEhiv33Zj/Pv364dTvnpPwFSptRZgVedDzoM+HqVg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d02d818f22c777aa4e854efc3242ec451e5d462a", + "rev": "8bf65f17d8070a0a490daf5f1c784b87ee73982c", "type": "github" }, "original": { @@ -202,11 +242,11 @@ }, "nixpkgs-stable": { "locked": { - "lastModified": 1703351344, - "narHash": "sha256-9FEelzftkE9UaJ5nqxidaJJPEhe9TPhbypLHmc2Mysc=", + "lastModified": 1705033721, + "narHash": "sha256-K5eJHmL1/kev6WuqyqqbS1cdNnSidIZ3jeqJ7GbrYnQ=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "7790e078f8979a9fcd543f9a47427eeaba38f268", + "rev": "a1982c92d8980a0114372973cbdfe0a307f1bdea", "type": "github" }, "original": { @@ -233,11 +273,11 @@ }, "nixpkgs_3": { "locked": { - "lastModified": 1703134684, - "narHash": "sha256-SQmng1EnBFLzS7WSRyPM9HgmZP2kLJcPAz+Ug/nug6o=", + "lastModified": 1704842529, + "narHash": "sha256-OTeQA+F8d/Evad33JMfuXC89VMetQbsU4qcaePchGr4=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d6863cbcbbb80e71cecfc03356db1cda38919523", + "rev": "eabe8d3eface69f5bb16c18f8662a702f50c20d5", "type": "github" }, "original": { @@ -262,6 +302,52 @@ "type": "github" } }, + "nixpkgs_5": { + "locked": { + "lastModified": 1704150997, + "narHash": "sha256-HbBTRybyqmd2/OMHIA6bV8HNXpcwB/t49be2kBq13IE=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2df2bddf079263a6da2eb1876c7e212188ff950c", + "type": "github" + }, + "original": { + "owner": "NixOS", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_6": { + "locked": { + "lastModified": 1688392541, + "narHash": "sha256-lHrKvEkCPTUO+7tPfjIcb7Trk6k31rz18vkyqmkeJfY=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "ea4c80b39be4c09702b0cb3b42eab59e2ba4f24b", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-22.11", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_7": { + "locked": { + "lastModified": 1695033101, + "narHash": "sha256-RQ4m+ycjdLdass7Hr4+Lzwnjw7wGhcUkKqWiJS3YxPM=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "d941d9491804e0ca01e03468dbf6f8d3a7919a16", + "type": "github" + }, + "original": { + "owner": "nixos", + "repo": "nixpkgs", + "type": "github" + } + }, "pdfs": { "inputs": { "flake-utils": [ @@ -294,11 +380,11 @@ "nixpkgs": "nixpkgs_2" }, "locked": { - "lastModified": 1703797686, - "narHash": "sha256-4HZ+uz7LFK+44IzKuLe9lL34Oau/J1Tppmxpe+x5FCw=", + "lastModified": 1703937582, + "narHash": "sha256-M4y/xbrocPoLwG4qUUdCoBvOHumPAGlMoeo8SpWjn0M=", "ref": "refs/heads/main", - "rev": "8ece87ff57b0ae66f38120d8a26b33661625fa61", - "revCount": 5, + "rev": "cdbc73b07f3cac88e446fbe73c4b0c6616448319", + "revCount": 8, "type": "git", "url": "https://gitea.patrickstevens.co.uk/patrick/puregym-unofficial-dotnet" }, @@ -313,7 +399,8 @@ "nixpkgs": "nixpkgs", "puregym-client": "puregym-client", "sops": "sops", - "website": "website" + "website": "website", + "whisper-packages": "whisper-packages" } }, "scripts": { @@ -382,11 +469,11 @@ "nixpkgs-stable": "nixpkgs-stable" }, "locked": { - "lastModified": 1703387502, - "narHash": "sha256-JnWuQmyanPtF8c5yAEFXVWzaIlMxA3EAZCh8XNvnVqE=", + "lastModified": 1705356877, + "narHash": "sha256-274jL1cH64DcXUXebVMZBRUsTs3FvFlPIPkCN/yhSnI=", "owner": "Mic92", "repo": "sops-nix", - "rev": "e523e89763ff45f0a6cf15bcb1092636b1da9ed3", + "rev": "87755331580fdf23df7e39b46d63ac88236bf42c", "type": "github" }, "original": { @@ -440,6 +527,36 @@ "type": "github" } }, + "systems_4": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "systems_5": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, "website": { "inputs": { "anki-decks": "anki-decks", @@ -454,11 +571,11 @@ "scripts": "scripts_4" }, "locked": { - "lastModified": 1701514896, - "narHash": "sha256-XDhco86dHsoHzezarG1UQBpsCyZ+AqRY+w+l3g4hL1o=", + "lastModified": 1705693021, + "narHash": "sha256-Ew7yxjYvwG9IE6YNPxcHNQUn2T8es1yWoK/f+x7UyCM=", "owner": "Smaug123", "repo": "static-site-pipeline", - "rev": "b35c219d0e3e93b5bbd52befa486b54fa4e8b710", + "rev": "da3a4d2e53e3068755fcd49b74bcfcf2800c197d", "type": "github" }, "original": { @@ -466,6 +583,27 @@ "repo": "static-site-pipeline", "type": "github" } + }, + "whisper-packages": { + "inputs": { + "flake-utils": "flake-utils_5", + "model": "model", + "nixpkgs": "nixpkgs_7" + }, + "locked": { + "lastModified": 1704121990, + "narHash": "sha256-ss1gDu3C7anyqEo0ksDuOPgcsD3EOUqtCwAt8ei6rM4=", + "owner": "Smaug123", + "repo": "whisper.cpp", + "rev": "2ff0257983b4a3fe9cece30b215ba2b5087f2613", + "type": "github" + }, + "original": { + "owner": "Smaug123", + "ref": "nix-small", + "repo": "whisper.cpp", + "type": "github" + } } }, "root": "root", diff --git a/PulumiWebServer/Nix/flake.nix b/PulumiWebServer/Nix/flake.nix index 1b3019a..eabb610 100644 --- a/PulumiWebServer/Nix/flake.nix +++ b/PulumiWebServer/Nix/flake.nix @@ -13,6 +13,7 @@ inputs.nixpkgs.follows = "nixpkgs"; }; sops.url = "github:Mic92/sops-nix"; + whisper-packages.url = "github:Smaug123/whisper.cpp/nix-small"; }; outputs = { @@ -22,6 +23,7 @@ home-manager, website, puregym-client, + whisper-packages, } @ inputs: let system = "x86_64-linux"; in { @@ -31,6 +33,7 @@ inherit system; website = website.packages.${system}.default; puregym-client = puregym-client.packages.${system}.default; + whisper-packages = whisper-packages.packages.${system}; }; modules = [ (import ./configuration.nix (inputs // {inherit inputs;})) diff --git a/PulumiWebServer/Nix/whisper/requirements.txt b/PulumiWebServer/Nix/whisper/requirements.txt new file mode 100644 index 0000000..4319354 --- /dev/null +++ b/PulumiWebServer/Nix/whisper/requirements.txt @@ -0,0 +1,2 @@ +flask +waitress diff --git a/PulumiWebServer/Nix/whisper/transcribe.html b/PulumiWebServer/Nix/whisper/transcribe.html new file mode 100644 index 0000000..c2097e6 --- /dev/null +++ b/PulumiWebServer/Nix/whisper/transcribe.html @@ -0,0 +1,107 @@ + + + + Whisper Transcription + + + +

Whisper Transcription

+

Submit file for transcription

+
+ + +
+ +
+ +
{no file set}
+
+ + + +
+ +
+
+ + + + diff --git a/PulumiWebServer/Nix/whisper/whisper.nix b/PulumiWebServer/Nix/whisper/whisper.nix new file mode 100644 index 0000000..a4dc83a --- /dev/null +++ b/PulumiWebServer/Nix/whisper/whisper.nix @@ -0,0 +1,65 @@ +{ + config, + pkgs, + lib, + whisper-packages, + ... +}: { + options = { + services.whisper-config = { + domain = lib.mkOption { + type = lib.types.str; + example = "example.com"; + description = lib.mdDoc "Top-level domain to configure"; + }; + subdomain = lib.mkOption { + type = lib.types.str; + example = "whisper"; + description = lib.mdDoc "Subdomain in which to put the Whisper server"; + }; + port = lib.mkOption { + type = lib.types.port; + description = lib.mdDoc "Whisper localhost port to be forwarded"; + default = 1739; + }; + }; + }; + + config = { + users.users."whisper".extraGroups = [config.users.groups.keys.name]; + users.users."whisper".group = "whisper"; + users.groups.whisper = {}; + users.users."whisper".isSystemUser = true; + + systemd.services.whisper-server = { + description = "whisper-server"; + wantedBy = ["multi-user.target"]; + serviceConfig = let + python = pkgs.python3.withPackages (p: with p; [flask waitress]); + in { + Restart = "always"; + Type = "exec"; + User = "whisper"; + Group = "whisper"; + ExecStart = "${python}/bin/python ${./whisper.py}"; + }; + environment = { + WHISPER_NORMALIZE = "${whisper-packages.normalize}/bin/normalize.sh"; + WHISPER_CLIENT = "${whisper-packages.default}/bin/whisper-cpp"; + WHISPER_PORT = toString config.services.whisper-config.port; + INDEX_PAGE_PATH = ./transcribe.html; + YT_DLP = "${pkgs.yt-dlp}/bin/yt-dlp"; + }; + }; + + services.nginx.proxyTimeout = "300s"; + services.nginx.clientMaxBodySize = "50M"; + services.nginx.virtualHosts."${config.services.whisper-config.subdomain}.${config.services.whisper-config.domain}" = { + forceSSL = true; + enableACME = true; + locations."/" = { + proxyPass = "http://localhost:${toString config.services.whisper-config.port}/"; + }; + }; + }; +} diff --git a/PulumiWebServer/Nix/whisper/whisper.py b/PulumiWebServer/Nix/whisper/whisper.py new file mode 100644 index 0000000..6d49b85 --- /dev/null +++ b/PulumiWebServer/Nix/whisper/whisper.py @@ -0,0 +1,129 @@ +import subprocess +import os +from typing import AnyStr +import re +from flask import Flask, Response, request, render_template_string +import waitress +import tempfile + +app = Flask(__name__) + +youtube_regex = re.compile( + r"^(?:https?://)?(?:www\.)?(?:youtu\.be/|youtube\.com/(?:embed/|v/|watch\?v=|watch\?.+&v=))((\w|-){11})(?:\S+)?$") + +alnum_regex = re.compile(r"^[a-zA-Z0-9]+$") + + +def generate_output(wav_file): + process = subprocess.Popen([whisper, "--file", f"/tmp/whisper/{wav_file}.wav", "--output-txt"], + stdout=subprocess.PIPE, bufsize=1, + text=True) + + yield f'event: started\ndata: {wav_file}\n\n' + + for line in iter(process.stdout.readline, ''): + yield f"data: {line}\n\n" + + yield 'event: quit\ndata: \n\n' + + os.remove(f"/tmp/whisper/{wav_file}.wav") + + +def obtain_youtube(url: AnyStr) -> str: + # handle, temp_file = tempfile.mkstemp(".wav", text=False) + # os.close(handle) + # os.remove(temp_file) + + # output = subprocess.run( + # [ytdlp, '--extract-audio', '--audio-format', 'wav', '--cookies', '/tmp/cookies.txt', '--audio-quality', '16k', '--force-ipv6', '--output', temp_file, + # url], check=True, capture_output=True, text=True) + # if "429 Too Many Requests" in output.stdout: + # raise subprocess.CalledProcessError(1, whisper, "YouTube replied saying Too Many Requests") + # return temp_file + + raise Exception("DigitalOcean is rate limited to YouTube") + + +def normalize(path: str, output: str): + try: + subprocess.run([normalize_binary, path, output], check=True) + except subprocess.CalledProcessError: + os.remove(path) + return Response("failed to normalize", status=500) + + +@app.route('/transcribe-youtube') +def transcribe_youtube(): + try: + url = request.args.get('url') + except KeyError: + return Response("must have a URL in the format ?url=https://www.youtube.com/watch?v=...", status=400) + if youtube_regex.match(url) is None: + return Response(f"url '{url}' did not appear to be a YouTube video", status=400) + wav_file = obtain_youtube(url) + return Response(generate_output(wav_file), mimetype="text/event-stream") + + +@app.route('/transcribe-file') +def transcribe_file(): + try: + file = request.args.get('file') + except KeyError: + return Response("must have a file as obtained from /upload, in the format ?file=...", status=400) + if alnum_regex.match(file) is None: + return Response(f"filename '{file}' was not alphanumeric", status=400) + return Response(generate_output(file), mimetype="text/event-stream") + + +@app.route('/transcribe-ui') +def index(): + return render_template_string(open(index_page_path).read()) # Assuming 'index.html' is in the same directory + + +@app.route('/upload', methods=["POST"]) +def upload(): + if 'file' not in request.files: + return 'No "file" part in request', 400 + file = request.files['file'] + + # Create temp file for this upload + handle, temp_file = tempfile.mkstemp(text=False) + try: + os.close(handle) + file.save(temp_file) + # get filename from absolute path + temp_file_frag = os.path.basename(temp_file) + + normalize(temp_file, f"/tmp/whisper/{temp_file_frag}") + finally: + try: + os.remove(temp_file) + finally: + pass + + return Response(temp_file_frag, mimetype="text/plain") + + +@app.route('/download') +def download(): + try: + file = request.args.get('file') + except KeyError: + return Response("must have a file parameter", status=400) + + if alnum_regex.match(file) is None: + return Response(f"file '{file}' was not alphanumeric, bad format", status=400) + + return Response(open(f"/tmp/whisper/{file}.wav", 'rb').read(), mimetype="audio/wav") + + +def run(port: int): + waitress.serve(app, host="0.0.0.0", port=port) + + +if __name__ == "__main__": + normalize_binary = os.environ["WHISPER_NORMALIZE"] + whisper = os.environ["WHISPER_CLIENT"] + index_page_path = os.environ["INDEX_PAGE_PATH"] + ytdlp = os.environ["YT_DLP"] + run(int(os.environ["WHISPER_PORT"])) diff --git a/PulumiWebServer/PulumiWebServer.fsproj b/PulumiWebServer/PulumiWebServer.fsproj index 8d6ed83..f442d14 100644 --- a/PulumiWebServer/PulumiWebServer.fsproj +++ b/PulumiWebServer/PulumiWebServer.fsproj @@ -1,7 +1,7 @@  - net7.0 + net8.0 Exe true @@ -9,10 +9,10 @@ - - + + - + @@ -54,6 +54,8 @@ + + PreserveNewest