dotfiles/servers/fern/configuration.nix

{
  pkgs,
  lib,
  config,
  ...
}:

let
  # Custom llama.cpp build for this host: CUDA backend only, pinned to the
  # upstream tag `b<version>`, with host-CPU optimisations enabled.
  llama-cpp = (
    (pkgs.llama-cpp.override {
      cudaSupport = true;
      rocmSupport = false;
      metalSupport = false;
      blasSupport = true;
    }).overrideAttrs
      (prevAttrs: rec {
        # Stop the nixpkgs compiler wrapper from stripping -march=native,
        # then run whatever preConfigure the base package already defined.
        preConfigure = ''
          export NIX_ENFORCE_NO_NATIVE=0
          ${prevAttrs.preConfigure or ""}
        '';
        # Upstream build number; also drives the git tag and LLAMA_BUILD_NUMBER.
        version = "8999";
        src = pkgs.fetchFromGitHub {
          owner = "ggml-org";
          repo = "llama.cpp";
          tag = "b${version}";
          hash = "sha256-EgJ3Die/WpVm9dtQ2kwXoV4RAWNY9x7lT4wun79qqCI=";
          # Keep .git only long enough to record the short commit hash in
          # $out/COMMIT, then delete it so the fetched tree has no git metadata.
          leaveDotGit = true;
          postFetch = ''
            git -C "$out" rev-parse --short HEAD > $out/COMMIT
            find "$out" -name .git -print0 | xargs -0 rm -rf
          '';
        };
        npmDepsHash = "sha256-k62LIbyY2DXvs7XXbX0lNPiYxuYzeJUyQtS4eA+68f8=";
        # NOTE: this list replaces the base package's cmakeFlags entirely
        # (prevAttrs.cmakeFlags is not carried over).
        cmakeFlags = with lib; [
          # Optimise for the build host's CPU (-march=native). The result is
          # machine-specific and not reproducible across different build hosts.
          (cmakeBool "GGML_NATIVE" true)
          (cmakeBool "LLAMA_BUILD_EXAMPLES" false)
          (cmakeBool "LLAMA_BUILD_SERVER" true)
          (cmakeBool "LLAMA_BUILD_TESTS" false)
          (cmakeBool "LLAMA_OPENSSL" true)
          (cmakeBool "BUILD_SHARED_LIBS" true)
          # (cmakeBool "GGML_BLAS" false)
          (cmakeBool "GGML_LTO" true)
          # NOTE(review): the CLBlast backend appears to have been removed
          # upstream, so this flag may be a no-op — confirm and drop if so.
          (cmakeBool "GGML_CLBLAST" true)
          (cmakeBool "GGML_CUDA" true)
          (cmakeBool "GGML_CUDA_GRAPHS" true)
          (cmakeBool "GGML_CUDA_F16" true)
          (cmakeBool "GGML_CUDA_FA_ALL_QUANTS" true)
          # (cmakeBool "GGML_HIP" false)
          # (cmakeBool "GGML_METAL" false)
          # (cmakeBool "GGML_RPC" false)
          # (cmakeBool "GGML_VULKAN" false)
          # Derived from `version` so the reported build number cannot drift
          # from the tag that is actually fetched.
          (cmakeFeature "LLAMA_BUILD_NUMBER" version)
          # Compile CUDA kernels for architecture 120 only.
          (cmakeFeature "CMAKE_CUDA_ARCHITECTURES" "120")
        ];
      })
  );
in
{
  # OpenRGB daemon.
  services.hardware.openrgb = {
    enable = true;
  };
  # Remote port forwards (`ssh -R`) to this server should bind publicly
  # instead of to loopback only.
  services.openssh.settings = {
    GatewayPorts = "yes";
  };
  # Kernel command-line parameters for this host.
  boot.kernelParams = [
    # attempt to fix nvidia perf
    "nvidia_drm.fbdev=1"
    "nvidia_drm.modeset=1"
    "module_blacklist=i915"
    "delayacct"
    "initcall_blacklist=sysfb_init"
    # "quiet"
    # "splash"
    "boot.shell_on_fail"
    # Keep boot output quiet: lower kernel, initrd systemd and udev log levels.
    "loglevel=3"
    "rd.systemd.show_status=false"
    "rd.udev.log_level=3"
    "udev.log_priority=3"
  ];

  # VR: Monado OpenXR runtime (currently disabled; kept for reference)
  # services.monado = {
  #   enable = false;
  #   defaultRuntime = true; # Register as default OpenXR runtime
  # };
  # systemd.user.services.monado.environment = {
  #   STEAMVR_LH_ENABLE = "1";
  #   XRT_COMPOSITOR_COMPUTE = "1";
  #   WMR_HANDTRACKING = "0";
  #   VIT_SYSTEM_LIBRARY_PATH = "${pkgs.basalt-monado}/lib/libbasalt.so";
  # };
  # Steam, with extra shell lines added to the wrapper's profile for VR use.
  programs.steam.enable = true;
  programs.steam.package = pkgs.steam.override {
    extraProfile = ''
      # Fixes timezones on VRChat
      unset TZ
      # Allows Monado/WiVRn to be used
      export PRESSURE_VESSEL_IMPORT_OPENXR_1_RUNTIMES=1
    '';
  };

  # paseo service: runs as dan:users on port 5656, relay turned off.
  services.paseo = {
    enable = true;

    # Identity the service runs under.
    user = "dan";
    group = "users";

    # Network exposure.
    port = 5656;
    openFirewall = true;

    relay = {
      enable = false;
    };
  };

  hardware.cpu.amd = {
    updateMicrocode = true;
  };

  # Graphics stack, including the 32-bit variant for Steam and the NVIDIA
  # VA-API driver as an extra package.
  hardware.graphics = {
    enable = true;
    # package = unstable-pkgs.mesa.drivers;
    # Steam support
    enable32Bit = true;
    # package32 = unstable-pkgs.pkgsi686Linux.mesa.drivers;
    extraPackages = [ pkgs.nvidia-vaapi-driver ];
  };
  # System-wide packages.
  environment.systemPackages = [
    pkgs.nvitop
    # pkgs.basalt-monado
    pkgs.cudaPackages.cuda_nvcc
    # The customised build from the `let` block of this file, not pkgs.llama-cpp.
    llama-cpp
  ];
  # llama-swap: starts the matching llama-server command for a requested model.
  services.llama-swap =
    let
      # Common prefix of every model path configured below.
      modelRoot = "/home/dan/.lmstudio/models";

      # Model entry that runs the shared `llama` macro against one GGUF file,
      # given its path relative to modelRoot.
      serve = relPath: {
        cmd = "\${llama} -m ${modelRoot}/${relPath}";
      };
    in
    {
      enable = true;
      openFirewall = true;
      settings = {
        #      listen = "0.0.0.0:8080";
        globalTTL = 3600;

        macros = {
          models_dir = "\${env.HOME}/models";
          # Shared llama-server invocation. `''${PORT}` is emitted literally as
          # ${PORT} for llama-swap to substitute; it is not a Nix interpolation.
          llama = ''
            ${lib.getExe' llama-cpp "llama-server"} \
            --port ''${PORT} \
            --alias "unsloth/qwen" \
            --no-webui \
            --ctx-size 131072 \
            --fit on --fit-ctx 131072 --fit-target 256 \
            --temp 1.0 --top-p 0.95 --top-k 64 \
            --repeat-penalty 1.0 \
            -ctk q8_0 -ctv q8_0 \
            --flash-attn on \
            --batch-size 1024 --ubatch-size 512 \
            --threads 12 --threads-batch 12 \
            --no-mmap --mlock \
            --parallel 1 --prio 2 --no-warmup --jinja
          '';
        };

        models = {
          # qwen3-embedding-8b = {
          # };
          # "qwen3-embedding-0.6" = { };
          "qwen3.6-35B-A3B" = serve "unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf";
          "gemma-4-26B-A4B" = serve "lmstudio-community/gemma-4-26B-A4B-it-GGUF/gemma-4-26B-A4B-it-Q4_K_M.gguf";
          "qwen3.5-9B" = serve "lmstudio-community/Qwen3.5-9B-GGUF/Qwen3.5-9B-Q4_K_M.gguf";
          "qwen3.5-9B-sushi" = serve "bigatuna/Qwen3.5-9b-Sushi-Coder-RL-GGUF/Qwen3.5-9b-Sushi-Coder-RL.Q4_K_M.gguf";
        };
      };
    };
  # Overrides for the unit generated by services.llama-swap. The model files
  # live under /home/dan, so the service runs as dan:users with home access
  # instead of the module's sandboxed defaults.
  systemd.services.llama-swap =
    let
      cfg = config.services.llama-swap;
      # Render the module's settings to YAML ourselves, because ExecStart is
      # replaced wholesale below.
      configFile = (pkgs.formats.yaml { }).generate "config.yaml" cfg.settings;
    in
    {
      # Provides the value for the `${env.HOME}` macro in the llama-swap settings.
      environment.HOME = "/home/dan";
      serviceConfig = {
        ProtectHome = lib.mkForce false;
        DynamicUser = lib.mkForce false;
        User = lib.mkForce "dan";
        Group = lib.mkForce "users"; # or dan's primary group
        # Listen on all IPv4 interfaces. Uses the module's package option
        # rather than pkgs.llama-swap, so a services.llama-swap.package
        # override is respected.
        ExecStart = lib.mkForce "${lib.getExe cfg.package} --listen 0.0.0.0:${toString cfg.port} --config ${configFile}";
      };
    };

  # nix-serve, using the secret key stored at the path below.
  services.nix-serve.enable = true;
  services.nix-serve.secretKeyFile = "/var/secrets/cache-private-key.pem";

  # Caddy on port 80, reverse-proxying the two local services by hostname.
  services.caddy =
    let
      # Caddyfile snippet that forwards a virtual host to a local port.
      proxyToLocal = port: ''
        reverse_proxy http://localhost:${toString port}
      '';
    in
    {
      enable = true;

      virtualHosts."llama.fern.danbulant.cloud:80".extraConfig =
        proxyToLocal config.services.llama-swap.port;

      virtualHosts."nix.fern.danbulant.cloud:80".extraConfig =
        proxyToLocal config.services.nix-serve.port;
    };

  # Automatic Nix store optimisation and garbage collection, both persistent.
  nix.optimise.automatic = true;
  nix.optimise.persistent = true;

  nix.gc.automatic = true;
  nix.gc.persistent = true;

  # NVIDIA driver: open kernel modules, modesetting, and the settings tool.
  hardware.nvidia.open = true;
  hardware.nvidia.modesetting.enable = true;
  # hardware.nvidia.powerManagement.enable = true;
  hardware.nvidia.nvidiaSettings = true;

  services.xserver.videoDrivers = [ "nvidia" ];
  # powerManagement.enable = true;

  # NVIDIA container toolkit, with CDI enabled in the Docker daemon.
  hardware.nvidia-container-toolkit.enable = true;
  virtualisation.docker.daemon.settings.features.cdi = true;
}