dotfiles/analysis/collect_network_libraries.py

#!/usr/bin/env python3
"""Collect network-facing Nix package/library dependency metadata for fern/eisen.

The script intentionally starts from explicit service-facing roots instead of the
full NixOS closure. The full closure includes desktop/session packages and base
system plumbing that are not meaningfully "reachable through network".
"""

from __future__ import annotations

import argparse
import csv
import json
import os
import re
import subprocess
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from collections import deque
from pathlib import Path
from typing import Any


# Repository root: the parent of the directory that contains this script.
REPO = Path(__file__).resolve().parents[1]
# Output directory for generated files (main() creates it if missing).
OUT = REPO / "analysis"
# Timeout, in seconds, passed to every urllib request made by this script.
HTTP_TIMEOUT = 8


# Higher numbers are processed first. These are the Internet/LAN/Tailscale-facing
# services and containers configured by servers/fern and servers/eisen.
#
# Each entry is (priority, host, kind, root name, Nix expression). root_expr()
# splices the expression into a `let` block in which `config` and `pkgs` are
# bound to the host's node, so expressions may reference either.
ROOTS = [
    (100, "fern", "service", "caddy", "config.services.caddy.package"),
    (98, "fern", "service", "openssh", "config.programs.ssh.package"),
    (97, "fern", "service", "llama-swap", "config.services.llama-swap.package"),
    (96, "fern", "service", "llama-cpp-server", "pkgs.llama-cpp"),
    (94, "fern", "service", "nix-serve", "config.services.nix-serve.package"),
    (92, "fern", "service", "steam-network-runtime", "config.programs.steam.package"),
    (90, "fern", "service", "kdeconnect", "pkgs.kdePackages.kdeconnect-kde"),
    (88, "fern", "service", "openrgb", "config.services.hardware.openrgb.package"),
    (86, "fern", "service", "docker", "config.virtualisation.docker.package"),
    (100, "eisen", "service", "caddy", "config.services.caddy.package"),
    (99, "eisen", "service", "tailscale", "config.services.tailscale.package"),
    (98, "eisen", "service", "openssh", "config.programs.ssh.package"),
    (97, "eisen", "service", "jellyfin", "config.services.jellyfin.package"),
    (96, "eisen", "service", "sonarr", "config.services.sonarr.package"),
    (95, "eisen", "service", "radarr", "config.services.radarr.package"),
    (94, "eisen", "service", "prowlarr", "config.services.prowlarr.package"),
    (93, "eisen", "service", "karakeep", "config.services.karakeep.package"),
    (92, "eisen", "service", "uptime-kuma", "config.services.uptime-kuma.package"),
    (91, "eisen", "service", "grafana", "config.services.grafana.package"),
    (90, "eisen", "service", "prometheus", "config.services.prometheus.package"),
    (89, "eisen", "service", "prometheus-node-exporter", "pkgs.prometheus-node-exporter"),
    (88, "eisen", "service", "exportarr-sonarr", "pkgs.exportarr"),
    (87, "eisen", "service", "exportarr-radarr", "pkgs.exportarr"),
    (86, "eisen", "service", "exportarr-prowlarr", "pkgs.exportarr"),
    (85, "eisen", "service", "glance", "config.services.glance.package"),
    (84, "eisen", "service", "dnsmasq", "config.services.dnsmasq.package"),
    (83, "eisen", "service", "docker", "config.virtualisation.docker.package"),
    (82, "eisen", "service", "llama-swap-exporter", "pkgs.callPackage ./servers/eisen/llama-swap-exporter/default.nix { }"),
]

# OCI container roots: (priority, host, kind, root name, image reference).
# eval_roots() appends these to the evaluated roots with no derivation path, so
# they appear in the root list but have no Nix dependency graph to walk.
CONTAINER_ROOTS = [
    (80, "eisen", "container", "gluetun", "qmcgaw/gluetun"),
    (79, "eisen", "container", "qbittorrent", "lscr.io/linuxserver/qbittorrent"),
    (78, "eisen", "container", "jackett", "lscr.io/linuxserver/jackett"),
    (77, "eisen", "container", "prometheus-qb", "ghcr.io/esanchezm/prometheus-qbittorrent-exporter"),
    (76, "eisen", "container", "tolgee", "tolgee/tolgee"),
]

# Captures `owner` and `repo` from a github.com reference written with either
# `/` or `:` after the host; the repo name stops at `.git`, `/`, `#`, `?`, or
# the end of the string.
GITHUB_RE = re.compile(r"github\.com[:/](?P<owner>[^/]+)/(?P<repo>[^/#?]+?)(?:\.git|/|#|\?|$)")
# Matches a 32-character lowercase-alphanumeric prefix followed by `-` and
# captures everything after it as `name`.
STORE_HASH_PREFIX_RE = re.compile(r"^[0-9a-z]{32}-(?P<name>.+)$")

# Hand-maintained fallback table: version-stripped package name ->
# (upstream source URL, primary language). static_upstream() consults it for
# packages whose derivation metadata carries no usable source link.
COMMON_UPSTREAMS = {
    "acl": ("https://git.savannah.nongnu.org/cgit/acl.git", "C"),
    "attr": ("https://git.savannah.nongnu.org/cgit/attr.git", "C"),
    "avahi": ("https://github.com/avahi/avahi", "C"),
    "bluez": ("https://git.kernel.org/pub/scm/bluetooth/bluez.git", "C"),
    "bzip2": ("https://sourceware.org/git/bzip2.git", "C"),
    "curl": ("https://github.com/curl/curl", "C"),
    "dbus": ("https://gitlab.freedesktop.org/dbus/dbus", "C"),
    "double-conversion": ("https://github.com/google/double-conversion", "C++"),
    "ffmpeg": ("https://git.ffmpeg.org/ffmpeg.git", "C"),
    "fuse": ("https://github.com/libfuse/libfuse", "C"),
    "glib": ("https://gitlab.gnome.org/GNOME/glib", "C"),
    "glibc": ("https://sourceware.org/git/glibc.git", "C"),
    "graphviz": ("https://gitlab.com/graphviz/graphviz", "C"),
    "gtk+3": ("https://gitlab.gnome.org/GNOME/gtk", "C"),
    "libarchive": ("https://github.com/libarchive/libarchive", "C"),
    "libbpf": ("https://github.com/libbpf/libbpf", "C"),
    "libbsd": ("https://gitlab.freedesktop.org/libbsd/libbsd", "C"),
    "libcbor": ("https://github.com/PJK/libcbor", "C"),
    "libedit": ("https://www.thrysoee.dk/editline/", "C"),
    "libfido2": ("https://github.com/Yubico/libfido2", "C"),
    "libmnl": ("https://git.netfilter.org/libmnl", "C"),
    "libnftnl": ("https://git.netfilter.org/libnftnl", "C"),
    "libpcap": ("https://github.com/the-tcpdump-group/libpcap", "C"),
    "libuv": ("https://github.com/libuv/libuv", "C"),
    "libxml2": ("https://gitlab.gnome.org/GNOME/libxml2", "C"),
    "libxslt": ("https://gitlab.gnome.org/GNOME/libxslt", "C"),
    "ncurses": ("https://invisible-island.net/ncurses/", "C"),
    "oniguruma": ("https://github.com/kkos/oniguruma", "C"),
    "openssl": ("https://github.com/openssl/openssl", "C"),
    "pcre2": ("https://github.com/PCRE2Project/pcre2", "C"),
    "pcsclite": ("https://pcsclite.apdu.fr/", "C"),
    "rhash": ("https://github.com/rhash/RHash", "C"),
    "sqlite": ("https://sqlite.org/src", "C"),
    "systemd": ("https://github.com/systemd/systemd", "C"),
    "xz": ("https://git.tukaani.org/xz.git", "C"),
    "zlib": ("https://github.com/madler/zlib", "C"),
}

# Library-name prefixes that parse_ecosystem() treats as NuGet packages even
# when the derivation path does not contain ".nupkg". Passed directly to
# str.startswith, so matching is case-sensitive.
NUGET_NAME_PREFIXES = (
    "AngleSharp",
    "AspNetCore",
    "Azure.",
    "BouncyCastle",
    "Castle.",
    "Dapper",
    "DryIoc",
    "Fluent",
    "HarfBuzzSharp",
    "ICU4N",
    "Jellyfin.",
    "MailKit",
    "MetaBrainz.",
    "Microsoft.",
    "Mono.",
    "NETStandard.",
    "Newtonsoft.",
    "NLog",
    "NodaTime",
    "NuGet.",
    "NUnit",
    "RestSharp",
    "Serilog",
    "Servarr.",
    "SkiaSharp",
    "SQLitePCLRaw",
    "StyleCop.",
    "System.",
    "runtime.",
)

# Exact NuGet package id -> source repository URL. apply_ecosystem_overrides()
# looks packages up here first and, on a hit, replaces whatever source link
# the registry lookup produced.
NUGET_REPO_OVERRIDES = {
    "AngleSharp": "https://github.com/AngleSharp/AngleSharp",
    "AngleSharp.Xml": "https://github.com/AngleSharp/AngleSharp.Xml",
    "BitFaster.Caching": "https://github.com/bitfaster/BitFaster.Caching",
    "BlurHashSharp": "https://github.com/MarkusPalcer/BlurHashSharp",
    "BlurHashSharp.SkiaSharp": "https://github.com/MarkusPalcer/BlurHashSharp",
    "BouncyCastle.Cryptography": "https://github.com/bcgit/bc-csharp",
    "Castle.Core": "https://github.com/castleproject/Core",
    "Dapper": "https://github.com/DapperLib/Dapper",
    "DryIoc.dll": "https://github.com/dadhi/DryIoc",
    "DryIoc.Microsoft.DependencyInjection": "https://github.com/dadhi/DryIoc",
    "FluentAssertions": "https://github.com/fluentassertions/fluentassertions",
    "FluentMigrator": "https://github.com/fluentmigrator/fluentmigrator",
    "FluentMigrator.Abstractions": "https://github.com/fluentmigrator/fluentmigrator",
    "FluentMigrator.Extensions.Postgres": "https://github.com/fluentmigrator/fluentmigrator",
    "FluentMigrator.Runner.Core": "https://github.com/fluentmigrator/fluentmigrator",
    "FluentMigrator.Runner.Postgres": "https://github.com/fluentmigrator/fluentmigrator",
    "FluentMigrator.Runner.SQLite": "https://github.com/fluentmigrator/fluentmigrator",
    "FluentValidation": "https://github.com/FluentValidation/FluentValidation",
    "HarfBuzzSharp": "https://github.com/mono/SkiaSharp",
    "HarfBuzzSharp.NativeAssets.Linux": "https://github.com/mono/SkiaSharp",
    "HarfBuzzSharp.NativeAssets.Win32": "https://github.com/mono/SkiaSharp",
    "HarfBuzzSharp.NativeAssets.macOS": "https://github.com/mono/SkiaSharp",
    "ICU4N": "https://github.com/NightOwl888/ICU4N",
    "ICU4N.Transliterator": "https://github.com/NightOwl888/ICU4N",
    "MailKit": "https://github.com/jstedfast/MailKit",
    "MetaBrainz.Common": "https://github.com/Zastai/MetaBrainz.Common",
    "MetaBrainz.Common.Json": "https://github.com/Zastai/MetaBrainz.Common.Json",
    "MetaBrainz.MusicBrainz": "https://github.com/Zastai/MetaBrainz.MusicBrainz",
    "Microsoft.Data.SqlClient": "https://github.com/dotnet/SqlClient",
    "Microsoft.Data.SqlClient.SNI.runtime": "https://github.com/dotnet/SqlClient",
    "Microsoft.Data.Sqlite": "https://github.com/dotnet/efcore",
    "Microsoft.Data.Sqlite.Core": "https://github.com/dotnet/efcore",
    "Newtonsoft.Json": "https://github.com/JamesNK/Newtonsoft.Json",
    "NLog": "https://github.com/NLog/NLog",
    "NodaTime": "https://github.com/nodatime/nodatime",
    "NUnit": "https://github.com/nunit/nunit",
    "NUnit3TestAdapter": "https://github.com/nunit/nunit3-vs-adapter",
    "RestSharp": "https://github.com/restsharp/RestSharp",
    "RestSharp.Serializers.SystemTextJson": "https://github.com/restsharp/RestSharp",
    "Sentry": "https://github.com/getsentry/sentry-dotnet",
    "Serilog": "https://github.com/serilog/serilog",
    "SkiaSharp": "https://github.com/mono/SkiaSharp",
    "SkiaSharp.HarfBuzz": "https://github.com/mono/SkiaSharp",
    "SkiaSharp.NativeAssets.Linux": "https://github.com/mono/SkiaSharp",
    "SkiaSharp.NativeAssets.Win32": "https://github.com/mono/SkiaSharp",
    "SkiaSharp.NativeAssets.macOS": "https://github.com/mono/SkiaSharp",
    "SQLitePCLRaw.bundle_e_sqlite3": "https://github.com/ericsink/SQLitePCL.raw",
    "SQLitePCLRaw.core": "https://github.com/ericsink/SQLitePCL.raw",
    "SQLitePCLRaw.lib.e_sqlite3": "https://github.com/ericsink/SQLitePCL.raw",
    "SQLitePCLRaw.provider.e_sqlite3": "https://github.com/ericsink/SQLitePCL.raw",
    "StyleCop.Analyzers": "https://github.com/DotNetAnalyzers/StyleCopAnalyzers",
}


def run(cmd: list[str], *, timeout: int = 120) -> str:
    """Execute *cmd* from the repository root and return its captured stdout.

    Args:
        cmd: Argument vector; executed directly, without a shell.
        timeout: Seconds to wait before subprocess.run gives up.

    Raises:
        RuntimeError: The command exited non-zero; the message contains the
            command line and the captured stderr.
    """
    completed = subprocess.run(
        cmd,
        cwd=REPO,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode == 0:
        return completed.stdout
    raise RuntimeError(f"command failed: {' '.join(cmd)}\n{completed.stderr}")


def write_json_atomic(path: Path, data: dict[str, Any]) -> None:
    """Serialize *data* to *path* by writing a sibling ``.tmp`` file first.

    The JSON is indented, key-sorted and newline-terminated. The temp file is
    then moved over *path*, so readers never observe a half-written file.
    """
    payload = json.dumps(data, indent=2, sort_keys=True) + "\n"
    staging = path.with_suffix(f"{path.suffix}.tmp")
    staging.write_text(payload)
    staging.replace(path)


def nix_string(s: str) -> str:
    r"""Render *s* as a double-quoted Nix string literal.

    JSON string syntax covers the escapes Nix shares with it (``\"``, ``\\``,
    ``\n``, ``\r``, ``\t``), so ``json.dumps`` does most of the work. Two
    differences are handled explicitly:

    * Nix has no ``\uXXXX`` escape, so non-ASCII characters are emitted
      verbatim instead of being escaped.
    * ``${`` starts antiquotation inside a Nix string and must be written as
      ``\${`` to stay literal.

    Control characters other than newline, carriage return and tab are still
    emitted as JSON ``\u`` escapes and are not expected in inputs here.
    """
    return json.dumps(s, ensure_ascii=False).replace("${", "\\${")


def root_expr() -> str:
    """Build the Nix expression that evaluates every entry of ROOTS.

    The expression is a list with one attribute set per root, produced by the
    ``mkRoot`` helper defined in the prelude. Each root's package expression
    is evaluated with ``config`` and ``pkgs`` bound to its host's node.
    """
    prelude = """
let
  flake = builtins.getFlake (toString ./.);
  clean = s: builtins.unsafeDiscardStringContext (toString s);
  listOrNull = x: if builtins.isList x then map clean x else if x == null then [] else [ (clean x) ];
  mkRoot = priority: host: kind: rootName: pkg: {
    inherit priority host kind rootName;
    packageName = pkg.name or rootName;
    pname = pkg.pname or null;
    version = pkg.version or null;
    storePath = clean pkg;
    drv = if pkg ? drvPath then clean pkg.drvPath else null;
    homepage = pkg.meta.homepage or null;
    description = pkg.meta.description or null;
    sourceUrls = listOrNull (pkg.src.urls or (pkg.src.url or null));
  };
in [
"""
    entries = []
    for priority, host, kind, name, expr in ROOTS:
        # fern is a plain NixOS configuration; every other host is a colmena node.
        if host == "fern":
            node = "flake.nixosConfigurations.fern"
        else:
            node = "flake.colmenaHive.nodes.eisen"
        entries.append(
            f"(let node = {node}; config = node.config; pkgs = node.pkgs; pkg = {expr}; "
            f"in mkRoot {priority} {nix_string(host)} {nix_string(kind)} {nix_string(name)} pkg)"
        )
    return prelude + "\n".join(entries) + "\n]"


def eval_roots() -> list[dict[str, Any]]:
    """Evaluate the Nix roots, add the container roots, and return them sorted.

    Container roots have no derivation or store path; their image reference is
    recorded as ``packageName`` and ``image``. The result is ordered by
    descending priority, then host, then root name.
    """
    raw = run(["nix", "eval", "--impure", "--json", "--expr", root_expr()], timeout=240)
    roots = json.loads(raw)
    roots.extend(
        {
            "priority": priority,
            "host": host,
            "kind": kind,
            "rootName": name,
            "packageName": image,
            "pname": name,
            "version": None,
            "storePath": None,
            "drv": None,
            "homepage": None,
            "description": "OCI image configured in virtualisation.oci-containers",
            "sourceUrls": [],
            "image": image,
        }
        for priority, host, kind, name, image in CONTAINER_ROOTS
    )

    def sort_key(entry: dict[str, Any]) -> tuple[int, Any, Any]:
        return (-int(entry["priority"]), entry["host"], entry["rootName"])

    roots.sort(key=sort_key)
    return roots


def derivation_show_recursive(drv: str) -> dict[str, Any]:
    """Return the recursive ``nix derivation show`` output keyed by .drv basename.

    Two output layouts are accepted: a wrapper object whose ``derivations``
    member is already keyed by basename, and a flat mapping keyed by full
    store path, whose keys are reduced to their basenames.
    """
    parsed = json.loads(run(["nix", "derivation", "show", "-r", drv], timeout=300))
    if isinstance(parsed, dict):
        wrapped = parsed.get("derivations")
        if isinstance(wrapped, dict):
            return wrapped
    return {Path(store_path).name: body for store_path, body in parsed.items()}


def drv_meta(drv: str, all_drvs: dict[str, Any]) -> dict[str, Any]:
    """Summarize one derivation's environment as a flat metadata dict.

    The derivation is looked up by basename first, then by the key as given.
    An unknown derivation yields a record whose name is derived from the .drv
    file name and whose other fields come from an empty environment.
    """
    basename = Path(drv).name
    if basename in all_drvs:
        record = all_drvs[basename]
    else:
        record = all_drvs.get(drv, {})
    env = record.get("env", {})

    raw_name = env.get("pname") or env.get("name") or basename.removesuffix(".drv")
    name = clean_library_name(raw_name)

    version = env.get("version")
    homepage = env.get("homepage") or env.get("meta.homepage")
    description = env.get("meta.description") or env.get("description")
    return {
        "name": name,
        "version": version,
        "homepage": homepage,
        "description": description,
        "source_link": source_from_env(env),
        "language": infer_language(name, env),
    }


def clean_library_name(name: str) -> str:
    """Strip the store-hash prefix and packaging suffixes from a derivation name.

    A leading ``<32 chars>-`` hash prefix is removed, followed by a trailing
    ``.drv`` and any of the known archive suffixes.

    ``.drv`` is removed before the archive suffixes because it is always the
    outermost extension: ``Foo.1.0.nupkg.drv`` must become ``Foo.1.0``, not
    ``Foo.1.0.nupkg``. It is also kept at the end of the list so names that
    only expose a ``.drv`` after another suffix is removed are still cleaned.
    """
    match = STORE_HASH_PREFIX_RE.match(name)
    if match:
        name = match.group("name")
    name = name.removesuffix(".drv")
    for suffix in (".nupkg", ".tar.gz", ".tar.xz", ".zip", ".drv"):
        name = name.removesuffix(suffix)
    return name


def source_from_env(env: dict[str, str]) -> str | None:
    """Return the first environment value that looks like a source location.

    A value qualifies when it is non-empty and contains ``http`` or
    ``github``. Well-known source keys are checked first, in a fixed order;
    after that any key whose lowercased name ends in ``url`` is considered,
    in the environment's own iteration order.
    """

    def looks_like_source(candidate: str | None) -> bool:
        return bool(candidate) and ("http" in candidate or "github" in candidate)

    for preferred in ("src", "urls", "url", "cargoDeps", "npmDeps", "goModules"):
        candidate = env.get(preferred)
        if looks_like_source(candidate):
            return candidate

    return next(
        (
            value
            for key, value in env.items()
            if key.lower().endswith("url") and looks_like_source(value)
        ),
        None,
    )


def infer_language(name: str, env: dict[str, str]) -> str | None:
    """Guess a package's implementation language from its name and build inputs.

    This is a substring heuristic over the lowercased name,
    ``nativeBuildInputs`` and ``buildInputs``. Rules are tried in a fixed
    order and the first hit wins; ``None`` means no rule matched.
    """
    haystack = " ".join(
        (name, env.get("nativeBuildInputs", ""), env.get("buildInputs", ""))
    ).lower()

    def mentions(*needles: str) -> bool:
        return any(needle in haystack for needle in needles)

    if mentions("python") or name.startswith("python"):
        return "Python"
    if mentions("cargo", "rustc"):
        return "Rust"
    # Go needs a "go" mention plus either a gomod reference or a goModules attr.
    if mentions("go") and (mentions("gomod") or "goModules" in env):
        return "Go"
    if mentions("node", "npm", "pnpm", "yarn"):
        return "JavaScript/TypeScript"
    if mentions("cmake", "gcc", "clang") or name.startswith(("qt", "k", "lib")):
        return "C/C++"
    return None


def static_upstream(name: str) -> dict[str, str] | None:
    """Return a hard-coded upstream for well-known packages, or None.

    Args:
        name: Package name, optionally followed by a ``-<version>`` suffix.

    Returns:
        ``{"source_link": ..., "language": ...}`` when the version-stripped
        name is in COMMON_UPSTREAMS or belongs to one of the recognised
        families (Qt, GStreamer, KDE Frameworks); otherwise ``None``.

    All exact-name checks use the version-stripped base so that versioned
    names such as ``gstreamer-1.24.0`` resolve the same way as bare ones.
    """
    base = re.sub(r"-\d+(?:\.\d+).*$", "", name)
    if base in COMMON_UPSTREAMS:
        source, language = COMMON_UPSTREAMS[base]
        return {"source_link": source, "language": language}

    # Qt-adjacent libraries that are not hosted on code.qt.io; pointing them
    # at the Qt cgit tree would produce links to repositories that don't exist.
    qt_adjacent = {
        "qca": "https://invent.kde.org/libraries/qca",
        "phonon": "https://invent.kde.org/libraries/phonon",
        "poppler": "https://gitlab.freedesktop.org/poppler/poppler",
    }
    if base in qt_adjacent:
        return {"source_link": qt_adjacent[base], "language": "C++"}
    if base.startswith("qt"):
        return {"source_link": f"https://code.qt.io/cgit/qt/{base}.git", "language": "C++"}

    if base.startswith("gst-") or base == "gstreamer":
        return {"source_link": f"https://gitlab.freedesktop.org/gstreamer/{base}", "language": "C"}

    kde_prefixes = (
        "karchive",
        "kauth",
        "kbookmarks",
        "kcmutils",
        "kcodecs",
        "kcompletion",
        "kconfig",
        "kconfigwidgets",
        "kcoreaddons",
        "kcrash",
        "kdbusaddons",
        "kdeclarative",
        "kded",
        "kdnssd",
        "kdoctools",
        "kfilemetadata",
        "kguiaddons",
        "ki18n",
        "kiconthemes",
        "kidletime",
        "kio",
        "kirigami",
        "kitemmodels",
        "kitemviews",
        "kjobwidgets",
        "knotifications",
        "kpackage",
        "kparts",
        "kpeople",
        "kpty",
        "kservice",
        "kstatusnotifieritem",
        "ksvg",
        "ktextwidgets",
        "kwallet",
        "kwidgetsaddons",
        "kwindowsystem",
        "kxmlgui",
        "solid",
        "sonnet",
        "syntax-highlighting",
    )
    if base.startswith(kde_prefixes):
        return {"source_link": f"https://invent.kde.org/frameworks/{base}", "language": "C++"}
    return None


def github_repo(*values: str | None) -> str | None:
    """Return ``owner/repo`` from the first value that references github.com.

    Empty and ``None`` values are skipped; ``None`` is returned when no value
    matches GITHUB_RE.
    """
    for candidate in filter(None, values):
        found = GITHUB_RE.search(candidate)
        if found is not None:
            owner, repo = found.group("owner", "repo")
            return f"{owner}/{repo}"
    return None


def noisy_for_review(row: dict[str, Any]) -> bool:
    """Tell whether a dependency row is build plumbing rather than a real library.

    A row is noisy when it is a raw ``.nupkg`` derivation with no version,
    when its lowercased library name is a known build tool or hook, or when
    that name contains one of the source/hook/patch/test-data markers.
    """
    library = row["library"].lower()
    drv_path = row.get("drv_path", "").lower()

    unversioned_nupkg = ".nupkg" in drv_path and not row.get("version_in_use")
    if unversioned_nupkg:
        return True

    build_tooling = {
        "bash",
        "coreutils",
        "coreutils-full",
        "stdenv-linux",
        "install-shell-files",
        "version-check-hook",
        "writable-tmpdir-as-home-hook",
        "auto-patchelf-hook",
        "pkg-config-wrapper",
        "gcc-wrapper",
        "gnumake",
        "cmake",
        "ninja",
        "patchelf",
        "remove-references-to",
        "strip-nondeterminism",
    }
    if library in build_tooling:
        return True

    markers = (
        "-source",
        "source-",
        "-go-modules",
        "builder.sh",
        "setup-hook",
        "-hook",
        ".patch",
        ".diff",
        "testdata",
        "fixture",
    )
    for marker in markers:
        if marker in library:
            return True
    return False


def github_json(path: str) -> dict[str, Any] | None:
    """Fetch ``https://api.github.com/<path>`` and return the decoded JSON body.

    This is a best-effort lookup: HTTP errors, connection failures, timeouts
    and bodies that are not valid JSON all yield ``None`` instead of raising,
    matching the behaviour of http_json.

    Args:
        path: API path relative to the GitHub API root, without a leading slash.
    """
    req = urllib.request.Request(
        f"https://api.github.com/{path}",
        headers={"Accept": "application/vnd.github+json", "User-Agent": "dotfiles-analysis"},
    )
    try:
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as res:
            return json.loads(res.read().decode())
    # JSONDecodeError is included so a non-JSON body (e.g. an HTML error page)
    # cannot abort the whole run.
    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, json.JSONDecodeError):
        return None


def http_json(url: str) -> dict[str, Any] | None:
    """GET *url* and return its body parsed as JSON, or ``None`` on failure.

    HTTP errors, connection failures, timeouts and malformed JSON are all
    reported as ``None``.
    """
    request = urllib.request.Request(
        url,
        headers={"Accept": "application/json", "User-Agent": "dotfiles-analysis"},
    )
    recoverable = (
        urllib.error.HTTPError,
        urllib.error.URLError,
        TimeoutError,
        json.JSONDecodeError,
    )
    try:
        with urllib.request.urlopen(request, timeout=HTTP_TIMEOUT) as response:
            body = response.read()
        return json.loads(body.decode())
    except recoverable:
        return None


def normalize_repo_url(value: str | None) -> str | None:
    """Normalize a repository reference into a plain URL.

    Surrounding whitespace, a ``git+`` scheme prefix and a trailing ``.git``
    are removed; ``git://github.com/`` and ``git@github.com:`` forms are
    rewritten to ``https://github.com/``. Falsy input returns ``None``.
    """
    if not value:
        return None

    url = value.strip().removeprefix("git+")
    for github_prefix in ("git://github.com/", "git@github.com:"):
        if url.startswith(github_prefix):
            url = "https://github.com/" + url.removeprefix(github_prefix)
    return url.removesuffix(".git")


def parse_ecosystem(row: dict[str, Any]) -> tuple[str | None, str | None, str | None]:
    """Classify a dependency row as ``(ecosystem, package, version)``.

    Rules are tried in order: NuGet, CPAN (nix-serve only), PyPI, npm, crates.
    The first match wins; ``(None, None, None)`` means no registry applies.
    """
    library = row["library"]
    version = row.get("version_in_use") or None
    drv = row.get("drv_path", "")

    nuget_named = library.startswith(NUGET_NAME_PREFIXES)
    from_nupkg = ".nupkg" in drv
    if from_nupkg or nuget_named:
        # The derivation rows have clean name/version; the raw .nupkg rows are
        # filtered from review but can still be enriched in summary/deps.
        if from_nupkg and not version:
            stem = clean_library_name(Path(drv).name.removesuffix(".drv"))
            split = re.match(r"(.+)\.(\d+(?:\.\d+)+(?:[-.][0-9A-Za-z]+)*)$", stem)
            if split:
                library, version = split.group(1), split.group(2)
        return "nuget", library, version

    if row["root_name"] in {"nix-serve"} and "perl5." in drv:
        return "cpan", library, version

    if library.startswith("python") or "python" in drv:
        # Drop the interpreter prefix, e.g. "python3.12-" or "python3-".
        return "pypi", re.sub(r"^python\d+(?:\.\d+)?-", "", library), version

    if "node_modules" in row.get("dependency_path", "") or row["root_name"] in {"karakeep", "uptime-kuma"}:
        return "npm", library, version

    if "cargo" in library.lower() or "rust" in drv.lower():
        return "crates", library, version

    return None, None, None


def apply_ecosystem_overrides(ecosystem: str, package: str, result: dict[str, Any]) -> dict[str, Any]:
    """Replace a NuGet package's source link with a curated repository.

    An exact match in NUGET_REPO_OVERRIDES takes precedence; otherwise the
    first matching name prefix decides. *result* is updated in place and
    returned. Other ecosystems pass through unchanged.
    """
    if ecosystem != "nuget":
        return result

    source = NUGET_REPO_OVERRIDES.get(package)
    if not source:
        # Ordered from most to least specific; the first matching prefix wins.
        prefix_sources = (
            ("Microsoft.AspNetCore.", "https://github.com/dotnet/aspnetcore"),
            ("Microsoft.EntityFrameworkCore", "https://github.com/dotnet/efcore"),
            ("Microsoft.Build", "https://github.com/dotnet/msbuild"),
            ("Microsoft.Identity.Client", "https://github.com/AzureAD/microsoft-authentication-library-for-dotnet"),
            (("Microsoft.", "System.", "runtime."), "https://github.com/dotnet/runtime"),
        )
        source = next(
            (url for prefix, url in prefix_sources if package.startswith(prefix)),
            None,
        )

    if source:
        result["source_link"] = source
        result["github_repo"] = github_repo(source)
        result.setdefault("language", "C#")
    return result


def release_date_from_pypi(files: list[dict[str, Any]]) -> str | None:
    """Return the earliest ``upload_time_iso_8601`` among *files*, or ``None``.

    Entries with a missing or empty timestamp are ignored; timestamps are
    compared as strings.
    """
    return min(
        (stamp for entry in files if (stamp := entry.get("upload_time_iso_8601"))),
        default=None,
    )


def enrich_ecosystem(ecosystem: str, package: str, version: str | None, cache: dict[str, Any]) -> dict[str, Any]:
    """Look up registry metadata for one package and memoize it in *cache*.

    Args:
        ecosystem: One of "nuget", "npm", "pypi", "crates" or "cpan"; any other
            value yields a result carrying only the ecosystem and a
            ``github_repo`` of None.
        package: Package name as known to the registry.
        version: Version in use, if known; used for per-version release dates.
        cache: Mutable mapping keyed by ecosystem_cache_key(); a hit is returned
            as-is and a miss is stored before returning.

    Returns:
        A dict that always has "ecosystem" and "github_repo" and, depending on
        the registry, "source_link", "language", "release_date",
        "latest_version" and "latest_release_date". Failed HTTP lookups leave
        the corresponding fields as None or absent rather than raising.
    """
    key = ecosystem_cache_key(ecosystem, package, version)
    if key in cache:
        return cache[key]
    result: dict[str, Any] = {"ecosystem": ecosystem}
    # Fully percent-encoded package name for registries that take it in the URL path.
    quoted = urllib.parse.quote(package, safe="")

    if ecosystem == "nuget":
        result["language"] = "C#"
        if version:
            # Per-version registration document; its catalogEntry is either an
            # inline object or a URL string that has to be fetched separately.
            data = http_json(f"https://api.nuget.org/v3/registration5-semver1/{package.lower()}/{version.lower()}.json")
            entry = (data or {}).get("catalogEntry", {})
            if isinstance(entry, str):
                entry = http_json(entry) or {}
            repo = entry.get("repository") or {}
            repo_url = repo.get("url") if isinstance(repo, dict) else None
            repo_url = normalize_repo_url(repo_url or entry.get("repositoryUrl") or entry.get("projectUrl"))
            result.update(
                {
                    "source_link": repo_url or entry.get("projectUrl"),
                    "release_date": entry.get("published"),
                }
            )
        # The last entry of the flat-container version index is taken as the latest version.
        index = http_json(f"https://api.nuget.org/v3-flatcontainer/{package.lower()}/index.json")
        versions = (index or {}).get("versions") or []
        if versions:
            result["latest_version"] = versions[-1]

    elif ecosystem == "npm":
        data = http_json(f"https://registry.npmjs.org/{quoted}") or {}
        info = data.get("versions", {}).get(version or "", {}) if version else {}
        # Prefer the repository recorded for the version in use over the package-level one.
        repo = info.get("repository") or data.get("repository") or {}
        repo_url = repo.get("url") if isinstance(repo, dict) else repo
        latest = (data.get("dist-tags") or {}).get("latest")
        result.update(
            {
                "source_link": normalize_repo_url(repo_url) or data.get("homepage"),
                "latest_version": latest,
                "release_date": (data.get("time") or {}).get(version or ""),
                "latest_release_date": (data.get("time") or {}).get(latest or ""),
                "language": "JavaScript/TypeScript",
            }
        )

    elif ecosystem == "pypi":
        data = http_json(f"https://pypi.org/pypi/{quoted}/json") or {}
        info = data.get("info", {})
        urls = info.get("project_urls") or {}
        # Fall back from explicit source links to the homepage, then the PyPI page itself.
        source = urls.get("Source") or urls.get("Source Code") or urls.get("Homepage") or info.get("home_page") or info.get("package_url")
        latest = info.get("version")
        result.update(
            {
                "source_link": normalize_repo_url(source),
                "latest_version": latest,
                "release_date": release_date_from_pypi((data.get("releases") or {}).get(version or "", [])),
                "latest_release_date": release_date_from_pypi((data.get("releases") or {}).get(latest or "", [])),
                "language": "Python",
            }
        )

    elif ecosystem == "crates":
        data = http_json(f"https://crates.io/api/v1/crates/{quoted}") or {}
        crate = data.get("crate", {})
        result.update(
            {
                "source_link": normalize_repo_url(crate.get("repository") or crate.get("homepage")),
                "latest_version": crate.get("max_stable_version") or crate.get("newest_version"),
                "latest_release_date": crate.get("updated_at"),
                "language": "Rust",
            }
        )

    elif ecosystem == "cpan":
        # The release endpoint is queried by distribution name (Foo-Bar), not module name (Foo::Bar).
        dist = package.replace("::", "-")
        # Default source link, kept when the release metadata names no repository.
        result.update(
            {
                "source_link": f"https://metacpan.org/pod/{package}",
                "language": "Perl",
            }
        )
        data = http_json(f"https://fastapi.metacpan.org/v1/release/{urllib.parse.quote(dist, safe='')}") or {}
        resources = ((data.get("metadata") or {}).get("resources") or {})
        repo = resources.get("repository") or {}
        repo_url = repo.get("url") if isinstance(repo, dict) else repo
        result.update(
            {
                "source_link": normalize_repo_url(repo_url) or result["source_link"],
                "latest_version": data.get("version"),
                "latest_release_date": data.get("date"),
            }
        )
        # The release date of the version in use is only known when it is also the version returned.
        if str(data.get("version")) == str(version):
            result["release_date"] = data.get("date")

    result["github_repo"] = github_repo(result.get("source_link"))
    # Curated overrides run last so they replace whatever the registry reported.
    result = apply_ecosystem_overrides(ecosystem, package, result)
    cache[key] = result
    return result


def ecosystem_cache_key(ecosystem: str | None, package: str | None, version: str | None) -> str:
    """Build the ``ecosystem:package:version`` key used by the ecosystem cache.

    A missing or empty version contributes an empty final segment.
    """
    version_part = version if version else ""
    return f"{ecosystem}:{package}:{version_part}"


def enrich_github(repo: str, cache: dict[str, Any], sleep: float) -> dict[str, Any]:
    """Fetch repository and latest-release metadata for *repo*, memoized in *cache*.

    Args:
        repo: ``owner/name`` slug.
        cache: Mutable mapping keyed by slug; hits are returned unchanged.
        sleep: Seconds to pause after each API request; falsy disables pausing.

    Returns:
        A dict with the slug, star count, language, canonical URL and the tag
        and publish date of the latest release. Fields are ``None`` when the
        corresponding request failed or returned no such value.
    """
    try:
        return cache[repo]
    except KeyError:
        pass

    def fetch(endpoint: str) -> dict[str, Any]:
        payload = github_json(endpoint) or {}
        if sleep:
            time.sleep(sleep)
        return payload

    repo_info = fetch(f"repos/{repo}")
    release = fetch(f"repos/{repo}/releases/latest")

    summary = {
        "github_repo": repo,
        "github_stars": repo_info.get("stargazers_count"),
        "language": repo_info.get("language"),
        "source_link": repo_info.get("html_url"),
        "latest_version": release.get("tag_name"),
        "latest_release_date": release.get("published_at"),
    }
    cache[repo] = summary
    return summary


def walk_deps(root: dict[str, Any], all_drvs: dict[str, Any], max_depth: int) -> list[dict[str, Any]]:
    """Walk a root's derivation inputs breadth-first, one row per new input.

    Args:
        root: Root record as produced by eval_roots(); roots without a ``drv``
            yield no rows.
        all_drvs: Derivation map keyed by .drv basename.
        max_depth: Number of dependency levels to descend below the root.

    Returns:
        Rows in discovery order. Each derivation appears once, at the first
        depth it is reached, with registry fields initialised to ``None``.
    """
    root_drv = root.get("drv")
    if not root_drv:
        return []

    origin = Path(root_drv).name
    visited = {origin}
    pending = deque([(origin, [], 0)])
    found: list[dict[str, Any]] = []

    while pending:
        current, trail, level = pending.popleft()
        if level >= max_depth:
            continue
        node = all_drvs.get(current, {})
        # Inputs live under "inputDrvs" or under "inputs" -> "drvs", depending on the layout.
        inputs = node.get("inputDrvs") or (node.get("inputs") or {}).get("drvs") or {}
        for input_path in sorted(inputs.keys()):
            input_key = Path(input_path).name
            if input_key in visited:
                continue
            visited.add(input_key)

            meta = drv_meta(input_key, all_drvs)
            fallback = static_upstream(meta["name"]) or {}
            source_link = meta["source_link"] or fallback.get("source_link")
            language = meta["language"] or fallback.get("language")
            next_trail = [*trail, meta["name"]]

            found.append(
                {
                    "host": root["host"],
                    "root_kind": root["kind"],
                    "root_name": root["rootName"],
                    "root_package": root["packageName"],
                    "library": meta["name"],
                    "version_in_use": meta["version"],
                    "dep_depth": level + 1,
                    "dependency_path": " -> ".join([root["rootName"], *next_trail]),
                    "drv_path": input_key,
                    "homepage": meta["homepage"],
                    "source_link": source_link,
                    "language": language,
                    "github_repo": github_repo(meta["homepage"], source_link),
                    "github_stars": None,
                    "ecosystem": None,
                    "release_date": None,
                    "latest_version": None,
                    "latest_release_date": None,
                }
            )
            pending.append((input_key, next_trail, level + 1))
    return found


def write_csv(path: Path, rows: list[dict[str, Any]], fields: list[str]) -> None:
    """Write *rows* to *path* as CSV with a header row.

    Args:
        path: Destination file; overwritten if it exists.
        rows: Row dicts; keys not listed in *fields* are ignored.
        fields: Column names, in output order.

    The file is always written as UTF-8 so the output does not depend on the
    locale of the environment the script happens to run in.
    """
    with path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)


def ecosystem_priority(ecosystem: str | None) -> int:
    """Return the sort rank of *ecosystem*; lower ranks sort first.

    Known ecosystems rank 0-4 in the order cpan, npm, pypi, crates, nuget;
    anything else, including ``None``, ranks 9.
    """
    ranks = {
        label: rank
        for rank, label in enumerate(("cpan", "npm", "pypi", "crates", "nuget"))
    }
    return ranks.get(ecosystem or "", 9)


# Metadata fields that an ecosystem lookup overrides on a row even when the row
# already carries a value for them; every other field is only filled if empty.
_ECOSYSTEM_OVERRIDE_FIELDS = frozenset({"ecosystem", "release_date"})


def _limit_items(items: list[Any], limit: int) -> list[Any]:
    """Return the first *limit* items, or all of them when *limit* is negative."""
    return items if limit < 0 else items[:limit]


def _merge_ecosystem_meta(row: dict[str, Any], meta: dict[str, Any]) -> None:
    """Copy non-None ecosystem metadata into *row*, keeping existing truthy values."""
    row.update(
        {
            k: v
            for k, v in meta.items()
            if v is not None and (not row.get(k) or k in _ECOSYSTEM_OVERRIDE_FIELDS)
        }
    )


def _stars_sort_key(row: dict[str, Any]) -> tuple[bool, int, str]:
    """Sort rows without a star count first, then by ascending stars, then by name."""
    stars = row.get("github_stars")
    return (stars is not None, stars or 0, row["library"])


def main() -> int:
    """Collect dependency rows for the network-facing roots and write the reports.

    Steps, in order:

    1. Evaluate the roots and walk the derivation graph of each selected root.
    2. Enrich dependency rows with ecosystem metadata, using and updating
       ``ecosystem-cache.json``.
    3. Enrich dependency rows and roots with GitHub metadata, using and
       updating ``github-cache.json``.
    4. Write ``network-package-roots.csv``, ``network-library-deps.csv``,
       ``network-library-summary.csv`` and ``network-library-review.csv``.

    All files are written under ``OUT``. A negative value for any ``--*-limit``
    option or for ``--max-roots`` means "no limit".

    Returns:
        ``0``, used as the process exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--max-roots", type=int, default=18,
        help="maximum number of roots to walk; negative means no limit",
    )
    parser.add_argument(
        "--max-depth", type=int, default=2,
        help="depth limit passed to the dependency walk",
    )
    parser.add_argument(
        "--github-limit", type=int, default=80,
        help="maximum number of distinct GitHub repos to look up for dependency rows; "
        "negative means no limit",
    )
    parser.add_argument(
        "--github-sleep", type=float, default=0.1,
        help="sleep value passed to the GitHub enrichment helper",
    )
    parser.add_argument(
        "--ecosystem-limit", type=int, default=400,
        help="maximum number of ecosystem packages to look up; negative means no limit",
    )
    args = parser.parse_args()

    OUT.mkdir(exist_ok=True)
    roots = eval_roots()
    # Only roots that evaluated to a derivation can be walked.
    selected = _limit_items([r for r in roots if r.get("drv")], args.max_roots)

    dep_rows: list[dict[str, Any]] = []
    for root in selected:
        print(f"walking {root['host']}:{root['rootName']} {root['packageName']}", file=sys.stderr)
        try:
            all_drvs = derivation_show_recursive(root["drv"])
        except RuntimeError as exc:
            # Best effort: report the failure and carry on with the other roots.
            print(exc, file=sys.stderr)
            continue
        dep_rows.extend(walk_deps(root, all_drvs, args.max_depth))

    # --- Ecosystem enrichment -------------------------------------------------
    ecosystem_cache_path = OUT / "ecosystem-cache.json"
    ecosystem_cache = json.loads(ecosystem_cache_path.read_text()) if ecosystem_cache_path.exists() else {}
    ecosystem_key_scores: dict[tuple[str | None, str | None, str | None], tuple[int, int, str, str]] = {}
    # Insertion-ordered, so it doubles as the first-seen list of distinct keys.
    ecosystem_rows: dict[tuple[str | None, str | None, str | None], list[dict[str, Any]]] = {}
    for row in dep_rows:
        ecosystem, package, version = parse_ecosystem(row)
        if not (ecosystem and package):
            continue
        key = (ecosystem, package, version)
        ecosystem_rows.setdefault(key, []).append(row)
        review_score = 1 if noisy_for_review(row) else 0
        score = (review_score, ecosystem_priority(ecosystem), package.lower(), version or "")
        # Keep the best (lowest) score seen for the key across all of its rows.
        if key not in ecosystem_key_scores or score < ecosystem_key_scores[key]:
            ecosystem_key_scores[key] = score
    # Stable sort: keys with equal scores stay in first-seen order.
    ecosystem_keys = sorted(ecosystem_rows, key=ecosystem_key_scores.__getitem__)
    selected_ecosystem_keys = _limit_items(ecosystem_keys, args.ecosystem_limit)
    for idx, (ecosystem, package, version) in enumerate(selected_ecosystem_keys, start=1):
        if idx % 25 == 1:
            print(f"enriching ecosystem metadata {idx}/{len(selected_ecosystem_keys)}", file=sys.stderr)
        meta = enrich_ecosystem(ecosystem, package, version, ecosystem_cache)
        for row in ecosystem_rows.get((ecosystem, package, version), []):
            _merge_ecosystem_meta(row, meta)
        # Checkpoint the cache periodically so an interrupted run keeps its progress.
        if idx % 25 == 0:
            write_json_atomic(ecosystem_cache_path, ecosystem_cache)
    # Keys beyond the limit still get metadata if the cache already holds it.
    selected_ecosystem_key_set = set(selected_ecosystem_keys)
    cached_only_keys = [
        key
        for key in ecosystem_keys
        if ecosystem_cache_key(*key) in ecosystem_cache and key not in selected_ecosystem_key_set
    ]
    for ecosystem, package, version in cached_only_keys:
        meta = ecosystem_cache[ecosystem_cache_key(ecosystem, package, version)]
        for row in ecosystem_rows.get((ecosystem, package, version), []):
            _merge_ecosystem_meta(row, meta)
    write_json_atomic(ecosystem_cache_path, ecosystem_cache)

    # --- GitHub enrichment ----------------------------------------------------
    cache_path = OUT / "github-cache.json"
    cache = json.loads(cache_path.read_text()) if cache_path.exists() else {}
    # Distinct repos in first-seen order.
    repos = list(dict.fromkeys(repo for repo in (row.get("github_repo") for row in dep_rows) if repo))
    selected_repos = _limit_items(repos, args.github_limit)
    for idx, repo in enumerate(selected_repos, start=1):
        if idx % 25 == 1:
            print(f"enriching GitHub metadata {idx}/{len(selected_repos)}", file=sys.stderr)
        gh = enrich_github(repo, cache, args.github_sleep)
        updates = {k: v for k, v in gh.items() if v is not None}
        for row in dep_rows:
            if row.get("github_repo") == repo:
                row.update(updates)
        if idx % 25 == 0:
            write_json_atomic(cache_path, cache)

    # Roots are enriched for every root, independent of --github-limit.
    for root in roots:
        repo = github_repo(root.get("homepage"), " ".join(root.get("sourceUrls") or []))
        root["github_repo"] = repo
        root["github_stars"] = None
        root["ecosystem"] = "nix"
        root["release_date"] = None
        root["latest_version"] = None
        root["latest_release_date"] = None
        root["language"] = None
        if repo:
            gh = enrich_github(repo, cache, args.github_sleep)
            root.update({k: v for k, v in gh.items() if v is not None})
    write_json_atomic(cache_path, cache)

    # --- Reports --------------------------------------------------------------
    root_fields = [
        "priority",
        "host",
        "kind",
        "rootName",
        "packageName",
        "pname",
        "version",
        "drv",
        "storePath",
        "homepage",
        "description",
        "sourceUrls",
        "image",
        "github_repo",
        "github_stars",
        "ecosystem",
        "release_date",
        "latest_version",
        "latest_release_date",
        "language",
    ]
    dep_fields = [
        "host",
        "root_kind",
        "root_name",
        "root_package",
        "library",
        "version_in_use",
        "dep_depth",
        "dependency_path",
        "drv_path",
        "homepage",
        "source_link",
        "github_repo",
        "github_stars",
        "ecosystem",
        "release_date",
        "latest_version",
        "latest_release_date",
        "language",
    ]
    write_csv(OUT / "network-package-roots.csv", roots, root_fields)
    write_csv(OUT / "network-library-deps.csv", dep_rows, dep_fields)

    # One row per derivation (keyed by drv_path), preserving the first root/path
    # encountered. This is convenient for hand-reviewing uncommon deps before
    # opening the full edge CSV.
    summary: dict[str, dict[str, Any]] = {}
    for row in dep_rows:
        if row["drv_path"] not in summary:
            summary[row["drv_path"]] = row.copy()
    write_csv(
        OUT / "network-library-summary.csv",
        sorted(summary.values(), key=_stars_sort_key),
        dep_fields,
    )
    review_rows = [r for r in summary.values() if not noisy_for_review(r)]
    write_csv(
        OUT / "network-library-review.csv",
        sorted(review_rows, key=_stars_sort_key),
        dep_fields,
    )
    print(f"wrote {len(roots)} roots and {len(dep_rows)} dependency rows", file=sys.stderr)
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())