From 12c35ba94208891e67b0d5f0f27603d2f25a1e20 Mon Sep 17 00:00:00 2001 From: Daniel Bulant Date: Sat, 30 May 2026 17:50:12 +0200 Subject: [PATCH] initial analysis script --- analysis/.gitignore | 3 + analysis/collect_network_libraries.py | 661 ++++++++++++++++++++++++++ 2 files changed, 664 insertions(+) create mode 100644 analysis/.gitignore create mode 100644 analysis/collect_network_libraries.py diff --git a/analysis/.gitignore b/analysis/.gitignore new file mode 100644 index 0000000..cae2cef --- /dev/null +++ b/analysis/.gitignore @@ -0,0 +1,3 @@ +*.json +*.csv +__pycache__ diff --git a/analysis/collect_network_libraries.py b/analysis/collect_network_libraries.py new file mode 100644 index 0000000..b3da0f8 --- /dev/null +++ b/analysis/collect_network_libraries.py @@ -0,0 +1,661 @@ +#!/usr/bin/env python3 +"""Collect network-facing Nix package/library dependency metadata for fern/eisen. + +The script intentionally starts from explicit service-facing roots instead of the +full NixOS closure. The full closure includes desktop/session packages and base +system plumbing that are not meaningfully "reachable through network". +""" + +from __future__ import annotations + +import argparse +import csv +import json +import os +import re +import subprocess +import sys +import time +import urllib.error +import urllib.parse +import urllib.request +from collections import deque +from pathlib import Path +from typing import Any + + +REPO = Path(__file__).resolve().parents[1] +OUT = REPO / "analysis" +HTTP_TIMEOUT = 8 + + +# Higher numbers are processed first. These are the Internet/LAN/Tailscale-facing +# services and containers configured by servers/fern and servers/eisen. +ROOTS = [ + (100, "fern", "service", "caddy", "config.services.caddy.package"), + (98, "fern", "service", "openssh", "config.programs.ssh.package"), + (97, "fern", "service", "llama-swap", "config.services.llama-swap.package"), + (96, "fern", "service", "llama-cpp-server", "pkgs.llama-cpp"), + (94, "fern", "service", "nix-serve", "config.services.nix-serve.package"), + (92, "fern", "service", "steam-network-runtime", "config.programs.steam.package"), + (90, "fern", "service", "kdeconnect", "pkgs.kdePackages.kdeconnect-kde"), + (88, "fern", "service", "openrgb", "config.services.hardware.openrgb.package"), + (86, "fern", "service", "docker", "config.virtualisation.docker.package"), + (100, "eisen", "service", "caddy", "config.services.caddy.package"), + (99, "eisen", "service", "tailscale", "config.services.tailscale.package"), + (98, "eisen", "service", "openssh", "config.programs.ssh.package"), + (97, "eisen", "service", "jellyfin", "config.services.jellyfin.package"), + (96, "eisen", "service", "sonarr", "config.services.sonarr.package"), + (95, "eisen", "service", "radarr", "config.services.radarr.package"), + (94, "eisen", "service", "prowlarr", "config.services.prowlarr.package"), + (93, "eisen", "service", "karakeep", "config.services.karakeep.package"), + (92, "eisen", "service", "uptime-kuma", "config.services.uptime-kuma.package"), + (91, "eisen", "service", "grafana", "config.services.grafana.package"), + (90, "eisen", "service", "prometheus", "config.services.prometheus.package"), + (89, "eisen", "service", "prometheus-node-exporter", "pkgs.prometheus-node-exporter"), + (88, "eisen", "service", "exportarr-sonarr", "pkgs.exportarr"), + (87, "eisen", "service", "exportarr-radarr", "pkgs.exportarr"), + (86, "eisen", "service", "exportarr-prowlarr", "pkgs.exportarr"), + (85, "eisen", "service", "glance", "config.services.glance.package"), + (84, "eisen", "service", "dnsmasq", "config.services.dnsmasq.package"), + (83, "eisen", "service", "docker", "config.virtualisation.docker.package"), + (82, "eisen", "service", "llama-swap-exporter", "pkgs.callPackage ./servers/eisen/llama-swap-exporter/default.nix { }"), +] + +CONTAINER_ROOTS = [ + (80, "eisen", "container", "gluetun", "qmcgaw/gluetun"), + (79, "eisen", "container", "qbittorrent", "lscr.io/linuxserver/qbittorrent"), + (78, "eisen", "container", "jackett", "lscr.io/linuxserver/jackett"), + (77, "eisen", "container", "prometheus-qb", "ghcr.io/esanchezm/prometheus-qbittorrent-exporter"), + (76, "eisen", "container", "tolgee", "tolgee/tolgee"), +] + +GITHUB_RE = re.compile(r"github\.com[:/](?P[^/]+)/(?P[^/#?]+?)(?:\.git|/|#|\?|$)") +STORE_HASH_PREFIX_RE = re.compile(r"^[0-9a-z]{32}-(?P.+)$") + + +def run(cmd: list[str], *, timeout: int = 120) -> str: + proc = subprocess.run( + cmd, + cwd=REPO, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout, + ) + if proc.returncode != 0: + raise RuntimeError(f"command failed: {' '.join(cmd)}\n{proc.stderr}") + return proc.stdout + + +def write_json_atomic(path: Path, data: dict[str, Any]) -> None: + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(json.dumps(data, indent=2, sort_keys=True) + "\n") + tmp.replace(path) + + +def nix_string(s: str) -> str: + return json.dumps(s) + + +def root_expr() -> str: + rows = [] + for priority, host, kind, name, expr in ROOTS: + cfg = "flake.nixosConfigurations.fern" if host == "fern" else "flake.colmenaHive.nodes.eisen" + rows.append( + "(let node = " + + cfg + + "; config = node.config; pkgs = node.pkgs; pkg = " + + expr + + "; in mkRoot " + + str(priority) + + " " + + nix_string(host) + + " " + + nix_string(kind) + + " " + + nix_string(name) + + " pkg)" + ) + + return """ +let + flake = builtins.getFlake (toString ./.); + clean = s: builtins.unsafeDiscardStringContext (toString s); + listOrNull = x: if builtins.isList x then map clean x else if x == null then [] else [ (clean x) ]; + mkRoot = priority: host: kind: rootName: pkg: { + inherit priority host kind rootName; + packageName = pkg.name or rootName; + pname = pkg.pname or null; + version = pkg.version or null; + storePath = clean pkg; + drv = if pkg ? drvPath then clean pkg.drvPath else null; + homepage = pkg.meta.homepage or null; + description = pkg.meta.description or null; + sourceUrls = listOrNull (pkg.src.urls or (pkg.src.url or null)); + }; +in [ +""" + "\n".join(rows) + "\n]" + + +def eval_roots() -> list[dict[str, Any]]: + data = run(["nix", "eval", "--impure", "--json", "--expr", root_expr()], timeout=240) + roots = json.loads(data) + for priority, host, kind, name, image in CONTAINER_ROOTS: + roots.append( + { + "priority": priority, + "host": host, + "kind": kind, + "rootName": name, + "packageName": image, + "pname": name, + "version": None, + "storePath": None, + "drv": None, + "homepage": None, + "description": "OCI image configured in virtualisation.oci-containers", + "sourceUrls": [], + "image": image, + } + ) + return sorted(roots, key=lambda r: (-int(r["priority"]), r["host"], r["rootName"])) + + +def derivation_show_recursive(drv: str) -> dict[str, Any]: + data = run(["nix", "derivation", "show", "-r", drv], timeout=300) + parsed = json.loads(data) + # Nix 2.30+ returns {"version": 3, "derivations": {"basename.drv": ...}}; + # older Nix returned {"/nix/store/...drv": ...}. Normalize to basename keys. + derivations = parsed.get("derivations") if isinstance(parsed, dict) else None + if isinstance(derivations, dict): + return derivations + return {Path(k).name: v for k, v in parsed.items()} + + +def drv_meta(drv: str, all_drvs: dict[str, Any]) -> dict[str, Any]: + item = all_drvs.get(Path(drv).name, all_drvs.get(drv, {})) + env = item.get("env", {}) + name = clean_library_name(env.get("pname") or env.get("name") or Path(drv).name.removesuffix(".drv")) + return { + "name": name, + "version": env.get("version"), + "homepage": env.get("homepage") or env.get("meta.homepage"), + "description": env.get("meta.description") or env.get("description"), + "source_link": source_from_env(env), + "language": infer_language(name, env), + } + + +def clean_library_name(name: str) -> str: + match = STORE_HASH_PREFIX_RE.match(name) + if match: + name = match.group("name") + for suffix in (".nupkg", ".tar.gz", ".tar.xz", ".zip", ".drv"): + if name.endswith(suffix): + name = name[: -len(suffix)] + return name + + +def source_from_env(env: dict[str, str]) -> str | None: + for key in ("src", "urls", "url", "cargoDeps", "npmDeps", "goModules"): + val = env.get(key) + if val and ("http" in val or "github" in val): + return val + for key, val in env.items(): + if key.lower().endswith("url") and val and ("http" in val or "github" in val): + return val + return None + + +def infer_language(name: str, env: dict[str, str]) -> str | None: + text = " ".join([name, env.get("nativeBuildInputs", ""), env.get("buildInputs", "")]).lower() + if "python" in text or name.startswith("python"): + return "Python" + if "cargo" in text or "rustc" in text: + return "Rust" + if "go" in text and ("gomod" in text or "goModules" in env): + return "Go" + if "node" in text or "npm" in text or "pnpm" in text or "yarn" in text: + return "JavaScript/TypeScript" + if "cmake" in text or "gcc" in text or "clang" in text: + return "C/C++" + if name.startswith(("qt", "k", "lib")): + return "C/C++" + return None + + +def github_repo(*values: str | None) -> str | None: + for value in values: + if not value: + continue + match = GITHUB_RE.search(value) + if match: + return f"{match.group('owner')}/{match.group('repo')}" + return None + + +def noisy_for_review(row: dict[str, Any]) -> bool: + name = row["library"].lower() + drv_path = row.get("drv_path", "").lower() + if ".nupkg" in drv_path and not row.get("version_in_use"): + return True + noisy_exact = { + "bash", + "coreutils", + "coreutils-full", + "stdenv-linux", + "install-shell-files", + "version-check-hook", + "writable-tmpdir-as-home-hook", + "auto-patchelf-hook", + "pkg-config-wrapper", + "gcc-wrapper", + "gnumake", + "cmake", + "ninja", + "patchelf", + "remove-references-to", + "strip-nondeterminism", + } + if name in noisy_exact: + return True + noisy_bits = ( + "-source", + "source-", + "-go-modules", + "builder.sh", + "setup-hook", + "-hook", + ".patch", + ".diff", + "testdata", + "fixture", + ) + return any(bit in name for bit in noisy_bits) + + +def github_json(path: str) -> dict[str, Any] | None: + req = urllib.request.Request( + f"https://api.github.com/{path}", + headers={"Accept": "application/vnd.github+json", "User-Agent": "dotfiles-analysis"}, + ) + try: + with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as res: + return json.loads(res.read().decode()) + except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError): + return None + + +def http_json(url: str) -> dict[str, Any] | None: + req = urllib.request.Request( + url, + headers={"Accept": "application/json", "User-Agent": "dotfiles-analysis"}, + ) + try: + with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as res: + return json.loads(res.read().decode()) + except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, json.JSONDecodeError): + return None + + +def normalize_repo_url(value: str | None) -> str | None: + if not value: + return None + value = value.strip() + if value.startswith("git+"): + value = value[4:] + if value.startswith("git://github.com/"): + value = "https://github.com/" + value.removeprefix("git://github.com/") + if value.startswith("git@github.com:"): + value = "https://github.com/" + value.removeprefix("git@github.com:") + if value.endswith(".git"): + value = value[:-4] + return value + + +def parse_ecosystem(row: dict[str, Any]) -> tuple[str | None, str | None, str | None]: + name = row["library"] + version = row.get("version_in_use") or None + drv = row.get("drv_path", "") + if ".nupkg" in drv or row["root_name"].lower() in ("jellyfin", "sonarr", "radarr", "prowlarr"): + # The derivation rows have clean name/version; the raw .nupkg rows are + # filtered from review but can still be enriched in summary/deps. + if not version and ".nupkg" in drv: + base = clean_library_name(Path(drv).name.removesuffix(".drv")) + m = re.match(r"(.+)\.(\d+(?:\.\d+)+(?:[-.][0-9A-Za-z]+)*)$", base) + if m: + name, version = m.group(1), m.group(2) + return "nuget", name, version + if row["root_name"] in {"nix-serve"} and "perl5." in drv: + return "cpan", name, version + if name.startswith("python") or "python" in drv: + py_name = re.sub(r"^python\d+(?:\.\d+)?-", "", name) + return "pypi", py_name, version + if "node_modules" in row.get("dependency_path", "") or row["root_name"] in {"karakeep", "uptime-kuma"}: + return "npm", name, version + if "cargo" in name.lower() or "rust" in drv.lower(): + return "crates", name, version + return None, None, None + + +def release_date_from_pypi(files: list[dict[str, Any]]) -> str | None: + dates = [f.get("upload_time_iso_8601") for f in files if f.get("upload_time_iso_8601")] + return min(dates) if dates else None + + +def enrich_ecosystem(ecosystem: str, package: str, version: str | None, cache: dict[str, Any]) -> dict[str, Any]: + key = f"{ecosystem}:{package}:{version or ''}" + if key in cache: + return cache[key] + result: dict[str, Any] = {"ecosystem": ecosystem} + quoted = urllib.parse.quote(package, safe="") + + if ecosystem == "nuget": + result["language"] = "C#" + if version: + data = http_json(f"https://api.nuget.org/v3/registration5-semver1/{package.lower()}/{version.lower()}.json") + entry = (data or {}).get("catalogEntry", {}) + if isinstance(entry, str): + entry = http_json(entry) or {} + repo = entry.get("repository") or {} + repo_url = repo.get("url") if isinstance(repo, dict) else None + repo_url = normalize_repo_url(repo_url or entry.get("repositoryUrl") or entry.get("projectUrl")) + result.update( + { + "source_link": repo_url or entry.get("projectUrl"), + "release_date": entry.get("published"), + } + ) + index = http_json(f"https://api.nuget.org/v3-flatcontainer/{package.lower()}/index.json") + versions = (index or {}).get("versions") or [] + if versions: + result["latest_version"] = versions[-1] + + elif ecosystem == "npm": + data = http_json(f"https://registry.npmjs.org/{quoted}") or {} + info = data.get("versions", {}).get(version or "", {}) if version else {} + repo = info.get("repository") or data.get("repository") or {} + repo_url = repo.get("url") if isinstance(repo, dict) else repo + latest = (data.get("dist-tags") or {}).get("latest") + result.update( + { + "source_link": normalize_repo_url(repo_url) or data.get("homepage"), + "latest_version": latest, + "release_date": (data.get("time") or {}).get(version or ""), + "latest_release_date": (data.get("time") or {}).get(latest or ""), + "language": "JavaScript/TypeScript", + } + ) + + elif ecosystem == "pypi": + data = http_json(f"https://pypi.org/pypi/{quoted}/json") or {} + info = data.get("info", {}) + urls = info.get("project_urls") or {} + source = urls.get("Source") or urls.get("Source Code") or urls.get("Homepage") or info.get("home_page") or info.get("package_url") + latest = info.get("version") + result.update( + { + "source_link": normalize_repo_url(source), + "latest_version": latest, + "release_date": release_date_from_pypi((data.get("releases") or {}).get(version or "", [])), + "latest_release_date": release_date_from_pypi((data.get("releases") or {}).get(latest or "", [])), + "language": "Python", + } + ) + + elif ecosystem == "crates": + data = http_json(f"https://crates.io/api/v1/crates/{quoted}") or {} + crate = data.get("crate", {}) + result.update( + { + "source_link": normalize_repo_url(crate.get("repository") or crate.get("homepage")), + "latest_version": crate.get("max_stable_version") or crate.get("newest_version"), + "latest_release_date": crate.get("updated_at"), + "language": "Rust", + } + ) + + elif ecosystem == "cpan": + dist = package.replace("::", "-") + result.update( + { + "source_link": f"https://metacpan.org/pod/{package}", + "language": "Perl", + } + ) + data = http_json(f"https://fastapi.metacpan.org/v1/release/{urllib.parse.quote(dist, safe='')}") or {} + resources = ((data.get("metadata") or {}).get("resources") or {}) + repo = resources.get("repository") or {} + repo_url = repo.get("url") if isinstance(repo, dict) else repo + result.update( + { + "source_link": normalize_repo_url(repo_url) or result["source_link"], + "latest_version": data.get("version"), + "latest_release_date": data.get("date"), + } + ) + + result["github_repo"] = github_repo(result.get("source_link")) + cache[key] = result + return result + + +def enrich_github(repo: str, cache: dict[str, Any], sleep: float) -> dict[str, Any]: + if repo in cache: + return cache[repo] + data = github_json(f"repos/{repo}") or {} + if sleep: + time.sleep(sleep) + latest = github_json(f"repos/{repo}/releases/latest") or {} + if sleep: + time.sleep(sleep) + result = { + "github_repo": repo, + "github_stars": data.get("stargazers_count"), + "language": data.get("language"), + "source_link": data.get("html_url"), + "latest_version": latest.get("tag_name"), + "latest_release_date": latest.get("published_at"), + } + cache[repo] = result + return result + + +def walk_deps(root: dict[str, Any], all_drvs: dict[str, Any], max_depth: int) -> list[dict[str, Any]]: + start = root.get("drv") + if not start: + return [] + start_key = Path(start).name + rows = [] + seen = {start_key} + queue = deque([(start_key, [], 0)]) + while queue: + drv, path, depth = queue.popleft() + if depth >= max_depth: + continue + item = all_drvs.get(drv, {}) + input_drvs = item.get("inputDrvs") or (item.get("inputs") or {}).get("drvs") or {} + for dep_drv in sorted(input_drvs.keys()): + dep_key = Path(dep_drv).name + if dep_key in seen: + continue + seen.add(dep_key) + meta = drv_meta(dep_key, all_drvs) + dep_path = path + [meta["name"]] + rows.append( + { + "host": root["host"], + "root_kind": root["kind"], + "root_name": root["rootName"], + "root_package": root["packageName"], + "library": meta["name"], + "version_in_use": meta["version"], + "dep_depth": depth + 1, + "dependency_path": " -> ".join([root["rootName"]] + dep_path), + "drv_path": dep_key, + "homepage": meta["homepage"], + "source_link": meta["source_link"], + "language": meta["language"], + "github_repo": github_repo(meta["homepage"], meta["source_link"]), + "github_stars": None, + "ecosystem": None, + "release_date": None, + "latest_version": None, + "latest_release_date": None, + } + ) + queue.append((dep_key, dep_path, depth + 1)) + return rows + + +def write_csv(path: Path, rows: list[dict[str, Any]], fields: list[str]) -> None: + with path.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore") + writer.writeheader() + writer.writerows(rows) + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--max-roots", type=int, default=18) + parser.add_argument("--max-depth", type=int, default=2) + parser.add_argument("--github-limit", type=int, default=80) + parser.add_argument("--github-sleep", type=float, default=0.1) + parser.add_argument("--ecosystem-limit", type=int, default=400) + args = parser.parse_args() + + OUT.mkdir(exist_ok=True) + roots = eval_roots() + selected = [r for r in roots if r.get("drv")][: args.max_roots] + + dep_rows: list[dict[str, Any]] = [] + for root in selected: + print(f"walking {root['host']}:{root['rootName']} {root['packageName']}", file=sys.stderr) + try: + all_drvs = derivation_show_recursive(root["drv"]) + except RuntimeError as exc: + print(exc, file=sys.stderr) + continue + dep_rows.extend(walk_deps(root, all_drvs, args.max_depth)) + + ecosystem_cache_path = OUT / "ecosystem-cache.json" + ecosystem_cache = json.loads(ecosystem_cache_path.read_text()) if ecosystem_cache_path.exists() else {} + ecosystem_keys = [] + ecosystem_rows: dict[tuple[str | None, str | None, str | None], list[dict[str, Any]]] = {} + for row in dep_rows: + ecosystem, package, version = parse_ecosystem(row) + if ecosystem and package: + key = (ecosystem, package, version) + ecosystem_rows.setdefault(key, []).append(row) + if key not in ecosystem_keys: + ecosystem_keys.append(key) + for idx, (ecosystem, package, version) in enumerate(ecosystem_keys[: args.ecosystem_limit], start=1): + if idx % 25 == 1: + print(f"enriching ecosystem metadata {idx}/{min(len(ecosystem_keys), args.ecosystem_limit)}", file=sys.stderr) + meta = enrich_ecosystem(ecosystem, package, version, ecosystem_cache) + for row in ecosystem_rows.get((ecosystem, package, version), []): + row.update({k: v for k, v in meta.items() if v is not None and (not row.get(k) or k in {"ecosystem", "release_date"})}) + if idx % 25 == 0: + write_json_atomic(ecosystem_cache_path, ecosystem_cache) + write_json_atomic(ecosystem_cache_path, ecosystem_cache) + + cache_path = OUT / "github-cache.json" + cache = json.loads(cache_path.read_text()) if cache_path.exists() else {} + repos = [] + for row in dep_rows: + repo = row.get("github_repo") + if repo and repo not in repos: + repos.append(repo) + for idx, repo in enumerate(repos[: args.github_limit], start=1): + if idx % 25 == 1: + print(f"enriching GitHub metadata {idx}/{min(len(repos), args.github_limit)}", file=sys.stderr) + gh = enrich_github(repo, cache, args.github_sleep) + for row in dep_rows: + if row.get("github_repo") == repo: + row.update({k: v for k, v in gh.items() if v is not None}) + if idx % 25 == 0: + write_json_atomic(cache_path, cache) + + for root in roots: + repo = github_repo(root.get("homepage"), " ".join(root.get("sourceUrls") or [])) + root["github_repo"] = repo + root["github_stars"] = None + root["ecosystem"] = "nix" + root["release_date"] = None + root["latest_version"] = None + root["latest_release_date"] = None + root["language"] = None + if repo: + gh = enrich_github(repo, cache, args.github_sleep) + root.update({k: v for k, v in gh.items() if v is not None}) + write_json_atomic(cache_path, cache) + + root_fields = [ + "priority", + "host", + "kind", + "rootName", + "packageName", + "pname", + "version", + "drv", + "storePath", + "homepage", + "description", + "sourceUrls", + "image", + "github_repo", + "github_stars", + "ecosystem", + "release_date", + "latest_version", + "latest_release_date", + "language", + ] + dep_fields = [ + "host", + "root_kind", + "root_name", + "root_package", + "library", + "version_in_use", + "dep_depth", + "dependency_path", + "drv_path", + "homepage", + "source_link", + "github_repo", + "github_stars", + "ecosystem", + "release_date", + "latest_version", + "latest_release_date", + "language", + ] + write_csv(OUT / "network-package-roots.csv", roots, root_fields) + write_csv(OUT / "network-library-deps.csv", dep_rows, dep_fields) + + # One row per library, preserving the first root/path encountered. This is + # convenient for hand-reviewing uncommon deps before opening the full edge CSV. + summary: dict[str, dict[str, Any]] = {} + for row in dep_rows: + key = row["drv_path"] + summary.setdefault(key, row.copy()) + write_csv( + OUT / "network-library-summary.csv", + sorted(summary.values(), key=lambda r: (r.get("github_stars") is not None, r.get("github_stars") or 0, r["library"])), + dep_fields, + ) + review_rows = [r for r in summary.values() if not noisy_for_review(r)] + write_csv( + OUT / "network-library-review.csv", + sorted(review_rows, key=lambda r: (r.get("github_stars") is not None, r.get("github_stars") or 0, r["library"])), + dep_fields, + ) + print(f"wrote {len(roots)} roots and {len(dep_rows)} dependency rows", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())